From 3dd6853bc8c4fb8bbaf507c1699e5cbe8fa356ad Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Wed, 12 Jun 2024 12:58:02 -0400
Subject: [PATCH 001/376] [CI/Build] Add `is_quant_method_supported` to control
 quantization test configurations (#5253)

---
 tests/models/test_aqlm.py               | 13 ++-----------
 tests/models/test_fp8.py                | 12 ++----------
 tests/models/test_gptq_marlin.py        | 13 ++-----------
 tests/models/test_gptq_marlin_24.py     | 13 ++-----------
 tests/models/test_marlin.py             | 13 ++-----------
 tests/quantization/test_bitsandbytes.py | 10 +++-------
 tests/quantization/test_fp8.py          | 10 +++-------
 tests/quantization/utils.py             | 14 ++++++++++++++
 8 files changed, 30 insertions(+), 68 deletions(-)
 create mode 100644 tests/quantization/utils.py

diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py
index c4ecf846e..80034a511 100644
--- a/tests/models/test_aqlm.py
+++ b/tests/models/test_aqlm.py
@@ -4,17 +4,8 @@ Run `pytest tests/models/test_aqlm.py`.
 """
 
 import pytest
-import torch
 
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-
-aqlm_not_supported = True
-
-if torch.cuda.is_available():
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    aqlm_not_supported = (capability <
-                          QUANTIZATION_METHODS["aqlm"].get_min_capability())
+from tests.quantization.utils import is_quant_method_supported
 
 # In this test we hardcode prompts and generations for the model so we don't
 # need to require the AQLM package as a dependency
@@ -67,7 +58,7 @@ ground_truth_generations = [
 ]
 
 
-@pytest.mark.skipif(aqlm_not_supported,
+@pytest.mark.skipif(not is_quant_method_supported("aqlm"),
                     reason="AQLM is not supported on this GPU type.")
 @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
 @pytest.mark.parametrize("dtype", ["half"])
diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py
index 61aee0d0a..b24c17cf3 100644
--- a/tests/models/test_fp8.py
+++ b/tests/models/test_fp8.py
@@ -8,8 +8,8 @@ import pytest
 import torch
 from transformers import AutoTokenizer
 
+from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
 
@@ -67,16 +67,8 @@ EXPECTED_STRS_MAP = {
     },
 }
 
-fp8_not_supported = True
 
-if torch.cuda.is_available():
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    fp8_not_supported = (capability <
-                         QUANTIZATION_METHODS["fp8"].get_min_capability())
-
-
-@pytest.mark.skipif(fp8_not_supported,
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                     reason="fp8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)
 @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py
index e957450cc..e30100d9b 100644
--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
@@ -11,9 +11,8 @@ Run `pytest tests/models/test_gptq_marlin.py`.
 import os
 
 import pytest
-import torch
 
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from tests.quantization.utils import is_quant_method_supported
 from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
 
 from .utils import check_logprobs_close
@@ -22,14 +21,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
 
 MAX_MODEL_LEN = 1024
 
-gptq_marlin_not_supported = True
-
-if torch.cuda.is_available():
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    gptq_marlin_not_supported = (
-        capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
-
 MODELS = [
     # act_order==False, group_size=channelwise
     ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
@@ -53,7 +44,7 @@ MODELS = [
 
 
 @pytest.mark.flaky(reruns=3)
-@pytest.mark.skipif(gptq_marlin_not_supported,
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                     reason="gptq_marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half", "bfloat16"])
diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py
index 195c3e5b5..60d9ae2f1 100644
--- a/tests/models/test_gptq_marlin_24.py
+++ b/tests/models/test_gptq_marlin_24.py
@@ -9,18 +9,9 @@ Run `pytest tests/models/test_marlin_24.py`.
 from dataclasses import dataclass
 
 import pytest
-import torch
 
 from tests.models.utils import check_logprobs_close
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-
-marlin_not_supported = True
-
-if torch.cuda.is_available():
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    marlin_not_supported = (
-        capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
+from tests.quantization.utils import is_quant_method_supported
 
 
 @dataclass
@@ -47,7 +38,7 @@ model_pairs = [
 
 
 @pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(marlin_not_supported,
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"),
                     reason="Marlin24 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_pair", model_pairs)
 @pytest.mark.parametrize("dtype", ["half"])
diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py
index 761ba6aa4..e86f6e29d 100644
--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -13,20 +13,11 @@ Run `pytest tests/models/test_marlin.py`.
 from dataclasses import dataclass
 
 import pytest
-import torch
 
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from tests.quantization.utils import is_quant_method_supported
 
 from .utils import check_logprobs_close
 
-marlin_not_supported = True
-
-if torch.cuda.is_available():
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    marlin_not_supported = (
-        capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
-
 
 @dataclass
 class ModelPair:
@@ -45,7 +36,7 @@ model_pairs = [
 
 
 @pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(marlin_not_supported,
+@pytest.mark.skipif(not is_quant_method_supported("marlin"),
                     reason="Marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("model_pair", model_pairs)
 @pytest.mark.parametrize("dtype", ["half"])
diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py
index 31e938d15..953fd9ba9 100644
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -5,16 +5,12 @@ Run `pytest tests/quantization/test_bitsandbytes.py`.
 import pytest
 import torch
 
+from tests.quantization.utils import is_quant_method_supported
 from vllm import SamplingParams
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
-capability = torch.cuda.get_device_capability()
-capability = capability[0] * 10 + capability[1]
 
-
-@pytest.mark.skipif(
-    capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(),
-    reason='bitsandbytes is not supported on this GPU type.')
+@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
+                    reason='bitsandbytes is not supported on this GPU type.')
 def test_load_bnb_model(vllm_runner) -> None:
     with vllm_runner('huggyllama/llama-7b',
                      quantization='bitsandbytes',
diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index fccce7f7b..3db12f379 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -5,16 +5,12 @@ Run `pytest tests/quantization/test_fp8.py --forked`.
 import pytest
 import torch
 
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from tests.quantization.utils import is_quant_method_supported
 from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
 
-capability = torch.cuda.get_device_capability()
-capability = capability[0] * 10 + capability[1]
 
-
-@pytest.mark.skipif(
-    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
-    reason="FP8 is not supported on this GPU type.")
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
+                    reason="FP8 is not supported on this GPU type.")
 def test_load_fp16_model(vllm_runner) -> None:
     with vllm_runner("facebook/opt-125m", quantization="fp8") as llm:
 
diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py
new file mode 100644
index 000000000..0c92d565d
--- /dev/null
+++ b/tests/quantization/utils.py
@@ -0,0 +1,14 @@
+import torch
+
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+
+def is_quant_method_supported(quant_method: str) -> bool:
+    # Currently, all quantization methods require Nvidia or AMD GPUs
+    if not torch.cuda.is_available():
+        return False
+
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    return (capability <
+            QUANTIZATION_METHODS[quant_method].get_min_capability())
-- 
GitLab


From e3c12bf6d22999cfbe267a7c788f6875340616cd Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Wed, 12 Jun 2024 12:03:24 -0500
Subject: [PATCH 002/376] Revert "[CI/Build] Add `is_quant_method_supported` to
 control quantization test configurations" (#5463)

---
 tests/models/test_aqlm.py               | 13 +++++++++++--
 tests/models/test_fp8.py                | 12 ++++++++++--
 tests/models/test_gptq_marlin.py        | 13 +++++++++++--
 tests/models/test_gptq_marlin_24.py     | 13 +++++++++++--
 tests/models/test_marlin.py             | 13 +++++++++++--
 tests/quantization/test_bitsandbytes.py | 10 +++++++---
 tests/quantization/test_fp8.py          | 10 +++++++---
 tests/quantization/utils.py             | 14 --------------
 8 files changed, 68 insertions(+), 30 deletions(-)
 delete mode 100644 tests/quantization/utils.py

diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py
index 80034a511..c4ecf846e 100644
--- a/tests/models/test_aqlm.py
+++ b/tests/models/test_aqlm.py
@@ -4,8 +4,17 @@ Run `pytest tests/models/test_aqlm.py`.
 """
 
 import pytest
+import torch
 
-from tests.quantization.utils import is_quant_method_supported
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+aqlm_not_supported = True
+
+if torch.cuda.is_available():
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    aqlm_not_supported = (capability <
+                          QUANTIZATION_METHODS["aqlm"].get_min_capability())
 
 # In this test we hardcode prompts and generations for the model so we don't
 # need to require the AQLM package as a dependency
@@ -58,7 +67,7 @@ ground_truth_generations = [
 ]
 
 
-@pytest.mark.skipif(not is_quant_method_supported("aqlm"),
+@pytest.mark.skipif(aqlm_not_supported,
                     reason="AQLM is not supported on this GPU type.")
 @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
 @pytest.mark.parametrize("dtype", ["half"])
diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py
index b24c17cf3..61aee0d0a 100644
--- a/tests/models/test_fp8.py
+++ b/tests/models/test_fp8.py
@@ -8,8 +8,8 @@ import pytest
 import torch
 from transformers import AutoTokenizer
 
-from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
 
@@ -67,8 +67,16 @@ EXPECTED_STRS_MAP = {
     },
 }
 
+fp8_not_supported = True
 
-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
+if torch.cuda.is_available():
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    fp8_not_supported = (capability <
+                         QUANTIZATION_METHODS["fp8"].get_min_capability())
+
+
+@pytest.mark.skipif(fp8_not_supported,
                     reason="fp8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)
 @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py
index e30100d9b..e957450cc 100644
--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
@@ -11,8 +11,9 @@ Run `pytest tests/models/test_gptq_marlin.py`.
 import os
 
 import pytest
+import torch
 
-from tests.quantization.utils import is_quant_method_supported
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
 
 from .utils import check_logprobs_close
@@ -21,6 +22,14 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
 
 MAX_MODEL_LEN = 1024
 
+gptq_marlin_not_supported = True
+
+if torch.cuda.is_available():
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    gptq_marlin_not_supported = (
+        capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
+
 MODELS = [
     # act_order==False, group_size=channelwise
     ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
@@ -44,7 +53,7 @@ MODELS = [
 
 
 @pytest.mark.flaky(reruns=3)
-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
+@pytest.mark.skipif(gptq_marlin_not_supported,
                     reason="gptq_marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half", "bfloat16"])
diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py
index 60d9ae2f1..195c3e5b5 100644
--- a/tests/models/test_gptq_marlin_24.py
+++ b/tests/models/test_gptq_marlin_24.py
@@ -9,9 +9,18 @@ Run `pytest tests/models/test_marlin_24.py`.
 from dataclasses import dataclass
 
 import pytest
+import torch
 
 from tests.models.utils import check_logprobs_close
-from tests.quantization.utils import is_quant_method_supported
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+marlin_not_supported = True
+
+if torch.cuda.is_available():
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    marlin_not_supported = (
+        capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
 
 
 @dataclass
@@ -38,7 +47,7 @@ model_pairs = [
 
 
 @pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"),
+@pytest.mark.skipif(marlin_not_supported,
                     reason="Marlin24 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_pair", model_pairs)
 @pytest.mark.parametrize("dtype", ["half"])
diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py
index e86f6e29d..761ba6aa4 100644
--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -13,11 +13,20 @@ Run `pytest tests/models/test_marlin.py`.
 from dataclasses import dataclass
 
 import pytest
+import torch
 
-from tests.quantization.utils import is_quant_method_supported
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
 from .utils import check_logprobs_close
 
+marlin_not_supported = True
+
+if torch.cuda.is_available():
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    marlin_not_supported = (
+        capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
+
 
 @dataclass
 class ModelPair:
@@ -36,7 +45,7 @@ model_pairs = [
 
 
 @pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(not is_quant_method_supported("marlin"),
+@pytest.mark.skipif(marlin_not_supported,
                     reason="Marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("model_pair", model_pairs)
 @pytest.mark.parametrize("dtype", ["half"])
diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py
index 953fd9ba9..31e938d15 100644
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -5,12 +5,16 @@ Run `pytest tests/quantization/test_bitsandbytes.py`.
 import pytest
 import torch
 
-from tests.quantization.utils import is_quant_method_supported
 from vllm import SamplingParams
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
+capability = torch.cuda.get_device_capability()
+capability = capability[0] * 10 + capability[1]
 
-@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
-                    reason='bitsandbytes is not supported on this GPU type.')
+
+@pytest.mark.skipif(
+    capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(),
+    reason='bitsandbytes is not supported on this GPU type.')
 def test_load_bnb_model(vllm_runner) -> None:
     with vllm_runner('huggyllama/llama-7b',
                      quantization='bitsandbytes',
diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index 3db12f379..fccce7f7b 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -5,12 +5,16 @@ Run `pytest tests/quantization/test_fp8.py --forked`.
 import pytest
 import torch
 
-from tests.quantization.utils import is_quant_method_supported
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
 
+capability = torch.cuda.get_device_capability()
+capability = capability[0] * 10 + capability[1]
 
-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
-                    reason="FP8 is not supported on this GPU type.")
+
+@pytest.mark.skipif(
+    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
+    reason="FP8 is not supported on this GPU type.")
 def test_load_fp16_model(vllm_runner) -> None:
     with vllm_runner("facebook/opt-125m", quantization="fp8") as llm:
 
diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py
deleted file mode 100644
index 0c92d565d..000000000
--- a/tests/quantization/utils.py
+++ /dev/null
@@ -1,14 +0,0 @@
-import torch
-
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-
-
-def is_quant_method_supported(quant_method: str) -> bool:
-    # Currently, all quantization methods require Nvidia or AMD GPUs
-    if not torch.cuda.is_available():
-        return False
-
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    return (capability <
-            QUANTIZATION_METHODS[quant_method].get_min_capability())
-- 
GitLab


From 847cdcca1c94b12e6c118dbf863e4b111d1b4fd2 Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Thu, 13 Jun 2024 02:06:14 +0900
Subject: [PATCH 003/376] [CI] Upgrade codespell version. (#5381)

---
 .github/workflows/ruff.yml                   | 2 +-
 requirements-dev.txt                         | 2 +-
 tests/core/test_chunked_prefill_scheduler.py | 2 +-
 tests/test_sharded_state_loader.py           | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
index e71033f82..773def58f 100644
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -25,7 +25,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
+        pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
     - name: Analysing the code with ruff
       run: |
         ruff .
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 12b22a61e..b380ef205 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -3,7 +3,7 @@ yapf==0.32.0
 toml==0.10.2
 tomli==2.0.1
 ruff==0.1.5
-codespell==2.2.6
+codespell==2.3.0
 isort==5.13.2
 clang-format==18.1.5
 
diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py
index 3649e6b00..f68482cc0 100644
--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -149,7 +149,7 @@ def test_complex():
     # Only the first seq group has a new token appended.
     append_new_token(running[0], 1)
 
-    # Add 2 more requsets.
+    # Add 2 more requests.
     for i in range(2, 4):
         _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
         scheduler.add_seq_group(seq_group)
diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py
index de79c3b94..f5d956904 100644
--- a/tests/test_sharded_state_loader.py
+++ b/tests/test_sharded_state_loader.py
@@ -39,7 +39,7 @@ def test_filter_subtensors():
     filtered_state_dict = ShardedStateLoader._filter_subtensors(state_dict)
     assert tuple(filtered_state_dict.keys()) == ("a", "b", "c")
     for key, tensor in filtered_state_dict.items():
-        # NOTE: don't use `euqal` here, as the tensor might contain NaNs
+        # NOTE: don't use `equal` here, as the tensor might contain NaNs
         assert tensor is state_dict[key]
 
 
-- 
GitLab


From 1a8bfd92d5f35d638e3cfc8c4cd1779aeda0adfb Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 12 Jun 2024 11:53:03 -0700
Subject: [PATCH 004/376] [Hardware] Initial TPU integration (#5292)

---
 Dockerfile.tpu                                |  19 +
 benchmarks/benchmark_latency.py               |   2 +-
 benchmarks/benchmark_throughput.py            |   2 +-
 .../getting_started/tpu-installation.rst      |  75 +++
 docs/source/index.rst                         |   3 +-
 requirements-tpu.txt                          |   7 +
 setup.py                                      |  22 +-
 vllm/attention/backends/pallas.py             | 232 ++++++++
 vllm/attention/selector.py                    |  13 +-
 vllm/config.py                                |   6 +-
 vllm/engine/arg_utils.py                      |   2 +-
 vllm/engine/async_llm_engine.py               |   3 +
 vllm/engine/llm_engine.py                     |   3 +
 vllm/envs.py                                  |   6 +
 vllm/executor/tpu_executor.py                 | 101 ++++
 vllm/model_executor/custom_op.py              |   4 +-
 .../model_executor/layers/rotary_embedding.py |  77 ++-
 vllm/model_executor/model_loader/loader.py    |  27 +-
 vllm/utils.py                                 |  14 +
 vllm/worker/cache_engine.py                   |   9 +-
 vllm/worker/tpu_model_runner.py               | 525 ++++++++++++++++++
 vllm/worker/tpu_worker.py                     | 198 +++++++
 22 files changed, 1322 insertions(+), 28 deletions(-)
 create mode 100644 Dockerfile.tpu
 create mode 100644 docs/source/getting_started/tpu-installation.rst
 create mode 100644 requirements-tpu.txt
 create mode 100644 vllm/attention/backends/pallas.py
 create mode 100644 vllm/executor/tpu_executor.py
 create mode 100644 vllm/worker/tpu_model_runner.py
 create mode 100644 vllm/worker/tpu_worker.py

diff --git a/Dockerfile.tpu b/Dockerfile.tpu
new file mode 100644
index 000000000..931c844c0
--- /dev/null
+++ b/Dockerfile.tpu
@@ -0,0 +1,19 @@
+ARG NIGHTLY_DATE="20240601"
+ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
+
+FROM $BASE_IMAGE
+
+WORKDIR /workspace
+COPY . /workspace/vllm
+
+ENV VLLM_TARGET_DEVICE="tpu"
+# Install aiohttp separately to avoid build errors.
+RUN pip install aiohttp
+# Install the TPU and Pallas dependencies.
+RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
+RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+
+# Build vLLM.
+RUN cd /workspace/vllm && python setup.py develop
+
+CMD ["/bin/bash"]
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 1a41b66b3..17edb7515 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -189,7 +189,7 @@ if __name__ == '__main__':
         "--device",
         type=str,
         default="cuda",
-        choices=["cuda", "cpu"],
+        choices=["cuda", "cpu", "tpu"],
         help='device type for vLLM execution, supporting CUDA and CPU.')
     parser.add_argument('--block-size',
                         type=int,
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 90f7433e0..07b2f8541 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -346,7 +346,7 @@ if __name__ == "__main__":
         "--device",
         type=str,
         default="cuda",
-        choices=["cuda", "cpu"],
+        choices=["cuda", "cpu", "tpu"],
         help='device type for vLLM execution, supporting CUDA and CPU.')
     parser.add_argument(
         "--enable-prefix-caching",
diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst
new file mode 100644
index 000000000..3627600e1
--- /dev/null
+++ b/docs/source/getting_started/tpu-installation.rst
@@ -0,0 +1,75 @@
+.. _installation_tpu:
+
+Installation with TPU
+=====================
+
+vLLM supports Google Cloud TPUs using PyTorch XLA.
+
+Requirements
+------------
+
+* Google Cloud TPU VM (single host)
+* TPU versions: v5e, v5p, v4
+* Python: 3.10
+
+Installation options:
+
+1. :ref:`Build a docker image with Dockerfile <build_docker_tpu>`.
+2. :ref:`Build from source <build_from_source_tpu>`.
+
+.. _build_docker_tpu:
+
+Build a docker image with :code:`Dockerfile.tpu`
+------------------------------------------------
+
+`Dockerfile.tpu <https://github.com/vllm-project/vllm/blob/main/Dockerfile.tpu>`_ is provided to build a docker image with TPU support.
+
+.. code-block:: console
+
+    $ docker build -f Dockerfile.tpu -t vllm-tpu .
+
+
+You can run the docker image with the following command:
+
+.. code-block:: console
+
+    $ # Make sure to add `--privileged --net host --shm-size=16G`.
+    $ docker run --privileged --net host --shm-size=16G -it vllm-tpu
+
+
+.. _build_from_source_tpu:
+
+Build from source
+-----------------
+
+You can also build and install the TPU backend from source.
+
+First, install the dependencies:
+
+.. code-block:: console
+
+    $ # (Recommended) Create a new conda environment.
+    $ conda create -n myenv python=3.10 -y
+    $ conda activate myenv
+
+    $ # Clean up the existing torch and torch-xla packages.
+    $ pip uninstall torch torch-xla -y
+
+    $ # Install PyTorch and PyTorch XLA.
+    $ export DATE="+20240601"
+    $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-nightly${DATE}-cp310-cp310-linux_x86_64.whl
+    $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly${DATE}-cp310-cp310-linux_x86_64.whl
+
+    $ # Install JAX and Pallas.
+    $ pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
+    $ pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+
+    $ # Install other build dependencies.
+    $ pip install packaging aiohttp
+
+
+Next, build vLLM from source. This will only take a few seconds:
+
+.. code-block:: console
+
+    $ VLLM_TARGET_DEVICE="tpu" python setup.py develop
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 807251d02..b7c0d5b88 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -63,8 +63,9 @@ Documentation
 
    getting_started/installation
    getting_started/amd-installation
-   getting_started/neuron-installation
    getting_started/cpu-installation
+   getting_started/neuron-installation
+   getting_started/tpu-installation
    getting_started/quickstart
    getting_started/debugging
    getting_started/examples/examples_index
diff --git a/requirements-tpu.txt b/requirements-tpu.txt
new file mode 100644
index 000000000..22487f552
--- /dev/null
+++ b/requirements-tpu.txt
@@ -0,0 +1,7 @@
+# Common dependencies
+-r requirements-common.txt
+
+# Dependencies for TPU
+# Currently, the TPU backend uses a nightly version of PyTorch XLA.
+# You can install the dependencies in Dockerfile.tpu.
+triton  # To avoid import errors
diff --git a/setup.py b/setup.py
index 53a697232..12e5c3456 100644
--- a/setup.py
+++ b/setup.py
@@ -206,9 +206,9 @@ class cmake_build_ext(build_ext):
 
 
 def _is_cuda() -> bool:
-    return VLLM_TARGET_DEVICE == "cuda" \
-            and torch.version.cuda is not None \
-            and not _is_neuron()
+    has_cuda = torch.version.cuda is not None
+    return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
+            and not (_is_neuron() or _is_tpu()))
 
 
 def _is_hip() -> bool:
@@ -225,10 +225,18 @@ def _is_neuron() -> bool:
     return torch_neuronx_installed or VLLM_TARGET_DEVICE == "neuron"
 
 
+def _is_tpu() -> bool:
+    return VLLM_TARGET_DEVICE == "tpu"
+
+
 def _is_cpu() -> bool:
     return VLLM_TARGET_DEVICE == "cpu"
 
 
+def _build_custom_ops() -> bool:
+    return _is_cuda() or _is_hip() or _is_cpu()
+
+
 def _install_punica() -> bool:
     return envs.VLLM_INSTALL_PUNICA_KERNELS
 
@@ -325,6 +333,8 @@ def get_vllm_version() -> str:
         if neuron_version != MAIN_CUDA_VERSION:
             neuron_version_str = neuron_version.replace(".", "")[:3]
             version += f"+neuron{neuron_version_str}"
+    elif _is_tpu():
+        version += "+tpu"
     elif _is_cpu():
         version += "+cpu"
     else:
@@ -372,6 +382,8 @@ def get_requirements() -> List[str]:
         requirements = _read_requirements("requirements-rocm.txt")
     elif _is_neuron():
         requirements = _read_requirements("requirements-neuron.txt")
+    elif _is_tpu():
+        requirements = _read_requirements("requirements-tpu.txt")
     elif _is_cpu():
         requirements = _read_requirements("requirements-cpu.txt")
     else:
@@ -385,7 +397,7 @@ ext_modules = []
 if _is_cuda() or _is_hip():
     ext_modules.append(CMakeExtension(name="vllm._moe_C"))
 
-if not _is_neuron():
+if _build_custom_ops():
     ext_modules.append(CMakeExtension(name="vllm._C"))
 
     if _install_punica():
@@ -428,6 +440,6 @@ setup(
     extras_require={
         "tensorizer": ["tensorizer>=2.9.0"],
     },
-    cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
+    cmdclass={"build_ext": cmake_build_ext} if _build_custom_ops() else {},
     package_data=package_data,
 )
diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
new file mode 100644
index 000000000..b203c5ec5
--- /dev/null
+++ b/vllm/attention/backends/pallas.py
@@ -0,0 +1,232 @@
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Type
+
+import torch
+import torch_xla.experimental.custom_kernel  # Required to register custom ops.
+import torch_xla.experimental.dynamo_set_buffer_donor
+
+from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
+                                              AttentionMetadata)
+
+
+class PallasAttentionBackend(AttentionBackend):
+
+    @staticmethod
+    def get_impl_cls() -> Type["PallasAttentionBackendImpl"]:
+        return PallasAttentionBackendImpl
+
+    @staticmethod
+    def make_metadata(*args, **kwargs) -> "PallasMetadata":
+        return PallasMetadata(*args, **kwargs)
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> Tuple[int, ...]:
+        return (num_kv_heads, num_blocks, block_size, head_size)
+
+    @staticmethod
+    def swap_blocks(
+        src_kv_cache: torch.Tensor,
+        dst_kv_cache: torch.Tensor,
+        src_to_dst: Dict[int, int],
+    ) -> None:
+        raise NotImplementedError("swap_blocks is not implemented.")
+
+    @staticmethod
+    def copy_blocks(
+        kv_caches: List[torch.Tensor],
+        src_to_dists: Dict[int, List[int]],
+    ) -> None:
+        # TODO(woosuk): Implement this.
+        raise NotImplementedError("copy_blocks is not implemented.")
+
+
+@dataclass
+class PallasMetadata(AttentionMetadata):
+
+    # Currently, input sequences can only contain all prefills
+    # or all decoding.
+    block_tables: Optional[torch.Tensor]
+    context_lens: Optional[torch.Tensor]
+
+    @property
+    def prefill_metadata(self) -> Optional["PallasMetadata"]:
+        if self.num_prefills == 0:
+            return None
+
+        assert self.num_decode_tokens == 0
+        assert self.block_tables is None
+        assert self.context_lens is None
+        return self
+
+    @property
+    def decode_metadata(self) -> Optional["PallasMetadata"]:
+        if self.num_decode_tokens == 0:
+            return None
+
+        assert self.num_prefills == 0
+        assert self.num_prefill_tokens == 0
+        assert self.block_tables is not None
+        assert self.context_lens is not None
+        return self
+
+
+class PallasAttentionBackendImpl(AttentionImpl):
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: Optional[List[float]],
+        sliding_window: Optional[int],
+        kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        self.num_heads = num_heads
+        self.head_size = head_size
+        self.scale = float(scale)
+        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
+
+        assert self.num_heads % self.num_kv_heads == 0
+        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
+        if head_size % 128 != 0:
+            raise NotImplementedError("Head size must be a multiple of 128.")
+        if alibi_slopes is not None:
+            raise NotImplementedError("Alibi slopes is not supported.")
+        if sliding_window is not None:
+            raise NotImplementedError("Sliding window is not supported.")
+        if kv_cache_dtype != "auto":
+            raise NotImplementedError("FP8 KV cache dtype is not supported.")
+        if blocksparse_params is not None:
+            raise NotImplementedError("Blocksparse is not supported.")
+
+        if torch_xla.tpu.version() < 4:
+            raise NotImplementedError("TPU version must be 4 or higher.")
+
+        self.megacore_mode = None
+        tpu_type = torch_xla.tpu.get_tpu_env()["TYPE"].lower()
+        if not tpu_type.endswith("lite"):
+            if self.num_kv_heads % 2 == 0:
+                self.megacore_mode = "kv_head"
+            else:
+                # NOTE(woosuk): If the batch size is not a multiple of 2, the
+                # megacore mode will be None.
+                self.megacore_mode = "batch"
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kv_cache: Tuple[Optional[torch.Tensor], Optional[torch.Tensor]],
+        attn_metadata: PallasMetadata,
+        kv_scale: float = 1.0,
+    ) -> torch.Tensor:
+        """Forward pass with Pallas attention.
+
+        Args:
+            query: shape = [batch_size, seq_len, num_heads * head_size]
+            key: shape = [batch_size, seq_len, num_kv_heads * head_size]
+            value: shape = [batch_size, seq_len, num_kv_heads * head_size]
+            key_cache = [num_kv_heads, num_blocks, block_size, head_size]
+            value_cache = [num_kv_heads, num_blocks, block_size, head_size]
+            attn_metadata: Metadata for attention.
+        Returns:
+            shape = [batch_size, seq_len, num_heads * head_size]
+        """
+        assert kv_scale == 1.0
+        batch_size, seq_len, hidden_size = query.shape
+        query = query.view(batch_size, seq_len, self.num_heads, self.head_size)
+        key = key.view(batch_size, seq_len, self.num_kv_heads, self.head_size)
+        value = value.view(batch_size, seq_len, self.num_kv_heads,
+                           self.head_size)
+
+        if kv_cache[0] is not None:
+            slot_mapping = attn_metadata.slot_mapping
+            key_cache, value_cache = kv_cache
+            write_to_kv_cache(key, value, key_cache, value_cache, slot_mapping)
+
+        query = query * self.scale
+        if attn_metadata.num_prefills > 0:
+            assert seq_len % 16 == 0, (
+                "Pallas FlashAttention kernel requires seq_len to be a "
+                f"multiple of 16 but got {seq_len}")
+
+            # Handle GQA/MQA.
+            if self.num_kv_heads != self.num_heads:
+                key = key.repeat_interleave(self.num_queries_per_kv, dim=-2)
+                key = key.view(batch_size, seq_len, self.num_heads,
+                               self.head_size)
+                value = value.repeat_interleave(self.num_queries_per_kv,
+                                                dim=-2)
+                value = value.view(batch_size, seq_len, self.num_heads,
+                                   self.head_size)
+            # FlashAttention requires [batch_size, num_heads, seq_len, d_model]
+            # while the input is [batch_size, seq_len, num_heads, d_model].
+            # Permute the input to match the required format.
+            output = torch.ops.xla.flash_attention(
+                query.permute(0, 2, 1, 3),
+                key.permute(0, 2, 1, 3),
+                value.permute(0, 2, 1, 3),
+                True,
+            )
+            output = output.permute(0, 2, 1, 3)
+        else:
+            # Decoding run.
+            assert kv_cache is not None
+
+            pages_per_compute_block = 16  # TODO(woosuk): Tune this value.
+            if self.megacore_mode == "batch" and batch_size % 2 != 0:
+                megacore_mode = None
+            else:
+                megacore_mode = self.megacore_mode
+
+            # NOTE(woosuk): A temporary workaround to avoid the error:
+            # "xla::paged_attention() Expected a value of type 'str' for
+            # argument 'megacore_mode' but instead found type 'NoneType'."
+            if megacore_mode is not None:
+                output = torch.ops.xla.paged_attention(
+                    query.squeeze(dim=1),
+                    key_cache,
+                    value_cache,
+                    attn_metadata.context_lens,
+                    attn_metadata.block_tables,
+                    pages_per_compute_block,
+                    megacore_mode=megacore_mode,
+                )
+            else:
+                output = torch.ops.xla.paged_attention(
+                    query.squeeze(dim=1),
+                    key_cache,
+                    value_cache,
+                    attn_metadata.context_lens,
+                    attn_metadata.block_tables,
+                    pages_per_compute_block,
+                )
+
+        # Reshape the output tensor.
+        return output.reshape(batch_size, seq_len, hidden_size)
+
+
+def write_to_kv_cache(
+    key: torch.Tensor,
+    value: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    slot_mapping: torch.Tensor,
+) -> None:
+    torch.ops.xla.dynamo_set_buffer_donor_(key_cache, True)
+    torch.ops.xla.dynamo_set_buffer_donor_(value_cache, True)
+
+    key = key.flatten(0, 2)
+    value = value.flatten(0, 2)
+    key_cache = key_cache.flatten(0, 2)
+    value_cache = value_cache.flatten(0, 2)
+    key_cache.index_copy_(0, slot_mapping, key)
+    value_cache.index_copy_(0, slot_mapping, value)
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 7253483f9..3f0e29c73 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -7,7 +7,7 @@ import torch
 import vllm.envs as envs
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.logger import init_logger
-from vllm.utils import is_cpu, is_hip
+from vllm.utils import is_cpu, is_hip, is_tpu
 
 logger = init_logger(__name__)
 
@@ -18,6 +18,7 @@ class _Backend(enum.Enum):
     ROCM_FLASH = enum.auto()
     TORCH_SDPA = enum.auto()
     FLASHINFER = enum.auto()
+    PALLAS = enum.auto()
 
 
 @lru_cache(maxsize=None)
@@ -66,6 +67,10 @@ def get_attn_backend(
                        "Please make sure --enforce-eager is set.")
         from vllm.attention.backends.flashinfer import FlashInferBackend
         return FlashInferBackend
+    elif backend == _Backend.PALLAS:
+        logger.info("Using Pallas backend.")
+        from vllm.attention.backends.pallas import PallasAttentionBackend
+        return PallasAttentionBackend
     else:
         raise ValueError("Invalid attention backend.")
 
@@ -80,7 +85,6 @@ def which_attn_to_use(
     block_size: int,
 ) -> _Backend:
     """Returns which flash attention backend to use."""
-
     # Default case.
     selected_backend = _Backend.FLASH_ATTN
 
@@ -100,6 +104,11 @@ def which_attn_to_use(
             logger.info("Cannot use %s backend on CPU.", selected_backend)
         return _Backend.TORCH_SDPA
 
+    if is_tpu():
+        if selected_backend != _Backend.PALLAS:
+            logger.info("Cannot use %s backend on TPU.", selected_backend)
+        return _Backend.PALLAS
+
     if is_hip():
         # AMD GPUs.
         selected_backend = (_Backend.ROCM_FLASH if selected_backend
diff --git a/vllm/config.py b/vllm/config.py
index 50b0156b1..2513d43ce 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -11,7 +11,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.model_executor.models import ModelRegistry
 from vllm.transformers_utils.config import get_config, get_hf_text_config
-from vllm.utils import get_cpu_memory, is_cpu, is_hip, is_neuron
+from vllm.utils import get_cpu_memory, is_cpu, is_hip, is_neuron, is_tpu
 
 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
@@ -748,6 +748,8 @@ class DeviceConfig:
             # Automated device type detection
             if is_neuron():
                 self.device_type = "neuron"
+            elif is_tpu():
+                self.device_type = "tpu"
             elif is_cpu():
                 self.device_type = "cpu"
             else:
@@ -761,6 +763,8 @@ class DeviceConfig:
         # Some device types require processing inputs on CPU
         if self.device_type in ["neuron"]:
             self.device = torch.device("cpu")
+        elif self.device_type in ["tpu"]:
+            self.device = None
         else:
             # Set device with device type
             self.device = torch.device(self.device_type)
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index cd29db7d7..227de5475 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -504,7 +504,7 @@ class EngineArgs:
         parser.add_argument("--device",
                             type=str,
                             default=EngineArgs.device,
-                            choices=["auto", "cuda", "neuron", "cpu"],
+                            choices=["auto", "cuda", "neuron", "cpu", "tpu"],
                             help='Device type for vLLM execution.')
 
         # Related to Vision-language models such as llava
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index aa1f07b5b..943402c86 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -375,6 +375,9 @@ class AsyncLLMEngine:
         if engine_config.device_config.device_type == "neuron":
             from vllm.executor.neuron_executor import NeuronExecutorAsync
             executor_class = NeuronExecutorAsync
+        elif engine_config.device_config.device_type == "tpu":
+            from vllm.executor.tpu_executor import TPUExecutorAsync
+            executor_class = TPUExecutorAsync
         elif engine_config.device_config.device_type == "cpu":
             assert distributed_executor_backend is None, (
                 "Distributed execution is not supported with the CPU backend.")
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 4f56bbd5c..ea7547584 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -341,6 +341,9 @@ class LLMEngine:
         if engine_config.device_config.device_type == "neuron":
             from vllm.executor.neuron_executor import NeuronExecutor
             executor_class = NeuronExecutor
+        elif engine_config.device_config.device_type == "tpu":
+            from vllm.executor.tpu_executor import TPUExecutor
+            executor_class = TPUExecutor
         elif engine_config.device_config.device_type == "cpu":
             from vllm.executor.cpu_executor import CPUExecutor
             executor_class = CPUExecutor
diff --git a/vllm/envs.py b/vllm/envs.py
index f0513b9af..f03b69f4b 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -27,6 +27,7 @@ if TYPE_CHECKING:
     VLLM_TRACE_FUNCTION: int = 0
     VLLM_ATTENTION_BACKEND: Optional[str] = None
     VLLM_CPU_KVCACHE_SPACE: int = 0
+    VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/"
     VLLM_USE_RAY_COMPILED_DAG: bool = False
     VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
@@ -217,6 +218,11 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     # Default is 5 seconds
     "VLLM_IMAGE_FETCH_TIMEOUT":
     lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),
+
+    # Path to the XLA persistent cache directory.
+    # Only used for XLA devices such as TPUs.
+    "VLLM_XLA_CACHE_PATH":
+    lambda: os.getenv("VLLM_XLA_CACHE_PATH", "~/.vllm/xla_cache/"),
 }
 
 # end-env-vars-definition
diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py
new file mode 100644
index 000000000..7061ad85f
--- /dev/null
+++ b/vllm/executor/tpu_executor.py
@@ -0,0 +1,101 @@
+from typing import List, Set, Tuple
+
+import torch
+
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        make_async)
+
+logger = init_logger(__name__)
+
+
+class TPUExecutor(ExecutorBase):
+
+    def _init_executor(self) -> None:
+        assert not self.scheduler_config.chunked_prefill_enabled, (
+            "Chunked prefill is not yet supported for TPU backend")
+        assert not self.speculative_config, (
+            "Speculative decoding is not yet supported for TPU backend")
+        if self.model_config.dtype in (torch.float16, torch.float32):
+            logger.warning(
+                "The TPU backend currently does not support %s. "
+                "Using bfloat16 instead.", self.model_config.dtype)
+            self.model_config.dtype = torch.bfloat16
+
+        # Instantiate the worker and load the model to the device.
+        self._init_worker()
+
+    def _init_worker(self):
+        from vllm.worker.tpu_worker import TPUWorker
+
+        assert self.parallel_config.world_size == 1, (
+            "TPUExecutor currently only supports a single TPU chip.")
+        distributed_init_method = get_distributed_init_method(
+            get_ip(), get_open_port())
+        self.driver_worker = TPUWorker(
+            self.model_config,
+            self.parallel_config,
+            self.scheduler_config,
+            self.device_config,
+            self.cache_config,
+            self.load_config,
+            self.vision_language_config,
+            local_rank=0,
+            rank=0,
+            distributed_init_method=distributed_init_method,
+        )
+        self.driver_worker.init_device()
+        self.driver_worker.load_model()
+
+    def initialize_cache(
+        self,
+        num_gpu_blocks: int,
+        num_cpu_blocks: int,
+    ) -> None:
+        """Initialize the KV cache by invoking the underlying worker."""
+        # NOTE: This is logged in the executor because there can be >1 worker
+        # with other executors. We could log in the engine level, but work
+        # remains to abstract away the device for non-GPU configurations.
+        logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
+                    num_cpu_blocks)
+        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
+
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Determine the number of available KV blocks by invoking the
+        underlying worker.
+        """
+        return self.driver_worker.determine_num_available_blocks()
+
+    def execute_model(
+        self,
+        execute_model_req: ExecuteModelRequest,
+    ) -> List[SamplerOutput]:
+        output = self.driver_worker.execute_model(execute_model_req)
+        return output
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        raise NotImplementedError("LoRA is not implemented for TPU backend.")
+
+    def remove_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError("LoRA is not implemented for TPU backend.")
+
+    def list_loras(self) -> Set[int]:
+        raise NotImplementedError("LoRA is not implemented for TPU backend.")
+
+    def check_health(self) -> None:
+        # TPUExecutor will always be healthy as long as it's running.
+        return
+
+
+class TPUExecutorAsync(TPUExecutor, ExecutorAsyncBase):
+
+    async def execute_model_async(
+        self,
+        sexecute_model_req: ExecuteModelRequest,
+    ) -> SamplerOutput:
+        output = await make_async(self.driver_worker.execute_model
+                                  )(sexecute_model_req)
+        return output
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index 1d49213cd..56aa629ae 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -1,6 +1,6 @@
 import torch.nn as nn
 
-from vllm.utils import is_cpu, is_hip
+from vllm.utils import is_cpu, is_hip, is_tpu
 
 
 class CustomOp(nn.Module):
@@ -56,5 +56,7 @@ class CustomOp(nn.Module):
             return self.forward_hip
         elif is_cpu():
             return self.forward_cpu
+        elif is_tpu():
+            return self.forward_tpu
         else:
             return self.forward_cuda
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index d2652106b..792c47293 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -28,6 +28,7 @@ import torch
 import torch.nn as nn
 
 from vllm.model_executor.custom_op import CustomOp
+from vllm.utils import is_tpu
 
 
 def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
@@ -43,6 +44,19 @@ def _rotate_gptj(x: torch.Tensor) -> torch.Tensor:
     return x.flatten(-2)
 
 
+def _apply_rotary_emb(
+    x: torch.Tensor,
+    freqs_cis: torch.Tensor,
+) -> torch.Tensor:
+    x_ = torch.view_as_complex(
+        torch.stack(torch.chunk(x.transpose(1, 2).float(), 2, dim=-1), dim=-1))
+    x_out = torch.view_as_real(x_ * freqs_cis).type_as(x)
+    x_out = torch.cat(torch.chunk(x_out, 2, dim=-1), dim=-2)
+    x_out = x_out.reshape(x_out.shape[0], x_out.shape[1], x_out.shape[2],
+                          -1).transpose(1, 2)
+    return x_out
+
+
 class RotaryEmbedding(CustomOp):
     """Original rotary positional embedding."""
 
@@ -64,8 +78,14 @@ class RotaryEmbedding(CustomOp):
         self.dtype = dtype
 
         cache = self._compute_cos_sin_cache()
-        cache = cache.to(dtype)
-        self.register_buffer("cos_sin_cache", cache, persistent=False)
+        self.use_native2 = is_tpu() and is_neox_style
+        if not self.use_native2:
+            cache = cache.to(dtype)
+            self.register_buffer("cos_sin_cache", cache, persistent=False)
+        else:
+            cos, sin = cache.chunk(2, dim=-1)
+            freqs_cis = cos + 1j * sin
+            self.register_buffer("freqs_cis", freqs_cis, persistent=False)
 
     def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
         """Compute the inverse frequency."""
@@ -100,7 +120,11 @@ class RotaryEmbedding(CustomOp):
         key: torch.Tensor,
         offsets: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """PyTorch-native implementation equivalent to forward()."""
+        """A PyTorch-native implementation equivalent to forward().
+
+        This method mimics the implementation of the custom CUDA kernel
+        used in `forward_cuda()`.
+        """
         query = query.view(*query.shape[:-1], -1, self.head_size)
         key = key.view(*key.shape[:-1], -1, self.head_size)
 
@@ -138,6 +162,42 @@ class RotaryEmbedding(CustomOp):
         key = key.flatten(-2)
         return query, key
 
+    def forward_native2(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Another PyTorch-native implementation of forward().
+
+        This method might perform better than `forward_native()` when compiled.
+        """
+        if positions.dim() == 1:
+            batch_size = 1
+            seq_len = positions.shape[0]
+        else:
+            batch_size, seq_len = positions.shape
+        if offsets is not None:
+            positions = positions + offsets
+        freqs_cis = self.freqs_cis.index_select(0, positions.flatten())
+        freqs_cis = freqs_cis.view(batch_size, 1, seq_len, -1)
+
+        query_shape = query.shape
+        query = query.view(batch_size, seq_len, -1, self.head_size)
+        query_rot = query[..., :self.rotary_dim]
+        query_pass = query[..., self.rotary_dim:]
+        query_rot = _apply_rotary_emb(query_rot, freqs_cis)
+        query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
+
+        key_shape = key.shape
+        key = key.view(batch_size, seq_len, -1, self.head_size)
+        key_rot = key[..., :self.rotary_dim]
+        key_pass = key[..., self.rotary_dim:]
+        key_rot = _apply_rotary_emb(key_rot, freqs_cis)
+        key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
+        return query, key
+
     def forward_cuda(
         self,
         positions: torch.Tensor,
@@ -161,6 +221,17 @@ class RotaryEmbedding(CustomOp):
                                  self.cos_sin_cache, self.is_neox_style)
         return query, key
 
+    def forward_tpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        forward_fn = (self.forward_native2
+                      if self.use_native2 else self.forward_native)
+        return forward_fn(positions, query, key, offsets)
+
     def extra_repr(self) -> str:
         s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
         s += f", max_position_embeddings={self.max_position_embeddings}"
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 9c2eaee2e..f4c3dcbac 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -34,6 +34,7 @@ from vllm.model_executor.model_loader.weight_utils import (
     pt_weights_iterator, safetensors_weights_iterator)
 from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
 from vllm.model_executor.utils import set_weight_attrs
+from vllm.utils import is_tpu
 
 logger = init_logger(__name__)
 
@@ -227,12 +228,26 @@ class DefaultModelLoader(BaseModelLoader):
         if self.load_config.load_format == LoadFormat.NPCACHE:
             # Currently np_cache only support *.bin checkpoints
             assert use_safetensors is False
-            return np_cache_weights_iterator(model_name_or_path,
-                                             self.load_config.download_dir,
-                                             hf_folder, hf_weights_files)
-        if use_safetensors:
-            return safetensors_weights_iterator(hf_weights_files)
-        return pt_weights_iterator(hf_weights_files)
+            weights_iterator = np_cache_weights_iterator(
+                model_name_or_path, self.load_config.download_dir, hf_folder,
+                hf_weights_files)
+        elif use_safetensors:
+            weights_iterator = safetensors_weights_iterator(hf_weights_files)
+        else:
+            weights_iterator = pt_weights_iterator(hf_weights_files)
+
+        if is_tpu():
+            # In PyTorch XLA, we should call `xm.mark_step` frequently so that
+            # not too many ops are accumulated in the XLA program.
+            import torch_xla.core.xla_model as xm
+
+            def _xla_weights_iterator(iterator: Generator):
+                for weights in iterator:
+                    yield weights
+                    xm.mark_step()
+
+            weights_iterator = _xla_weights_iterator(weights_iterator)
+        return weights_iterator
 
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
diff --git a/vllm/utils.py b/vllm/utils.py
index 54d446b23..af585929d 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -146,6 +146,15 @@ def is_neuron() -> bool:
     return transformers_neuronx is not None
 
 
+@lru_cache(maxsize=None)
+def is_tpu() -> bool:
+    try:
+        import libtpu
+    except ImportError:
+        libtpu = None
+    return libtpu is not None
+
+
 @lru_cache(maxsize=None)
 def get_max_shared_memory_bytes(gpu: int = 0) -> int:
     """Returns the maximum shared memory per thread block in bytes."""
@@ -546,6 +555,11 @@ def maybe_expand_dim(tensor: torch.Tensor,
     return tensor
 
 
+def get_dtype_size(dtype: torch.dtype) -> int:
+    """Get the size of the data type in bytes."""
+    return torch.tensor([], dtype=dtype).element_size()
+
+
 def merge_dicts(dict1: Dict[Any, List[Any]],
                 dict2: Dict[Any, List[Any]]) -> Dict[Any, List[Any]]:
     """Merge 2 dicts that have key -> List of items.
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py
index 2f0e59f7a..341b177d4 100644
--- a/vllm/worker/cache_engine.py
+++ b/vllm/worker/cache_engine.py
@@ -6,7 +6,8 @@ import torch
 from vllm.attention import get_attn_backend
 from vllm.config import CacheConfig, ModelConfig, ParallelConfig
 from vllm.logger import init_logger
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, is_pin_memory_available
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size,
+                        is_pin_memory_available)
 
 logger = init_logger(__name__)
 
@@ -108,9 +109,5 @@ class CacheEngine:
             dtype = model_config.dtype
         else:
             dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
-        dtype_size = _get_dtype_size(dtype)
+        dtype_size = get_dtype_size(dtype)
         return dtype_size * total
-
-
-def _get_dtype_size(dtype: torch.dtype) -> int:
-    return torch.tensor([], dtype=dtype).element_size()
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
new file mode 100644
index 000000000..5003d3b0c
--- /dev/null
+++ b/vllm/worker/tpu_model_runner.py
@@ -0,0 +1,525 @@
+import time
+from typing import List, Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch_xla.core.xla_model as xm
+
+from vllm.attention import AttentionMetadata, get_attn_backend
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig, VisionLanguageConfig)
+from vllm.logger import init_logger
+from vllm.model_executor.model_loader import get_model
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
+                           SamplerOutput, SequenceGroupMetadata,
+                           SequenceOutput)
+from vllm.utils import make_tensor_with_pad
+
+logger = init_logger(__name__)
+
+_PAD_SLOT_ID = 0  # FIXME(woosuk)
+
+
+class TPUModelRunner:
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        cache_config: CacheConfig,
+        load_config: LoadConfig,
+        vision_language_config: Optional[VisionLanguageConfig] = None,
+    ):
+        self.model_config = model_config
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+        self.device_config = device_config
+        self.cache_config = cache_config
+        self.load_config = load_config
+        self.vision_language_config = vision_language_config
+
+        self.block_size = self.cache_config.block_size
+        self.max_num_blocks_per_seq = (self.model_config.max_model_len //
+                                       self.block_size)
+        self.block_tables = np.zeros(
+            (self.scheduler_config.max_num_seqs, self.max_num_blocks_per_seq),
+            dtype=np.int32)
+        self.attn_backend = get_attn_backend(
+            self.model_config.get_num_attention_heads(self.parallel_config),
+            self.model_config.get_head_size(),
+            self.model_config.get_num_kv_heads(self.parallel_config),
+            self.model_config.get_sliding_window(),
+            self.model_config.dtype,
+            self.cache_config.cache_dtype,
+            self.block_size,
+            False,
+        )
+
+    def load_model(self) -> None:
+        self.device = self.device_config.device
+
+        model = get_model(
+            model_config=self.model_config,
+            load_config=self.load_config,
+            device_config=self.device_config,
+            parallel_config=self.parallel_config,
+            cache_config=self.cache_config,
+            scheduler_config=self.scheduler_config,
+            vision_language_config=self.vision_language_config,
+            lora_config=None,
+        )
+        xm.wait_device_ops()
+
+        model = ModelWrapper(model)
+        self.model = torch.compile(model, backend="openxla", fullgraph=True)
+
+    def _dummy_run(
+        self,
+        batch_size: int,
+        seq_len: int,
+        kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
+        is_prompt: bool,
+    ) -> None:
+        if is_prompt:
+            seq_len = (seq_len + 15) // 16 * 16
+            token_ids = torch.zeros((batch_size, seq_len),
+                                    dtype=torch.int32,
+                                    device=self.device)
+            position_ids = torch.zeros((batch_size, seq_len),
+                                       dtype=torch.int32,
+                                       device=self.device)
+            slot_mapping = torch.zeros((batch_size, seq_len),
+                                       dtype=torch.int64,
+                                       device=self.device)
+            attn_metadata = self.attn_backend.make_metadata(
+                num_prefills=batch_size,
+                num_prefill_tokens=batch_size * seq_len,
+                num_decode_tokens=0,
+                slot_mapping=slot_mapping,
+                block_tables=None,
+                context_lens=None,
+            )
+            input_lens = torch.ones((batch_size, ),
+                                    dtype=torch.int32,
+                                    device=self.device)
+        else:
+            assert seq_len == 1
+            token_ids = torch.zeros((batch_size, seq_len),
+                                    dtype=torch.int32,
+                                    device=self.device)
+            position_ids = torch.zeros((batch_size, seq_len),
+                                       dtype=torch.int32,
+                                       device=self.device)
+            slot_mapping = torch.zeros((batch_size, seq_len),
+                                       dtype=torch.int64,
+                                       device=self.device)
+            block_tables = torch.zeros(
+                (batch_size, self.max_num_blocks_per_seq),
+                dtype=torch.int32,
+                device=self.device)
+            context_lens = torch.ones((batch_size, ),
+                                      dtype=torch.int32,
+                                      device=self.device)
+            input_lens = torch.ones((batch_size, ),
+                                    dtype=torch.int32,
+                                    device=self.device)
+            attn_metadata = self.attn_backend.make_metadata(
+                num_prefills=0,
+                num_prefill_tokens=0,
+                num_decode_tokens=batch_size * seq_len,
+                slot_mapping=slot_mapping,
+                block_tables=block_tables,
+                context_lens=context_lens,
+            )
+        t = torch.ones((batch_size, ), dtype=torch.float32, device=self.device)
+        p = torch.ones((batch_size, ), dtype=torch.float32, device=self.device)
+
+        # Dummy run.
+        self.model(token_ids, position_ids, kv_caches, attn_metadata,
+                   input_lens, t, p)
+
+    def warmup_model(
+        self,
+        kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
+    ) -> None:
+        # Prefill
+        logger.info("Compiling the model with different input shapes...")
+        start = time.time()
+        for batch_size in [1]:
+            seq_len = 16
+            while True:
+                self._dummy_run(batch_size, seq_len, kv_caches, is_prompt=True)
+                xm.wait_device_ops()
+                logger.info("batch_size: %d, seq_len: %d", batch_size, seq_len)
+
+                if seq_len >= self.model_config.max_model_len:
+                    break
+                num_tokens = batch_size * seq_len
+                if num_tokens >= self.scheduler_config.max_num_batched_tokens:
+                    break
+                seq_len = seq_len * 2
+
+        end = time.time()
+        logger.info("Compilation for prefill done in %.2f s.", end - start)
+
+        # Decode
+        start = time.time()
+        seq_len = 1
+        batch_size = 1
+        while True:
+            self._dummy_run(batch_size, seq_len, kv_caches, is_prompt=False)
+            xm.wait_device_ops()
+            logger.info("batch_size: %d, seq_len: %d", batch_size, seq_len)
+
+            if batch_size >= self.scheduler_config.max_num_seqs:
+                break
+            batch_size = batch_size + 16 if batch_size >= 16 else batch_size * 2
+
+        end = time.time()
+        logger.info("Compilation for decode done in %.2f s.", end - start)
+
+    def _prepare_prompt(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+    ):
+        assert len(seq_group_metadata_list) > 0
+        input_tokens: List[List[int]] = []
+        input_positions: List[List[int]] = []
+        prompt_lens: List[int] = []
+        slot_mapping: List[List[int]] = []
+
+        for seq_group_metadata in seq_group_metadata_list:
+            assert seq_group_metadata.is_prompt
+            seq_ids = list(seq_group_metadata.seq_data.keys())
+            assert len(seq_ids) == 1
+            seq_id = seq_ids[0]
+
+            seq_data = seq_group_metadata.seq_data[seq_id]
+            # Could include output tokens when a request is preempted.
+            prompt_tokens = seq_data.get_token_ids()
+            prompt_len = len(prompt_tokens)
+            prompt_lens.append(prompt_len)
+
+            input_tokens.append(prompt_tokens)
+            input_positions.append(list(range(prompt_len)))
+
+            assert seq_group_metadata.block_tables is not None
+            block_table = seq_group_metadata.block_tables[seq_id]
+            slot_mapping.append([])
+            for i in range(prompt_len):
+                block_number = block_table[i // self.block_size]
+                block_offset = i % self.block_size
+                slot = block_number * self.block_size + block_offset
+                slot_mapping[-1].append(slot)
+
+        assert len(prompt_lens) > 0
+        num_prefills = len(prompt_lens)
+        num_prefill_tokens = sum(prompt_lens)
+
+        # Add paddings to make the shape [batch_size, max_prompt_len] where
+        # max_prompt_len is smallest power of 2 that is greater than or equal
+        # to the maximum prompt length.
+        # We need the 2D input shape because the Pallas FlashAttention kernel
+        # does not support packed 1D inputs.
+        # We pad the seq_len to powers of 2 to reduce the compilation overhead.
+        max_prompt_len = _get_padded_prefill_len(max(prompt_lens))
+        input_tokens = make_tensor_with_pad(input_tokens,
+                                            max_prompt_len,
+                                            pad=0,
+                                            dtype=torch.int32,
+                                            device=self.device)
+        input_positions = make_tensor_with_pad(input_positions,
+                                               max_prompt_len,
+                                               pad=0,
+                                               dtype=torch.int32,
+                                               device=self.device)
+        slot_mapping = make_tensor_with_pad(slot_mapping,
+                                            max_prompt_len,
+                                            pad=_PAD_SLOT_ID,
+                                            dtype=torch.int64,
+                                            device=self.device)
+        prompt_lens = torch.tensor(prompt_lens,
+                                   dtype=torch.int32,
+                                   device=self.device)
+        attn_metadata = self.attn_backend.make_metadata(
+            num_prefills=num_prefills,
+            num_prefill_tokens=num_prefill_tokens,  # NOTE: This is not used.
+            num_decode_tokens=0,
+            slot_mapping=slot_mapping,
+            block_tables=None,
+            context_lens=None,
+        )
+        return input_tokens, input_positions, attn_metadata, prompt_lens
+
+    def _prepare_decode(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+    ):
+        assert len(seq_group_metadata_list) > 0
+        input_tokens: List[List[int]] = []
+        input_positions: List[List[int]] = []
+        slot_mapping: List[List[int]] = []
+        context_lens: List[int] = []
+        num_seq_groups = len(seq_group_metadata_list)
+        batch_size = _get_padded_batch_size(num_seq_groups)
+
+        for i, seq_group_metadata in enumerate(seq_group_metadata_list):
+            assert not seq_group_metadata.is_prompt
+
+            seq_ids = list(seq_group_metadata.seq_data.keys())
+
+            for seq_id in seq_ids:
+                seq_data = seq_group_metadata.seq_data[seq_id]
+                generation_token = seq_data.get_last_token_id()
+                input_tokens.append([generation_token])
+
+                seq_len = seq_data.get_len()
+                position = seq_len - 1
+                input_positions.append([position])
+                context_lens.append(seq_len)
+
+                assert seq_group_metadata.block_tables is not None
+                block_table = seq_group_metadata.block_tables[seq_id]
+                self.block_tables[i, :len(block_table)] = block_table
+
+                block_number = block_table[position // self.block_size]
+                block_offset = position % self.block_size
+                slot = block_number * self.block_size + block_offset
+                slot_mapping.append([slot])
+
+        num_paddings = batch_size - num_seq_groups
+        input_tokens = input_tokens + [[0]] * num_paddings
+        input_positions = input_positions + [[0]] * num_paddings
+        slot_mapping = slot_mapping + [[_PAD_SLOT_ID]] * num_paddings
+        context_lens = context_lens + [0] * num_paddings
+
+        input_tokens = torch.tensor(input_tokens,
+                                    dtype=torch.int32,
+                                    device=self.device)
+        input_positions = torch.tensor(input_positions,
+                                       dtype=torch.int32,
+                                       device=self.device)
+        slot_mapping = torch.tensor(slot_mapping,
+                                    dtype=torch.int64,
+                                    device=self.device)
+        context_lens = torch.tensor(context_lens,
+                                    dtype=torch.int32,
+                                    device=self.device)
+        block_tables = torch.tensor(self.block_tables[:batch_size],
+                                    dtype=torch.int32,
+                                    device=self.device)
+        input_lens = torch.tensor([1] * batch_size,
+                                  dtype=torch.int32,
+                                  device=self.device)
+        attn_metadata = self.attn_backend.make_metadata(
+            num_prefills=0,
+            num_prefill_tokens=0,
+            num_decode_tokens=batch_size,
+            slot_mapping=slot_mapping,
+            block_tables=block_tables,
+            context_lens=context_lens,
+        )
+        return input_tokens, input_positions, attn_metadata, input_lens
+
+    def _prepare_sample(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        padded_batch_size: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        assert len(seq_group_metadata_list) > 0
+        t = []
+        p = []
+        for seq_group_metadata in seq_group_metadata_list:
+            assert seq_group_metadata.sampling_params is not None
+            sampling_params = seq_group_metadata.sampling_params
+
+            t.append(sampling_params.temperature
+                     if sampling_params.temperature >= 1e-5 else 1e-5)
+            p.append(sampling_params.top_p)
+        num_paddings = padded_batch_size - len(seq_group_metadata_list)
+        t += [1.0] * num_paddings
+        p += [1.0] * num_paddings
+
+        t = torch.tensor(t, dtype=torch.float32, device=self.device)
+        p = torch.tensor(p, dtype=torch.float32, device=self.device)
+        return t, p
+
+    def prepare_inputs(
+        self,
+        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+    ):
+        assert seq_group_metadata_list is not None
+        assert len(seq_group_metadata_list) > 0
+        # NOTE: We assume that all sequences in the group are all prompts or
+        # all decodes.
+        if seq_group_metadata_list[0].is_prompt:
+            inputs = self._prepare_prompt(seq_group_metadata_list)
+        else:
+            inputs = self._prepare_decode(seq_group_metadata_list)
+        padded_batch_size = inputs[0].shape[0]
+        sample_inputs = self._prepare_sample(seq_group_metadata_list,
+                                             padded_batch_size)
+        return inputs + sample_inputs
+
+    def _execute_model(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
+    ) -> List[CompletionSequenceGroupOutput]:
+        inputs = self.prepare_inputs(seq_group_metadata_list)
+        next_token_ids = self.model(inputs[0], inputs[1], kv_caches,
+                                    *inputs[2:])
+        next_token_ids = next_token_ids.cpu().tolist()
+
+        i = 0
+        sampler_outputs = []
+        for seq_group_metadata in seq_group_metadata_list:
+            seq_outputs = []
+            seq_ids = list(seq_group_metadata.seq_data.keys())
+            for seq_id in seq_ids:
+                next_token_id = next_token_ids[i]
+                seq_outputs.append(
+                    SequenceOutput(seq_id, next_token_id,
+                                   {next_token_id: Logprob(0.0)}))
+                i += 1
+            sampler_outputs.append(
+                CompletionSequenceGroupOutput(seq_outputs, None))
+        return sampler_outputs
+
+    def execute_model(
+        self,
+        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+        kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
+    ) -> SamplerOutput:
+        assert seq_group_metadata_list is not None
+        if seq_group_metadata_list[0].is_prompt:
+            # NOTE(woosuk): To reduce the compilation time, we only compile the
+            # prefill inputs with batch size 1. Because the scheduler is not
+            # aware of this limitation, we need to handle batch size > 1
+            # internally by calling the model multiple times and concatenating
+            # the outputs.
+            # FIXME(woosuk): This is a temporary hack to not change the existing
+            # scheduler. We need to fix this in the future.
+            sampler_outputs = []
+            for seq_group_metadata in seq_group_metadata_list:
+                sampler_outputs += self._execute_model([seq_group_metadata],
+                                                       kv_caches)
+        else:
+            sampler_outputs = self._execute_model(seq_group_metadata_list,
+                                                  kv_caches)
+        return SamplerOutput(sampler_outputs)
+
+
+class ModelWrapper(nn.Module):
+
+    def __init__(self, model: nn.Module):
+        super().__init__()
+        self.model = model.eval()
+
+    def forward(
+        self,
+        token_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        kv_caches: List[Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]],
+        attn_metadata: AttentionMetadata,
+        input_lens: torch.Tensor,
+        t: torch.Tensor,
+        p: torch.Tensor,
+    ) -> torch.Tensor:
+        """Executes the forward pass of the model and samples the next token.
+
+        Args:
+            token_ids: The input token IDs of shape [batch_size, seq_len].
+            position_ids: The input position IDs of shape [batch_size, seq_len].
+            kv_caches: The key and value caches. They can be None during the
+                memory profiling at initialization.
+            attn_metadata: The Pallas attention metadata.
+            input_lens: The actual input lengths of shape [batch_size].
+            t: The sampling temperature of shape [batch_size].
+            p: The top-p probability of shape [batch_size].
+        """
+        batch_size, seq_len = token_ids.shape
+        # Calculate the positions to sample from.
+        base_indicies = torch.arange(
+            batch_size, dtype=torch.int32, device=input_lens.device) * seq_len
+        logits_indices = base_indicies + input_lens - 1
+
+        # FIXME(woosuk): This is a temporary hack to avoid using the existing
+        # sampler and sampling metadata.
+        sampling_metadata = SamplingMetadata(
+            seq_groups=[],
+            selected_token_indices=logits_indices,
+            categorized_sample_indices={},
+            num_prompts=attn_metadata.num_prefills,
+        )
+
+        # Skip this in memory profiling at initialization.
+        if kv_caches[0][0] is not None:
+            # index_copy_(slot_mapping) only works when the inserted dimension
+            # is 0. However, the KV cache in the Pallas backend has the shape
+            # [num_kv_heads, num_blocks, block_size, head_size]. To make it
+            # work, we need to flatten the first three dimensions and modify
+            # the slot_mapping accordingly.
+            num_kv_heads, num_blocks, block_size, _ = kv_caches[0][0].shape
+            slot_mapping = attn_metadata.slot_mapping
+            slot_mapping = slot_mapping.flatten()
+            head_indicies = torch.arange(0,
+                                         num_kv_heads,
+                                         device=slot_mapping.device,
+                                         dtype=slot_mapping.dtype)
+            head_indicies *= block_size * num_blocks
+            slot_mapping = slot_mapping.repeat_interleave(num_kv_heads).view(
+                -1, num_kv_heads)
+            slot_mapping = slot_mapping + head_indicies.view(1, -1)
+            slot_mapping = slot_mapping.flatten()
+            attn_metadata.slot_mapping = slot_mapping
+
+        hidden_states = self.model(
+            token_ids,
+            position_ids,
+            kv_caches,
+            attn_metadata,
+        )
+        hidden_states = hidden_states.flatten(0, 1)
+        logits = self.model.compute_logits(hidden_states, sampling_metadata)
+
+        logits = logits / t.unsqueeze(dim=1)
+        # FIXME(woosuk): Disabled top-p sampling since it's too slow.
+        # logits = _apply_top_p(logits, p.unsqueeze(dim=1))
+        probs = torch.softmax(logits, dim=-1, dtype=torch.float32)
+        # FIXME(woosuk): best_of > 1 is not supported.
+        next_token_ids = torch.multinomial(probs, num_samples=1).squeeze(dim=1)
+        return next_token_ids
+
+
+def _get_padded_prefill_len(x: int) -> int:
+    # NOTE(woosuk): The pallas FlashAttention kernel requires the sequence
+    # length to be a multiple of 16. We pad the prompt length to the nearest
+    # multiple of 16. This is also good for performance.
+    if x <= 16:
+        return 16
+    return 1 << (x - 1).bit_length()
+
+
+def _get_padded_batch_size(batch_size: int) -> int:
+    if batch_size <= 2:
+        return batch_size
+    elif batch_size <= 4:
+        return 4
+    elif batch_size <= 8:
+        return 8
+    else:
+        return ((batch_size + 15) // 16) * 16
+
+
+def _apply_top_p(logits: torch.Tensor, p: torch.Tensor) -> torch.Tensor:
+    logits_sorted = torch.sort(logits, dim=-1, descending=True).values
+    sorted_cum_probs = torch.cumsum(logits_sorted.softmax(dim=-1), dim=-1)
+    cutoff_index = torch.sum(sorted_cum_probs < p, dim=-1, keepdim=True)
+    cutoff_logit = torch.gather(logits_sorted, -1, cutoff_index)
+    logits = logits.masked_fill_(logits < cutoff_logit, -float("inf"))
+    return logits
diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
new file mode 100644
index 000000000..04576015d
--- /dev/null
+++ b/vllm/worker/tpu_worker.py
@@ -0,0 +1,198 @@
+import os
+from typing import List, Optional, Tuple
+
+import torch
+import torch_xla.core.xla_model as xm
+import torch_xla.runtime as xr
+
+import vllm.envs as envs
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig, VisionLanguageConfig)
+from vllm.distributed import (ensure_model_parallel_initialized,
+                              init_distributed_environment)
+from vllm.logger import init_logger
+from vllm.model_executor import set_random_seed
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size
+from vllm.worker.tpu_model_runner import TPUModelRunner
+from vllm.worker.worker_base import LoraNotSupportedWorkerBase
+
+logger = init_logger(__name__)
+
+
+class TPUWorker(LoraNotSupportedWorkerBase):
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        cache_config: CacheConfig,
+        load_config: LoadConfig,
+        vision_language_config: Optional[VisionLanguageConfig],
+        local_rank: int,
+        rank: int,
+        distributed_init_method: str,
+    ) -> None:
+        self.model_config = model_config
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+        self.device_config = device_config
+        self.cache_config = cache_config
+        self.load_config = load_config
+        self.vision_language_config = vision_language_config
+        self.local_rank = local_rank
+        self.rank = rank
+        self.distributed_init_method = distributed_init_method
+
+        assert self.device_config.device_type == "tpu"
+        if self.cache_config.cache_dtype == "auto":
+            self.cache_dtype = self.model_config.dtype
+        else:
+            self.cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
+                self.cache_config.cache_dtype]
+
+        self.model_runner = TPUModelRunner(model_config, parallel_config,
+                                           scheduler_config, device_config,
+                                           cache_config, load_config,
+                                           vision_language_config)
+
+    def init_device(self) -> None:
+        os.environ["PJRT_DEVICE"] = "TPU"
+        self.device = xm.xla_device()
+        self.device_config.device = self.device
+        torch.set_grad_enabled(False)
+        torch.set_default_dtype(self.model_config.dtype)
+
+        # NOTE(woosuk): This is just a hack to initialize the TP group.
+        # This cannot perform the actual communication ops.
+        init_distributed_environment(
+            world_size=self.parallel_config.world_size,
+            rank=self.rank,
+            local_rank=self.local_rank,
+            distributed_init_method=self.distributed_init_method,
+            backend="gloo",
+        )
+        ensure_model_parallel_initialized(
+            self.parallel_config.tensor_parallel_size,
+            self.parallel_config.pipeline_parallel_size)
+
+        # Set random seed.
+        set_random_seed(self.model_config.seed)
+        xm.set_rng_state(self.model_config.seed, self.device)
+
+        # Increase the cache size limit, which is the maximum number of
+        # dynamo graphs that can be compiled.
+        # NOTE(woosuk): Usually, we compile 10-15 graphs for prefill and
+        # 30-40 graphs for decode. 128 is an arbitrary safe number.
+        torch._dynamo.config.cache_size_limit = 128
+        # Use persistent cache to avoid XLA recompilation.
+        # NOTE(woosuk): This does not completely eliminate the recompilation
+        # overhead because dynamo does not cache the compiled results.
+        xr.initialize_cache(os.path.expanduser(envs.VLLM_XLA_CACHE_PATH),
+                            readonly=False)
+
+    def load_model(self):
+        self.model_runner.load_model()
+
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        num_layers = self.model_config.get_num_layers(self.parallel_config)
+        head_size = self.model_config.get_head_size()
+        num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config)
+
+        kv_caches = [(None, None) for _ in range(num_layers)]
+        self.model_runner._dummy_run(
+            batch_size=1,
+            seq_len=self.scheduler_config.max_num_batched_tokens,
+            kv_caches=kv_caches,
+            is_prompt=True,
+        )
+        # Synchronize before measuring the memory usage.
+        xm.wait_device_ops()
+
+        m = xm.get_memory_info(self.device)
+        program_size = 1024 * 1024 * 1024  # 1GB
+        free_bytes = max(m["bytes_limit"] - m["bytes_used"] - program_size, 0)
+        kv_cache_bytes = int(free_bytes *
+                             self.cache_config.gpu_memory_utilization)
+        kv_cache_dtype_btyes = get_dtype_size(self.cache_dtype)
+        block_size = self.cache_config.block_size
+        num_tpu_blocks = (kv_cache_bytes //
+                          (kv_cache_dtype_btyes * block_size * num_layers * 2 *
+                           head_size * num_kv_heads))
+        num_tpu_blocks = (num_tpu_blocks // 8) * 8  # Round down to 8.
+        return num_tpu_blocks, 0
+
+    def initialize_cache(
+        self,
+        num_gpu_blocks: int,
+        num_cpu_blocks: int,
+    ) -> None:
+        self.cache_config.num_gpu_blocks = num_gpu_blocks
+        self.cache_config.num_cpu_blocks = num_cpu_blocks
+        self.block_size = self.cache_config.block_size
+
+        dtype = self.cache_dtype
+        num_layers = self.model_config.get_num_layers(self.parallel_config)
+        num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config)
+        head_size = self.model_config.get_head_size()
+
+        self.tpu_cache = []
+        tpu_cache_shape = self.model_runner.attn_backend.get_kv_cache_shape(
+            num_gpu_blocks, self.block_size, num_kv_heads, head_size)
+        for _ in range(num_layers):
+            key_cache = torch.zeros(tpu_cache_shape,
+                                    dtype=dtype,
+                                    device=self.device)
+            value_cache = torch.zeros_like(key_cache)
+            self.tpu_cache.append((key_cache, value_cache))
+        self._warmup_model()
+
+    def _warmup_model(self) -> None:
+        # FIXME(woosuk): Here we are abusing `enforce_eager` which is defined
+        # for CUDA graphs. We should refactor this part.
+        if not self.model_config.enforce_eager:
+            # Warm up the model with all possible input shapes so that
+            # compilation never happens during the actual execution.
+            # This may take ~30 mins for the first run and ~20 mins for the
+            # subsequent runs.
+            # If `enforce_eager` is True, the ahead-of-time compilation is
+            # skipped and the compilation happens during the actual execution,
+            # which is bad for performance but useful for development.
+            self.model_runner.warmup_model(self.tpu_cache)
+
+    def get_cache_block_size_bytes(self) -> int:
+        head_size = self.model_config.get_head_size()
+        num_heads = self.model_config.get_num_kv_heads(self.parallel_config)
+        num_layers = self.model_config.get_num_layers(self.parallel_config)
+
+        key_cache_block = self.cache_config.block_size * num_heads * head_size
+        value_cache_block = key_cache_block
+        total = num_layers * (key_cache_block + value_cache_block)
+        dtype_size = get_dtype_size(self.cache_dtype)
+        return dtype_size * total
+
+    def execute_model(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None
+    ) -> List[SamplerOutput]:
+        if execute_model_req is None:
+            return []
+
+        seq_group_metadata_list = execute_model_req.seq_group_metadata_list
+        num_seq_groups = len(seq_group_metadata_list)
+        if num_seq_groups == 0:
+            return []
+
+        # Currently, TPUWorker does not support swapping.
+        # TODO(woosuk): Support block copying.
+        assert len(execute_model_req.blocks_to_swap_in) == 0, (
+            "Swapping is not supported for the TPU backend.")
+        assert len(execute_model_req.blocks_to_swap_out) == 0, (
+            "Swapping is not supported for the TPU backend.")
+        assert len(execute_model_req.blocks_to_copy) == 0
+
+        output = self.model_runner.execute_model(seq_group_metadata_list,
+                                                 self.tpu_cache)
+        return [output]
-- 
GitLab


From c3c2903e72c6e85a81ff6de8b879f4c82e8ad364 Mon Sep 17 00:00:00 2001
From: "Li, Jiang" <jiang1.li@intel.com>
Date: Thu, 13 Jun 2024 03:58:53 +0800
Subject: [PATCH 005/376] [Bugfix] Add device assertion to TorchSDPA (#5402)

---
 vllm/attention/selector.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 3f0e29c73..8b07fb2d7 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -58,6 +58,9 @@ def get_attn_backend(
             ROCmFlashAttentionBackend)
         return ROCmFlashAttentionBackend
     elif backend == _Backend.TORCH_SDPA:
+        # TODO: make XPU backend available here.
+        assert is_cpu(), RuntimeError(
+            "Torch SDPA backend is only used for the CPU device.")
         logger.info("Using Torch SDPA backend.")
         from vllm.attention.backends.torch_sdpa import TorchSDPABackend
         return TorchSDPABackend
-- 
GitLab


From 8b82a89997826af8e0e4ecfaaed60f3b28b1baed Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Wed, 12 Jun 2024 14:00:18 -0700
Subject: [PATCH 006/376] [ci] Add AMD, Neuron, Intel tests for AWS CI and turn
 off default soft fail for GPU tests (#5464)

Signed-off-by: kevin <kevin@anyscale.com>
---
 .buildkite/test-template-aws.j2 | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2
index 3b5d36b24..645747ddd 100644
--- a/.buildkite/test-template-aws.j2
+++ b/.buildkite/test-template-aws.j2
@@ -19,6 +19,34 @@ steps:
           limit: 5
   - wait
 
+  - group: "AMD Tests"
+    depends_on: ~
+    steps:
+    {% for step in steps %}
+    {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
+      - label: "AMD: {{ step.label }}"
+        agents:
+          queue: amd
+        command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe  }} ; {{ step.command  or (step.commands | join(" ; ")) | safe }}"
+        env:
+          DOCKER_BUILDKIT: "1"
+        soft_fail: true
+    {% endif %}
+    {% endfor %}
+
+  - label: "Neuron Test"
+    depends_on: ~
+    agents:
+      queue: neuron
+    command: bash .buildkite/run-neuron-test.sh
+    soft_fail: false
+
+  - label: "Intel Test"
+    depends_on: ~
+    agents:
+      queue: intel
+    command: bash .buildkite/run-cpu-test.sh
+
   {% for step in steps %}
   - label: "{{ step.label }}"
     agents:
@@ -31,7 +59,7 @@ steps:
       {% else %}
       queue: gpu_1_queue
       {% endif %}
-    soft_fail: true
+    soft_fail: {{ step.soft_fail or false }}
     {% if step.parallelism %}
     parallelism: {{ step.parallelism }}
     {% endif %}
-- 
GitLab


From 5985e3427dc4a10b8483fd08013fa8df563f04fb Mon Sep 17 00:00:00 2001
From: Cody Yu <hao.yu.cody@gmail.com>
Date: Wed, 12 Jun 2024 14:07:26 -0700
Subject: [PATCH 007/376] [Kernel] Vectorized FP8 quantize kernel (#5396)

Inspired by #5146, this PR improves FP8 quantize kernel by vectorizing data transfer to better utilize memory bandwidth. Microbenchmark shows that this improved kernel can achieve 1.0x-1.5x speedup (especially when hidden size is large).

In details, we applied 3 optimizations:

- Use inverted scale so that most divisions are changed to multiplications.
- Unroll the loop by 4 times to improve ILP.
- Use vectorized 4 to transfer data between HBM and SRAM.
---
 csrc/quantization/fp8/common.cu | 53 +++++++++++++++++++++++++++++----
 tests/quantization/test_fp8.py  | 47 +++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+), 6 deletions(-)

diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu
index 8c5b693bf..6120086d7 100644
--- a/csrc/quantization/fp8/common.cu
+++ b/csrc/quantization/fp8/common.cu
@@ -23,8 +23,8 @@ __device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
 
 template <typename scalar_t>
 __device__ __forceinline__ c10::Float8_e4m3fn scaled_fp8_conversion(
-    const scalar_t val, const float scale) {
-  float x = static_cast<float>(val) / scale;
+    const scalar_t val, const float inverted_scale) {
+  float x = static_cast<float>(val) * inverted_scale;
   float r = fmax(-FP8_E4M3_MAX, fmin(x, FP8_E4M3_MAX));
   return static_cast<c10::Float8_e4m3fn>(r);
 }
@@ -71,15 +71,56 @@ __global__ void segmented_max_reduction(float* __restrict__ scale,
   }
 }
 
+template <typename scalar_t>
+struct __align__(8) vec4_t {
+  scalar_t x;
+  scalar_t y;
+  scalar_t z;
+  scalar_t w;
+};
+
+typedef struct __align__(4) {
+  c10::Float8_e4m3fn x;
+  c10::Float8_e4m3fn y;
+  c10::Float8_e4m3fn z;
+  c10::Float8_e4m3fn w;
+}
+float8x4_t;
+
 template <typename scalar_t>
 __global__ void scaled_fp8_quant_kernel(c10::Float8_e4m3fn* __restrict__ out,
                                         const scalar_t* __restrict__ input,
                                         const float* __restrict__ scale,
                                         int64_t num_elems) {
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-  while (i < num_elems) {
-    out[i] = scaled_fp8_conversion(input[i], *scale);
-    i += blockDim.x * gridDim.x;
+  int tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+  // Invert the scale so that we can use multiplications to avoid expensive
+  // division.
+  const float inverted_scale = 1.0f / (*scale);
+
+  // Vectorized input/output to better utilize memory bandwidth.
+  const vec4_t<scalar_t>* vectorized_in =
+      reinterpret_cast<const vec4_t<scalar_t>*>(input);
+  float8x4_t* vectorized_out = reinterpret_cast<float8x4_t*>(out);
+
+  int num_vec_elems = num_elems >> 2;
+
+#pragma unroll 4
+  for (int i = tid; i < num_vec_elems; i += blockDim.x * gridDim.x) {
+    vec4_t<scalar_t> in_vec = vectorized_in[i];
+    float8x4_t out_vec;
+
+    out_vec.x = scaled_fp8_conversion(in_vec.x, inverted_scale);
+    out_vec.y = scaled_fp8_conversion(in_vec.y, inverted_scale);
+    out_vec.z = scaled_fp8_conversion(in_vec.z, inverted_scale);
+    out_vec.w = scaled_fp8_conversion(in_vec.w, inverted_scale);
+    vectorized_out[i] = out_vec;
+  }
+
+  // Handle the remaining elements if num_elems is not divisible by 4
+  for (int i = num_vec_elems * 4 + tid; i < num_elems;
+       i += blockDim.x * gridDim.x) {
+    out[i] = scaled_fp8_conversion(input[i], inverted_scale);
   }
 }
 
diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index fccce7f7b..7cb65326c 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -5,6 +5,7 @@ Run `pytest tests/quantization/test_fp8.py --forked`.
 import pytest
 import torch
 
+from vllm._custom_ops import scaled_fp8_quant
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
 
@@ -22,3 +23,49 @@ def test_load_fp16_model(vllm_runner) -> None:
         fc1 = model.model.decoder.layers[0].fc1
         assert isinstance(fc1.quant_method, Fp8LinearMethod)
         assert fc1.weight.dtype == torch.float8_e4m3fn
+
+
+@pytest.mark.skipif(
+    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
+    reason="FP8 is not supported on this GPU type.")
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+def test_scaled_fp8_quant(dtype) -> None:
+
+    def quantize_ref(tensor, inv_scale):
+        # The reference implementation that fully aligns to
+        # the kernel being tested.
+        finfo = torch.finfo(torch.float8_e4m3fn)
+        scale = inv_scale.reciprocal()
+        qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min,
+                                                           max=finfo.max)
+        qweight = qweight.to(torch.float8_e4m3fn)
+        return qweight
+
+    def per_tensor_dequantize(tensor, inv_scale, dtype):
+        fake_qweight = tensor.to(dtype)
+        dq_weight = fake_qweight * inv_scale
+        return dq_weight
+
+    # Note that we use a shape % 4 != 0 to cover edge cases,
+    # because scaled_fp8_quant is vectorized by 4.
+    x = (torch.randn(size=(11, 11), device="cuda") * 13).to(dtype)
+
+    # Dynamic quantization
+    ref_y, inv_scale = scaled_fp8_quant(x, None)
+    ref_y = per_tensor_dequantize(ref_y, inv_scale, dtype)
+
+    # Reference dynamic quantizaton
+    y = quantize_ref(x, inv_scale)
+    assert torch.allclose(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
+
+    # Static quantization
+    y, _ = scaled_fp8_quant(x, inv_scale)
+    assert torch.allclose(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
+
+    # Padding
+    y, _ = scaled_fp8_quant(x, inv_scale, batch_dim_padding=17)
+    assert y.shape[0] == 17
+    assert torch.allclose(
+        ref_y,
+        per_tensor_dequantize(torch.narrow(y, 0, 0, x.shape[0]), inv_scale,
+                              dtype))
-- 
GitLab


From 5cc50a531f720758025c8493ee85a56272277a54 Mon Sep 17 00:00:00 2001
From: Arthur Kim <kimdwkimdw@gmail.com>
Date: Thu, 13 Jun 2024 06:08:52 +0900
Subject: [PATCH 008/376] [Bugfix] TYPE_CHECKING for MultiModalData (#5444)

---
 vllm/inputs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/inputs.py b/vllm/inputs.py
index 85c9cd84f..026903e19 100644
--- a/vllm/inputs.py
+++ b/vllm/inputs.py
@@ -4,7 +4,7 @@ from typing import (TYPE_CHECKING, List, Literal, Optional, Sequence,
 from typing_extensions import NotRequired
 
 if TYPE_CHECKING:
-    from vllm.sequence import MultiModalData
+    from vllm.multimodal import MultiModalData
 
 
 class ParsedText(TypedDict):
-- 
GitLab


From 51602eefd38250325e541abd28f051ffd7676c3f Mon Sep 17 00:00:00 2001
From: Travis Johnson <tsjohnso@us.ibm.com>
Date: Wed, 12 Jun 2024 15:13:52 -0600
Subject: [PATCH 009/376]  [Frontend] [Core] Support for sharded tensorized
 models (#4990)

Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: Sanger Steel <sangersteel@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
---
 examples/tensorize_vllm_model.py              | 125 +++++++++---------
 tests/tensorizer_loader/test_tensorizer.py    |  99 ++++++++++++--
 vllm/model_executor/model_loader/loader.py    |  18 ++-
 .../model_executor/model_loader/tensorizer.py | 107 ++++++++++-----
 vllm/worker/model_runner.py                   |  11 ++
 vllm/worker/worker.py                         |   8 ++
 6 files changed, 261 insertions(+), 107 deletions(-)

diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py
index 8b74ae1d7..f9ed5fe08 100644
--- a/examples/tensorize_vllm_model.py
+++ b/examples/tensorize_vllm_model.py
@@ -3,18 +3,12 @@ import dataclasses
 import json
 import os
 import uuid
-from functools import partial
-
-from tensorizer import stream_io
 
 from vllm import LLM
-from vllm.distributed import (init_distributed_environment,
-                              initialize_model_parallel)
 from vllm.engine.arg_utils import EngineArgs
-from vllm.engine.llm_engine import LLMEngine
 from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
                                                          TensorizerConfig,
-                                                         serialize_vllm_model)
+                                                         tensorize_vllm_model)
 
 # yapf conflicts with isort for this docstring
 # yapf: disable
@@ -61,6 +55,12 @@ Which downloads the model tensors from your S3 bucket and deserializes them.
 You can also provide a `--keyfile` argument to decrypt the model weights if 
 they were serialized with encryption.
 
+To support distributed tensor-parallel models, each model shard will be
+serialized to a separate file. The tensorizer_uri is then specified as a string
+template with a format specifier such as '%03d' that will be rendered with the
+shard's rank. Sharded models serialized with this script will be named as
+model-rank-%03d.tensors
+
 For more information on the available arguments for serializing, run 
 `python -m examples.tensorize_vllm_model serialize --help`.
 
@@ -168,77 +168,72 @@ def parse_args():
 def deserialize():
     llm = LLM(model=args.model,
               load_format="tensorizer",
+              tensor_parallel_size=args.tensor_parallel_size,
               model_loader_extra_config=tensorizer_config
     )
     return llm
 
 
+if __name__ == '__main__':
+    args = parse_args()
 
-args = parse_args()
-
-s3_access_key_id = (getattr(args, 's3_access_key_id', None)
-                    or os.environ.get("S3_ACCESS_KEY_ID", None))
-s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
-                        or os.environ.get("S3_SECRET_ACCESS_KEY", None))
-s3_endpoint = (getattr(args, 's3_endpoint', None)
-               or os.environ.get("S3_ENDPOINT_URL", None))
-
-credentials = {
-    "s3_access_key_id": s3_access_key_id,
-    "s3_secret_access_key": s3_secret_access_key,
-    "s3_endpoint": s3_endpoint
-}
+    s3_access_key_id = (getattr(args, 's3_access_key_id', None)
+                        or os.environ.get("S3_ACCESS_KEY_ID", None))
+    s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
+                            or os.environ.get("S3_SECRET_ACCESS_KEY", None))
+    s3_endpoint = (getattr(args, 's3_endpoint', None)
+                or os.environ.get("S3_ENDPOINT_URL", None))
 
-_read_stream, _write_stream = (partial(
-    stream_io.open_stream,
-    mode=mode,
-    s3_access_key_id=s3_access_key_id,
-    s3_secret_access_key=s3_secret_access_key,
-    s3_endpoint=s3_endpoint,
-) for mode in ("rb", "wb+"))
+    credentials = {
+        "s3_access_key_id": s3_access_key_id,
+        "s3_secret_access_key": s3_secret_access_key,
+        "s3_endpoint": s3_endpoint
+    }
 
-model_ref = args.model
+    model_ref = args.model
 
-model_name = model_ref.split("/")[1]
+    model_name = model_ref.split("/")[1]
 
-os.environ["MASTER_ADDR"] = "127.0.0.1"
-os.environ["MASTER_PORT"] = "8080"
+    keyfile = args.keyfile if args.keyfile else None
 
-init_distributed_environment(world_size=1, rank=0, local_rank=0)
-initialize_model_parallel()
+    if args.model_loader_extra_config:
+        config = json.loads(args.model_loader_extra_config)
+        tensorizer_args = \
+            TensorizerConfig(**config)._construct_tensorizer_args()
+        tensorizer_args.tensorizer_uri = args.path_to_tensors
+    else:
+        tensorizer_args = None
 
-keyfile = args.keyfile if args.keyfile else None
+    if args.command == "serialize":
+        eng_args_dict = {f.name: getattr(args, f.name) for f in
+                        dataclasses.fields(EngineArgs)}
 
+        engine_args = EngineArgs.from_cli_args(
+            argparse.Namespace(**eng_args_dict)
+        )
 
-if args.model_loader_extra_config:
-    config = json.loads(args.model_loader_extra_config)
-    tensorizer_args = TensorizerConfig(**config)._construct_tensorizer_args()
-    tensorizer_args.tensorizer_uri = args.path_to_tensors
-else:
-    tensorizer_args = None
-
-if args.command == "serialize":
-    eng_args_dict = {f.name: getattr(args, f.name) for f in
-                     dataclasses.fields(EngineArgs)}
-
-    engine_args = EngineArgs.from_cli_args(argparse.Namespace(**eng_args_dict))
-    engine = LLMEngine.from_engine_args(engine_args)
+        input_dir = args.serialized_directory.rstrip('/')
+        suffix = args.suffix if args.suffix else uuid.uuid4().hex
+        base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
+        if engine_args.tensor_parallel_size > 1:
+            model_path = f"{base_path}/model-rank-%03d.tensors"
+        else:
+            model_path = f"{base_path}/model.tensors"
 
-    input_dir = args.serialized_directory.rstrip('/')
-    suffix = args.suffix if args.suffix else uuid.uuid4().hex
-    base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
-    model_path = f"{base_path}/model.tensors"
-    tensorizer_config = TensorizerConfig(
-        tensorizer_uri=model_path,
-        **credentials)
-    serialize_vllm_model(engine, tensorizer_config, keyfile)
-elif args.command == "deserialize":
-    if not tensorizer_args:
         tensorizer_config = TensorizerConfig(
-            tensorizer_uri=args.path_to_tensors,
-            encryption_keyfile = keyfile,
-            **credentials
-        )
-    deserialize()
-else:
-    raise ValueError("Either serialize or deserialize must be specified.")
+            tensorizer_uri=model_path,
+            encryption_keyfile=keyfile,
+            **credentials)
+
+        tensorize_vllm_model(engine_args, tensorizer_config)
+
+    elif args.command == "deserialize":
+        if not tensorizer_args:
+            tensorizer_config = TensorizerConfig(
+                tensorizer_uri=args.path_to_tensors,
+                encryption_keyfile = keyfile,
+                **credentials
+            )
+        deserialize()
+    else:
+        raise ValueError("Either serialize or deserialize must be specified.")
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index 3f2017452..9656cf5f4 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -1,21 +1,27 @@
 import json
 import os
+import pathlib
 import subprocess
 from unittest.mock import MagicMock, patch
 
 import openai
 import pytest
 import ray
+import torch
+from tensorizer import EncryptionParams
 
 from vllm import SamplingParams
+from vllm.engine.arg_utils import EngineArgs
 # yapf: disable
 from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
                                                          TensorSerializer,
                                                          is_vllm_tensorized,
                                                          load_with_tensorizer,
                                                          open_stream,
-                                                         serialize_vllm_model)
+                                                         serialize_vllm_model,
+                                                         tensorize_vllm_model)
 
+from ..conftest import VllmRunner, cleanup
 from ..utils import ServerRunner
 
 # yapf conflicts with isort for this docstring
@@ -42,6 +48,20 @@ def is_curl_installed():
     except (subprocess.CalledProcessError, FileNotFoundError):
         return False
 
+def get_torch_model(vllm_runner: VllmRunner):
+    return vllm_runner \
+            .model \
+            .llm_engine \
+            .model_executor \
+            .driver_worker \
+            .model_runner \
+            .model
+
+def write_keyfile(keyfile_path: str):
+    encryption_params = EncryptionParams.random()
+    pathlib.Path(keyfile_path).parent.mkdir(parents=True, exist_ok=True)
+    with open(keyfile_path, 'wb') as f:
+        f.write(encryption_params.key)
 
 @pytest.fixture(autouse=True)
 def tensorizer_config():
@@ -88,12 +108,17 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
     with vllm_runner(model_ref) as vllm_model:
         model_path = tmp_path / (model_ref + ".tensors")
         key_path = tmp_path / (model_ref + ".key")
+        write_keyfile(key_path)
+
         outputs = vllm_model.generate(prompts, sampling_params)
 
-        config_for_serializing = TensorizerConfig(tensorizer_uri=model_path)
-        serialize_vllm_model(vllm_model.model.llm_engine,
-                            config_for_serializing,
-                            encryption_key_path=key_path)
+        config_for_serializing = TensorizerConfig(
+            tensorizer_uri=model_path,
+            encryption_keyfile=key_path
+        )
+        serialize_vllm_model(get_torch_model(vllm_model),
+                            config_for_serializing)
+
 
     config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
                                                 encryption_keyfile=key_path)
@@ -145,7 +170,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
     with vllm_runner(model_ref, ) as vllm_model:
         model_path = tmp_path / (model_ref + ".tensors")
 
-        serialize_vllm_model(vllm_model.model.llm_engine,
+        serialize_vllm_model(get_torch_model(vllm_model),
                             TensorizerConfig(tensorizer_uri=model_path))
 
     with vllm_runner(
@@ -180,7 +205,7 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
     with vllm_runner(model_ref, ) as vllm_model:
         model_path = tmp_path / (model_ref + ".tensors")
 
-        serialize_vllm_model(vllm_model.model.llm_engine,
+        serialize_vllm_model(get_torch_model(vllm_model),
                             TensorizerConfig(tensorizer_uri=model_path))
 
         model_loader_extra_config = {
@@ -224,7 +249,9 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner):
             model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
 
 
-def test_tensorizer_with_tp(vllm_runner):
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Requires 2 GPUs")
+def test_tensorizer_with_tp_path_without_template(vllm_runner):
     with pytest.raises(ValueError):
         model_ref = "EleutherAI/pythia-1.4b"
         tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
@@ -238,8 +265,62 @@ def test_tensorizer_with_tp(vllm_runner):
                 s3_endpoint="object.ord1.coreweave.com",
             ),
             tensor_parallel_size=2,
+            disable_custom_all_reduce=True,
         )
 
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Requires 2 GPUs")
+def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
+                                                                    tmp_path):
+    model_ref = "EleutherAI/pythia-1.4b"
+    # record outputs from un-sharded un-tensorized model
+    base_model = vllm_runner(
+        model_ref,
+        disable_custom_all_reduce=True,
+        enforce_eager=True,
+    )
+    outputs = base_model.generate(prompts, sampling_params)
+
+    base_model.model.llm_engine.model_executor.shutdown()
+    del base_model
+    cleanup()
+    ray.shutdown()
+
+    # load model with two shards and serialize with encryption
+    model_path = str(tmp_path / (model_ref + "-%02d.tensors"))
+    key_path = tmp_path / (model_ref + ".key")
+
+    tensorizer_config = TensorizerConfig(
+        tensorizer_uri=model_path,
+        encryption_keyfile=key_path,
+    )
+
+    tensorize_vllm_model(
+        engine_args=EngineArgs(
+                model=model_ref,
+                tensor_parallel_size=2,
+                disable_custom_all_reduce=True,
+                enforce_eager=True,
+            ),
+        tensorizer_config=tensorizer_config,
+    )
+    assert os.path.isfile(model_path % 0), "Serialization subprocess failed"
+    assert os.path.isfile(model_path % 1), "Serialization subprocess failed"
+    cleanup()
+    ray.shutdown()
+
+    loaded_vllm_model = vllm_runner(
+        model_ref,
+        tensor_parallel_size=2,
+        load_format="tensorizer",
+        disable_custom_all_reduce=True,
+        enforce_eager=True,
+        model_loader_extra_config=tensorizer_config)
+
+    deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
+
+    assert outputs == deserialized_outputs
+
 
 def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
     model_ref = "facebook/opt-125m"
@@ -248,7 +329,7 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
 
     with vllm_runner(model_ref) as vllm_model:
         outputs = vllm_model.generate(prompts, sampling_params)
-        serialize_vllm_model(vllm_model.model.llm_engine, config)
+        serialize_vllm_model(get_torch_model(vllm_model), config)
 
         assert is_vllm_tensorized(config)
 
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index f4c3dcbac..06de2fcc1 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -24,7 +24,7 @@ from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.model_loader.tensorizer import (
     TensorizerConfig, is_vllm_tensorized, load_with_tensorizer,
-    tensorizer_weights_iterator)
+    serialize_vllm_model, tensorizer_weights_iterator)
 from vllm.model_executor.model_loader.utils import (get_model_architecture,
                                                     set_default_torch_dtype)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -392,6 +392,12 @@ class TensorizerLoader(BaseModelLoader):
                    cache_config: CacheConfig) -> nn.Module:
         self._verify_config(model_config, parallel_config)
 
+        if parallel_config.tensor_parallel_size > 1:
+            from vllm.distributed import get_tensor_model_parallel_rank
+            self.tensorizer_config.tensorizer_uri = \
+                self.tensorizer_config.tensorizer_uri \
+                    % get_tensor_model_parallel_rank()
+
         if is_vllm_tensorized(self.tensorizer_config):
             return self._load_model_serialized(model_config, device_config,
                                                lora_config,
@@ -402,6 +408,16 @@ class TensorizerLoader(BaseModelLoader):
                                                vision_language_config,
                                                cache_config)
 
+    @staticmethod
+    def save_model(
+        model: torch.nn.Module,
+        tensorizer_config: TensorizerConfig,
+    ) -> None:
+        serialize_vllm_model(
+            model=model,
+            tensorizer_config=tensorizer_config,
+        )
+
 
 class ShardedStateLoader(BaseModelLoader):
     """
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 2cf4ce5f8..d79fedaea 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -2,11 +2,11 @@ import argparse
 import dataclasses
 import io
 import os
+import re
 import time
-import typing
 from dataclasses import dataclass
 from functools import partial
-from typing import Generator, Optional, Tuple, Type, Union
+from typing import BinaryIO, Generator, Optional, Tuple, Type, Union
 
 import torch
 from torch import nn
@@ -14,6 +14,7 @@ from transformers import PretrainedConfig
 
 import vllm.envs as envs
 from vllm.config import ModelConfig, ParallelConfig
+from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.base_config import (
@@ -48,8 +49,7 @@ logger = init_logger(__name__)
 
 @dataclass
 class TensorizerConfig:
-    tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, typing.BinaryIO,
-                          str, bytes, os.PathLike, int]
+    tensorizer_uri: str
     vllm_tensorized: Optional[bool] = False
     verify_hash: Optional[bool] = False
     num_readers: Optional[int] = None
@@ -60,6 +60,12 @@ class TensorizerConfig:
     model_class: Optional[Type[torch.nn.Module]] = None
     hf_config: Optional[PretrainedConfig] = None
     dtype: Optional[Union[str, torch.dtype]] = None
+    _is_sharded: bool = False
+
+    def __post_init__(self):
+        # check if the configuration is for a sharded vLLM model
+        self._is_sharded = isinstance(self.tensorizer_uri, str) \
+            and re.search(r'%0\dd', self.tensorizer_uri) is not None
 
     def _construct_tensorizer_args(self) -> "TensorizerArgs":
         tensorizer_args = {
@@ -78,13 +84,12 @@ class TensorizerConfig:
         self,
         parallel_config: "ParallelConfig",
     ) -> None:
-        if (parallel_config.tensor_parallel_size > 1
-                and self.tensorizer_uri is not None):
+        if parallel_config.tensor_parallel_size > 1 \
+            and not self._is_sharded:
             raise ValueError(
-                "Loading to multiple GPUs is not currently supported with "
-                "vLLM-serialized models. Please set tensor_parallel_size=1."
-                " or use a non-vLLM-serialized model, such as a "
-                "serialized Hugging Face `PretrainedModel`.")
+                "For a sharded model, tensorizer_uri should include a"
+                " string format template like '%04d' to be formatted"
+                " with the rank of the shard")
 
     def verify_with_model_config(self, model_config: "ModelConfig") -> None:
         if (model_config.quantization is not None
@@ -102,8 +107,8 @@ def load_with_tensorizer(tensorizer_config: TensorizerConfig,
 
 @dataclass
 class TensorizerArgs:
-    tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, typing.BinaryIO,
-                          str, bytes, os.PathLike, int]
+    tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, BinaryIO, str,
+                          bytes, os.PathLike, int]
     vllm_tensorized: Optional[bool] = False
     verify_hash: Optional[bool] = False
     num_readers: Optional[int] = None
@@ -332,6 +337,7 @@ class TensorizerAgent:
         ) as stream, TensorDeserializer(
                 stream,
                 dtype=self.tensorizer_config.dtype,
+                device=f'cuda:{torch.cuda.current_device()}',
                 **self.tensorizer_args.deserializer_params) as deserializer:
             deserializer.load_into_module(self.model)
             end = time.perf_counter()
@@ -400,33 +406,70 @@ def is_vllm_tensorized(tensorizer_config: "TensorizerConfig") -> bool:
     return False
 
 
-def get_pretensorized_vllm_model(engine: "LLMEngine") -> nn.Module:
-    model = (engine.model_executor.driver_worker.model_runner.model)
+def serialize_vllm_model(
+    model: nn.Module,
+    tensorizer_config: TensorizerConfig,
+) -> nn.Module:
     model.register_parameter(
         "vllm_tensorized_marker",
         nn.Parameter(torch.tensor((1, ), device="meta"), requires_grad=False))
-    return model
-
-
-def serialize_vllm_model(engine: "LLMEngine",
-                         tensorizer_config : TensorizerConfig,
-                         encryption_key_path: Optional[str] = None) \
-        -> nn.Module:
-
-    model = get_pretensorized_vllm_model(engine)
     tensorizer_args = tensorizer_config._construct_tensorizer_args()
+
     encryption_params = None
-    if encryption_key_path is not None:
-        encryption_params = EncryptionParams.random()
-        with _write_stream(encryption_key_path,
-                           **tensorizer_args.stream_params) as stream:
-            stream.write(encryption_params.key)
+    if (keyfile := tensorizer_config.encryption_keyfile) is not None:
+        with open(keyfile, "rb") as f:
+            key = f.read()
+        encryption_params = EncryptionParams(key=key)
 
-    with _write_stream(tensorizer_args.tensorizer_uri,
-                       **tensorizer_args.stream_params) as stream:
+    output_file = tensorizer_args.tensorizer_uri
+    if tensorizer_config._is_sharded:
+        from vllm.distributed import get_tensor_model_parallel_rank
+        output_file = output_file % get_tensor_model_parallel_rank()
+
+    with _write_stream(output_file, **tensorizer_args.stream_params) as stream:
         serializer = TensorSerializer(stream, encryption=encryption_params)
         serializer.write_module(model)
         serializer.close()
-    logger.info("Successfully serialized model to %s",
-                str(tensorizer_args.tensorizer_uri))
+    logger.info("Successfully serialized model to %s", str(output_file))
     return model
+
+
+def tensorize_vllm_model(engine_args: EngineArgs,
+                         tensorizer_config: TensorizerConfig,
+                         generate_keyfile: bool = True):
+    """Utility to load a model and then serialize it with Tensorizer
+
+       Intended to be used separately from running a vLLM server since it
+       creates its own Engine instance.
+    """
+    engine_config = engine_args.create_engine_config()
+    tensorizer_config.verify_with_model_config(engine_config.model_config)
+    tensorizer_config.verify_with_parallel_config(
+        engine_config.parallel_config)
+
+    # generate the encryption key before creating the engine to support sharding
+    if generate_keyfile and (keyfile :=
+                             tensorizer_config.encryption_keyfile) is not None:
+        encryption_params = EncryptionParams.random()
+        with _write_stream(
+                keyfile,
+                s3_access_key_id=tensorizer_config.s3_access_key_id,
+                s3_secret_access_key=tensorizer_config.s3_secret_access_key,
+                s3_endpoint=tensorizer_config.s3_endpoint,
+        ) as stream:
+            stream.write(encryption_params.key)
+
+    engine = LLMEngine.from_engine_args(engine_args)
+    if tensorizer_config._is_sharded:
+        # if the engine is a distributed engine (for tensor parallel) then each
+        # worker shard needs to serialize its part of the model.
+        engine.model_executor._run_workers(
+            "save_tensorized_model",
+            tensorizer_config=tensorizer_config,
+        )
+    else:
+        # with a single worker, we can get to the underlying model directly
+        serialize_vllm_model(
+            engine.model_executor.driver_worker.model_runner.model,
+            tensorizer_config,
+        )
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 99b12293a..de616ef1d 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -20,6 +20,7 @@ from vllm.lora.request import LoRARequest
 from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
+from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
@@ -222,6 +223,16 @@ class ModelRunner:
             max_size=max_size,
         )
 
+    def save_tensorized_model(
+        self,
+        tensorizer_config: TensorizerConfig,
+    ) -> None:
+        from vllm.model_executor.model_loader.loader import TensorizerLoader
+        TensorizerLoader.save_model(
+            self.model,
+            tensorizer_config=tensorizer_config,
+        )
+
     def get_max_block_per_batch(self) -> int:
         block_size = self.block_size
         return (self.max_seq_len_to_capture + block_size - 1) // block_size
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 10411a2bf..7a378a862 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -15,6 +15,7 @@ from vllm.distributed import (broadcast_tensor_dict,
                               set_custom_all_reduce)
 from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
+from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 from vllm.sequence import ExecuteModelRequest, PoolerOutput, SamplerOutput
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.embedding_model_runner import EmbeddingModelRunner
@@ -132,6 +133,13 @@ class Worker(WorkerBase):
             max_size=max_size,
         )
 
+    def save_tensorized_model(
+        self,
+        tensorizer_config: TensorizerConfig,
+    ) -> None:
+        self.model_runner.save_tensorized_model(
+            tensorizer_config=tensorizer_config, )
+
     @torch.inference_mode()
     def determine_num_available_blocks(self) -> Tuple[int, int]:
         """Profiles the peak memory usage of the model to determine how many
-- 
GitLab


From 622d45128c02e5296e1177481c65199754eab396 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 12 Jun 2024 14:46:35 -0700
Subject: [PATCH 010/376] [misc] add hint for AttributeError (#5462)

---
 vllm/_custom_ops.py | 48 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 440b0e8af..955086be1 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1,13 +1,16 @@
 import contextlib
+import functools
 from typing import List, Optional, Tuple, Type
 
 import torch
 
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
 try:
     import vllm._C
 except ImportError as e:
-    from vllm.logger import init_logger
-    logger = init_logger(__name__)
     logger.warning("Failed to import from vllm._C with %r", e)
 
 with contextlib.suppress(ImportError):
@@ -23,6 +26,25 @@ def is_custom_op_supported(op_name: str) -> bool:
     return op is not None
 
 
+def hint_on_error(fn):
+
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        try:
+            return fn(*args, **kwargs)
+        except AttributeError as e:
+            msg = (
+                "Error in calling custom op %s: %s\n"
+                "Possibly you have built or installed an obsolete version of vllm.\n"
+                "Please try a clean build and install of vllm,"
+                "or remove old built files such as vllm/*cpython*.so and build/ ."
+            )
+            logger.error(msg, fn.__name__, e)
+            raise e
+
+    return wrapper
+
+
 # activation ops
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
     torch.ops._C.silu_and_mul(out, x)
@@ -459,3 +481,25 @@ def dispatch_bgmv_low_level(
         h_out,
         y_offset,
     )
+
+
+# temporary fix for https://github.com/vllm-project/vllm/issues/5456
+# TODO: remove this in v0.6.0
+names_and_values = globals()
+names_and_values_to_update = {}
+# prepare variables to avoid dict size change during iteration
+k, v, arg = None, None, None
+fn_type = type(lambda x: x)
+for k, v in names_and_values.items():
+    # find functions that are defined in this file and have torch.Tensor
+    # in their annotations. `arg == "torch.Tensor"` is used to handle
+    # the case when users use `import __annotations__` to turn type
+    # hints into strings.
+    if isinstance(v, fn_type) \
+        and v.__code__.co_filename == __file__ \
+        and any(arg is torch.Tensor or arg == "torch.Tensor"
+                   for arg in v.__annotations__.values()):
+        names_and_values_to_update[k] = hint_on_error(v)
+
+names_and_values.update(names_and_values_to_update)
+del names_and_values_to_update, names_and_values, v, k, fn_type
-- 
GitLab


From b8d4dfff9c29ad6e02bce1fc79c089120b2d34d6 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 13 Jun 2024 05:49:31 +0800
Subject: [PATCH 011/376] [Doc] Update debug docs (#5438)

---
 docs/source/getting_started/debugging.rst | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst
index 3e4d0362e..ff37f4e62 100644
--- a/docs/source/getting_started/debugging.rst
+++ b/docs/source/getting_started/debugging.rst
@@ -8,27 +8,30 @@ Debugging hang/crash issues
 
 When an vLLM instance hangs or crashes, it is very difficult to debug the issue. But wait a minute, it is also possible that vLLM is doing something that indeed takes a long time:
 
-- Downloading a model: do you have the model already downloaded in your disk? If not, vLLM will download the model from the internet, which can take a long time. Be sure to check the internet connection. It would be better to download the model first using `huggingface cli <https://huggingface.co/docs/huggingface_hub/en/guides/cli>`_ and then use the local path to the model. This way, you can isolate the issue.
-- Loading the model from disk: if the model is large, it can take a long time to load the model from disk. Please take care of the location you store the model. Some clusters have shared filesystems across nodes, e.g. distributed filesystem or network filesystem, which can be slow. It would be better to store the model in a local disk. In addition, please also watch the CPU memory usage. When the model is too large, it might take much CPU memory, which can slow down the operating system because it needs to frequently swap memory between the disk and the memory.
-- Tensor parallel inference: if the model is too large to fit in a single GPU, you might want to use tensor parallelism to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `the provided script <https://docs.vllm.ai/en/latest/getting_started/examples/save_sharded_state.html>`_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
+- **Downloading a model**: Do you have the model already downloaded in your disk? If not, vLLM will download the model from the internet, which can take a long time. Be sure to check the internet connection. It would be better to download the model first using `huggingface-cli <https://huggingface.co/docs/huggingface_hub/en/guides/cli>`_ and then use the local path to the model. This way, you can isolate the issue.
+- **Loading the model from disk**: If the model is large, it can take a long time to load the model from disk. Please take care of the location you store the model. Some clusters have shared filesystems across nodes, e.g. distributed filesystem or network filesystem, which can be slow. It would be better to store the model in a local disk. In addition, please also watch the CPU memory usage. When the model is too large, it might take much CPU memory, which can slow down the operating system because it needs to frequently swap memory between the disk and the memory.
+- **Tensor parallel inference**: If the model is too large to fit in a single GPU, you might want to use tensor parallelism to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `the provided script <https://docs.vllm.ai/en/latest/getting_started/examples/save_sharded_state.html>`_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
 
-If you already take care of the above issues, and the vLLM instance still hangs, with CPU and GPU utilization at near zero, it is likely that the vLLM instance is stuck somewhere. Here are some tips to help debug the issue:
+If you have already taken care of the above issues, but the vLLM instance still hangs, with CPU and GPU utilization at near zero, it is likely that the vLLM instance is stuck somewhere. Here are some tips to help debug the issue:
 
 - Set the environment variable ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging.
 - Set the environment variable ``export CUDA_LAUNCH_BLOCKING=1`` to know exactly which CUDA kernel is causing the trouble.
 - Set the environment variable ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL.
-- Set the environment variable ``export VLLM_TRACE_FUNCTION=1`` . All the function calls in vLLM will be recorded. Inspect these log files, and tell which function crashes or hangs. **Note: it will generate a lot of logs and slow down the system. Only use it for debugging purposes.**
+- Set the environment variable ``export VLLM_TRACE_FUNCTION=1``. All the function calls in vLLM will be recorded. Inspect these log files, and tell which function crashes or hangs.
+
+  .. warning::
+    vLLM function tracing will generate a lot of logs and slow down the system. Only use it for debugging purposes.
 
 With more logging, hopefully you can find the root cause of the issue.
 
 Here are some common issues that can cause hangs:
 
-- The network setup is incorrect. The vLLM instance cannot get the correct IP address. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``.
-- Hardware/driver setup is incorrect. GPU communication cannot be established. You can run a sanity check script below to see if the GPU communication is working correctly.
+- **Incorrect network setup**: The vLLM instance cannot get the correct IP address. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``.
+- **Incorrect hardware/driver**: GPU communication cannot be established. You can run the following sanity check script to see if the GPU communication is working correctly.
 
 .. code-block:: python
 
-    # save it as `test.py`` , and run it with `NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py`
+    # save it as `test.py` , and run it with `NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py`
     # adjust `--nproc-per-node` to the number of GPUs you want to use.
     import torch
     import torch.distributed as dist
@@ -39,4 +42,4 @@ Here are some common issues that can cause hangs:
     value = data.mean().item()
     assert value == dist.get_world_size()
 
-If the problem persists, feel free to open an `issue <https://github.com/vllm-project/vllm/issues/new/choose>`_ on GitHub, with a detailed description of the issue, your environment, and the logs.
+If the problem persists, feel free to `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_, with a detailed description of the issue, your environment, and the logs.
-- 
GitLab


From 94a07bbdd813a0121d01a852ab03fb2430e73548 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Wed, 12 Jun 2024 17:59:44 -0400
Subject: [PATCH 012/376] [Bugfix] Fix typo in scheduler.py (requeset ->
 request) (#5470)

---
 vllm/core/scheduler.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index bb37c5f31..48c34625c 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -50,8 +50,8 @@ class SchedulingBudget:
     """
     token_budget: int
     max_num_seqs: int
-    _requeset_ids_num_batched_tokens: Set[str] = field(default_factory=set)
-    _requeset_ids_num_curr_seqs: Set[str] = field(default_factory=set)
+    _request_ids_num_batched_tokens: Set[str] = field(default_factory=set)
+    _request_ids_num_curr_seqs: Set[str] = field(default_factory=set)
     _num_batched_tokens: int = 0
     _num_curr_seqs: int = 0
 
@@ -65,28 +65,28 @@ class SchedulingBudget:
         return self.token_budget - self.num_batched_tokens
 
     def add_num_batched_tokens(self, req_id: str, num_batched_tokens: int):
-        if req_id in self._requeset_ids_num_batched_tokens:
+        if req_id in self._request_ids_num_batched_tokens:
             return
 
-        self._requeset_ids_num_batched_tokens.add(req_id)
+        self._request_ids_num_batched_tokens.add(req_id)
         self._num_batched_tokens += num_batched_tokens
 
     def subtract_num_batched_tokens(self, req_id: str,
                                     num_batched_tokens: int):
-        if req_id in self._requeset_ids_num_batched_tokens:
-            self._requeset_ids_num_batched_tokens.remove(req_id)
+        if req_id in self._request_ids_num_batched_tokens:
+            self._request_ids_num_batched_tokens.remove(req_id)
             self._num_batched_tokens -= num_batched_tokens
 
     def add_num_seqs(self, req_id: str, num_curr_seqs: int):
-        if req_id in self._requeset_ids_num_curr_seqs:
+        if req_id in self._request_ids_num_curr_seqs:
             return
 
-        self._requeset_ids_num_curr_seqs.add(req_id)
+        self._request_ids_num_curr_seqs.add(req_id)
         self._num_curr_seqs += num_curr_seqs
 
     def subtract_num_seqs(self, req_id: str, num_curr_seqs: int):
-        if req_id in self._requeset_ids_num_curr_seqs:
-            self._requeset_ids_num_curr_seqs.remove(req_id)
+        if req_id in self._request_ids_num_curr_seqs:
+            self._request_ids_num_curr_seqs.remove(req_id)
             self._num_curr_seqs -= num_curr_seqs
 
     @property
-- 
GitLab


From 7d19de2e9c9a94658c36b55011b803a7991d0335 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Wed, 12 Jun 2024 18:42:12 -0400
Subject: [PATCH 013/376] [Frontend] Add "input speed" to tqdm postfix
 alongside output speed (#5425)

---
 vllm/entrypoints/llm.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 411d5256b..9e9234931 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -545,11 +545,13 @@ class LLM:
                 total=num_requests,
                 desc="Processed prompts",
                 dynamic_ncols=True,
-                postfix=f"Generation Speed: {0:.2f} toks/s",
+                postfix=(f"est. speed input: {0:.2f} toks/s, "
+                         f"output: {0:.2f} toks/s"),
             )
         # Run the engine.
         outputs: List[Union[RequestOutput, EmbeddingRequestOutput]] = []
-        total_toks = 0
+        total_in_toks = 0
+        total_out_toks = 0
         while self.llm_engine.has_unfinished_requests():
             step_outputs = self.llm_engine.step()
             for output in step_outputs:
@@ -558,10 +560,15 @@ class LLM:
                     if use_tqdm:
                         if isinstance(output, RequestOutput):
                             # Calculate tokens only for RequestOutput
-                            total_toks += sum(
+                            total_in_toks += len(output.prompt_token_ids)
+                            in_spd = total_in_toks / pbar.format_dict["elapsed"]
+                            total_out_toks += sum(
                                 len(stp.token_ids) for stp in output.outputs)
-                            spd = total_toks / pbar.format_dict["elapsed"]
-                            pbar.postfix = f"Generation Speed: {spd:.2f} toks/s"
+                            out_spd = total_out_toks / pbar.format_dict[
+                                "elapsed"]
+                            pbar.postfix = (
+                                f"est. speed input: {in_spd:.2f} toks/s, "
+                                f"output: {out_spd:.2f} toks/s")
                         pbar.update(1)
         if use_tqdm:
             pbar.close()
-- 
GitLab


From 2135cacb457b7daf1143c8465ab72650eaa4dd7e Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Thu, 13 Jun 2024 07:20:18 +0800
Subject: [PATCH 014/376] [Bugfix] Fix wrong multi_modal_input format for CPU
 runner (#5451)

---
 vllm/worker/cpu_model_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index eaf43247d..d539f5693 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -343,8 +343,8 @@ class CPUModelRunner:
             "kv_caches": kv_caches,
             "attn_metadata": attn_metadata,
         }
-        if self.vision_language_config:
-            execute_model_kwargs.update({"image_input": multi_modal_input})
+        if self.vision_language_config and multi_modal_input is not None:
+            execute_model_kwargs.update(multi_modal_input)
 
         hidden_states = model_executable(**execute_model_kwargs)
 
-- 
GitLab


From ea3890a5f0314e49d69afca45fe706504cb14029 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 12 Jun 2024 17:27:08 -0700
Subject: [PATCH 015/376] [Core][Distributed] code deduplication in tp&pp with
 coordinator(#5293)

[Core][Distributed] add coordinator to reduce code duplication in tp and pp (#5293)
---
 tests/conftest.py                             |   4 +-
 tests/distributed/test_custom_all_reduce.py   |   6 +-
 tests/distributed/test_pynccl.py              |  12 +-
 tests/lora/conftest.py                        |  23 +-
 tests/worker/test_model_runner.py             |   4 +-
 vllm/attention/backends/pallas.py             |   2 +-
 vllm/distributed/communication_op.py          | 311 +------
 .../device_communicators/custom_all_reduce.py |  13 +-
 .../custom_all_reduce_utils.py                |   7 +-
 .../device_communicators/pynccl.py            |  11 +-
 vllm/distributed/parallel_state.py            | 809 ++++++++++++------
 vllm/worker/model_runner.py                   |   2 +-
 12 files changed, 622 insertions(+), 582 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index e0680467d..29a4f126f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -15,7 +15,8 @@ from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
 
 from vllm import LLM, SamplingParams
 from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
-from vllm.distributed import destroy_model_parallel
+from vllm.distributed import (destroy_distributed_environment,
+                              destroy_model_parallel)
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
 from vllm.multimodal import MultiModalData
@@ -54,6 +55,7 @@ def _read_prompts(filename: str) -> List[str]:
 
 def cleanup():
     destroy_model_parallel()
+    destroy_distributed_environment()
     with contextlib.suppress(AssertionError):
         torch.distributed.destroy_process_group()
     gc.collect()
diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py
index 186f9faa6..3776c1f91 100644
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -7,9 +7,9 @@ import torch
 import torch.distributed as dist
 
 from vllm.distributed.communication_op import (  # noqa
-    graph_capture, tensor_model_parallel_all_reduce)
+    tensor_model_parallel_all_reduce)
 from vllm.distributed.parallel_state import (get_tensor_model_parallel_group,
-                                             get_tp_ca_communicator)
+                                             get_tp_group, graph_capture)
 
 from ..utils import (init_test_distributed_environment,
                      multi_process_tensor_parallel)
@@ -91,7 +91,7 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
     # communicate independently
     num_communication = rank // tp_size + 1
     sz = 1024
-    fa = get_tp_ca_communicator()
+    fa = get_tp_group().ca_comm
     inp = torch.ones(sz, dtype=torch.float32, device=device)
     out = inp
     for _ in range(num_communication):
diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index 0218295a3..b788e253a 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -6,10 +6,11 @@ import torch
 import torch.distributed
 
 from vllm.distributed.communication_op import (  # noqa
-    graph_capture, tensor_model_parallel_all_reduce)
+    tensor_model_parallel_all_reduce)
 from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary
 from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
+                                             get_world_group, graph_capture,
                                              init_distributed_environment)
 from vllm.utils import update_environment_variables
 
@@ -53,7 +54,8 @@ def worker_fn_wrapper(fn):
 
 @worker_fn_wrapper
 def worker_fn():
-    pynccl_comm = PyNcclCommunicator()
+    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
+                                     device=get_world_group().device)
     tensor = torch.ones(16, 1024, 1024,
                         dtype=torch.float32).cuda(pynccl_comm.rank)
     with pynccl_comm.change_state(enable=True):
@@ -129,7 +131,8 @@ def test_pynccl_multiple_allreduce_with_vllm():
 def worker_fn_with_cudagraph():
     with torch.no_grad():
         graph = torch.cuda.CUDAGraph()
-        pynccl_comm = PyNcclCommunicator()
+        pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
+                                         device=get_world_group().device)
         # run something in the default stream to initialize torch engine
         a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}')
         torch.cuda.synchronize()
@@ -154,7 +157,8 @@ def test_pynccl_with_cudagraph():
 
 @worker_fn_wrapper
 def send_recv_worker_fn():
-    pynccl_comm = PyNcclCommunicator()
+    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
+                                     device=get_world_group().device)
     if pynccl_comm.rank == 0:
         tensor = torch.ones(16, 1024, 1024,
                             dtype=torch.float32).cuda(pynccl_comm.rank)
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 400333066..522c635b8 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -12,7 +12,10 @@ from huggingface_hub import snapshot_download
 
 import vllm
 from vllm.config import LoRAConfig
-from vllm.distributed import destroy_model_parallel, initialize_model_parallel
+from vllm.distributed import (destroy_distributed_environment,
+                              destroy_model_parallel,
+                              init_distributed_environment,
+                              initialize_model_parallel)
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                MergedColumnParallelLinear,
                                                RowParallelLinear)
@@ -35,6 +38,7 @@ LONG_LORA_INFOS = [{
 
 def cleanup():
     destroy_model_parallel()
+    destroy_distributed_environment()
     with contextlib.suppress(AssertionError):
         torch.distributed.destroy_process_group()
     gc.collect()
@@ -64,15 +68,14 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
 
 @pytest.fixture
 def dist_init():
-    if not torch.distributed.is_initialized():
-        temp_file = tempfile.mkstemp()[1]
-        torch.distributed.init_process_group(
-            backend="nccl",
-            world_size=1,
-            rank=0,
-            init_method=f"file://{temp_file}",
-        )
-        torch.distributed.all_reduce(torch.zeros(1).cuda())
+    temp_file = tempfile.mkstemp()[1]
+    init_distributed_environment(
+        world_size=1,
+        rank=0,
+        distributed_init_method=f"file://{temp_file}",
+        local_rank=0,
+        backend="nccl",
+    )
     initialize_model_parallel(1, 1)
     yield
     cleanup()
diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py
index 92de545ac..514a57e17 100644
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -1,7 +1,8 @@
 import pytest
 import torch
 
-from vllm.distributed.parallel_state import init_distributed_environment
+from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
+                                             init_distributed_environment)
 from vllm.engine.arg_utils import EngineArgs
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
@@ -292,6 +293,7 @@ def distributed_init():
         rank=0,
         distributed_init_method=f"tcp://127.0.0.1:{get_open_port()}",
         local_rank=0)
+    ensure_model_parallel_initialized(1, 1)
 
 
 @pytest.mark.parametrize("batch_size", list(range(2, 128)))
diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index b203c5ec5..75f246526 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -110,7 +110,7 @@ class PallasAttentionBackendImpl(AttentionImpl):
             raise NotImplementedError("TPU version must be 4 or higher.")
 
         self.megacore_mode = None
-        tpu_type = torch_xla.tpu.get_tpu_env()["TYPE"].lower()
+        tpu_type = torch_xla.tpu.get_tp_groupu_env()["TYPE"].lower()
         if not tpu_type.endswith("lite"):
             if self.num_kv_heads % 2 == 0:
                 self.megacore_mode = "kv_head"
diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py
index 2b38ec472..32394a07b 100644
--- a/vllm/distributed/communication_op.py
+++ b/vllm/distributed/communication_op.py
@@ -1,317 +1,32 @@
-from collections import namedtuple
-from contextlib import contextmanager, nullcontext
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, Optional, Union
 
 import torch
-from torch.distributed import ProcessGroup
+import torch.distributed
 
-from .parallel_state import (get_cpu_world_group, get_pp_pynccl_communicator,
-                             get_tensor_model_parallel_group,
-                             get_tensor_model_parallel_rank,
-                             get_tensor_model_parallel_world_size,
-                             get_tp_ca_communicator,
-                             get_tp_pynccl_communicator)
-
-
-@dataclass
-class GraphCaptureContext:
-    stream: torch.cuda.Stream
-
-
-@contextmanager
-def graph_capture():
-    """
-    `graph_capture` is a context manager which should surround the code that
-    is capturing the CUDA graph. Its main purpose is to ensure that the
-    some operations will be run after the graph is captured, before the graph
-    is replayed. It returns a `GraphCaptureContext` object which contains the
-    necessary data for the graph capture. Currently, it only contains the
-    stream that the graph capture is running on. This stream is set to the
-    current CUDA stream when the context manager is entered and reset to the
-    default stream when the context manager is exited. This is to ensure that
-    the graph capture is running on a separate stream from the default stream,
-    in order to explicitly distinguish the kernels to capture
-    from other kernels possibly launched on background in the default stream.
-    """
-    stream = torch.cuda.Stream()
-    graph_capture_context = GraphCaptureContext(stream)
-    ca_comm = get_tp_ca_communicator()
-    maybe_ca_context = nullcontext() if ca_comm is None else ca_comm.capture()
-    with torch.cuda.stream(stream), maybe_ca_context:
-        # In graph mode, we have to be very careful about the collective
-        # operations. The current status is:
-        #     allreduce \ Mode   |  Eager  |  Graph  |
-        # --------------------------------------------
-        # custom allreduce       | enabled | enabled |
-        # PyNccl                 | disabled| enabled |
-        # torch.distributed      | enabled | disabled|
-        #
-        # Note that custom allreduce will have a runtime check, if the tensor
-        #  size is too large, it will fallback to the next available option.
-        # In summary: When using CUDA graph, we use
-        # either custom all-reduce kernel or pynccl. When not using CUDA
-        # graph, we use either custom all-reduce kernel or PyTorch NCCL.
-        # We always prioritize using custom all-reduce kernel but fall back
-        # to PyTorch or pynccl if it is disabled or not supported.
-        tp_pynccl_comm = get_tp_pynccl_communicator()
-        pp_pynccl_comm = get_pp_pynccl_communicator()
-        if not tp_pynccl_comm:
-            maybe_tp_pynccl_context = nullcontext()
-        else:
-            maybe_tp_pynccl_context = tp_pynccl_comm.change_state(
-                enable=True, stream=torch.cuda.current_stream())
-        if not pp_pynccl_comm:
-            maybe_pp_pynccl_context = nullcontext()
-        else:
-            maybe_pp_pynccl_context = pp_pynccl_comm.change_state(
-                enable=True, stream=torch.cuda.current_stream())
-        with maybe_tp_pynccl_context, maybe_pp_pynccl_context:
-            yield graph_capture_context
+from .parallel_state import get_tp_group
 
 
 def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
-    """All-reduce the input tensor across model parallel group.
-
-    NOTE: This operation will be applied in-place on the input tensor if
-    disable_custom_all_reduce is set to True. Otherwise, this operation may or
-    may not be applied in place depending on whether custom all reduce is
-    invoked for a particular tensor, which further depends on the tensor size
-    and GPU topology.
-
-    TLDR: always assume this function modifies its input, but use the return
-    value as the output.
-    """
-    ca_comm = get_tp_ca_communicator()
-
-    # Bypass the function if we are using only 1 GPU.
-    if get_tensor_model_parallel_world_size() == 1:
-        return input_
-    if ca_comm is not None:
-        out = ca_comm.custom_all_reduce(input_)
-        if out is not None:
-            return out
-    pynccl_comm = get_tp_pynccl_communicator()
-    if (pynccl_comm is not None and not pynccl_comm.disabled):
-        pynccl_comm.all_reduce(input_)
-    else:
-        torch.distributed.all_reduce(input_,
-                                     group=get_tensor_model_parallel_group())
-    return input_
+    """All-reduce the input tensor across model parallel group."""
+    return get_tp_group().all_reduce(input_)
 
 
 def tensor_model_parallel_all_gather(input_: torch.Tensor,
                                      dim: int = -1) -> torch.Tensor:
     """All-gather the input tensor across model parallel group."""
-    world_size = get_tensor_model_parallel_world_size()
-    # Bypass the function if we are using only 1 GPU.
-    if world_size == 1:
-        return input_
-    assert -input_.dim() <= dim < input_.dim(), (
-        f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
-    if dim < 0:
-        # Convert negative dim to positive.
-        dim += input_.dim()
-    input_size = input_.size()
-    # Allocate output tensor.
-    output_tensor = torch.empty((world_size, ) + input_size,
-                                dtype=input_.dtype,
-                                device=input_.device)
-    # All-gather.
-    torch.distributed.all_gather_into_tensor(
-        output_tensor, input_, group=get_tensor_model_parallel_group())
-    # Reshape
-    output_tensor = output_tensor.movedim(0, dim)
-    output_tensor = output_tensor.reshape(input_size[:dim] +
-                                          (world_size * input_size[dim], ) +
-                                          input_size[dim + 1:])
-    return output_tensor
+    return get_tp_group().all_gather(input_, dim)
 
 
 def tensor_model_parallel_gather(input_: torch.Tensor,
                                  dst: int = 0,
                                  dim: int = -1) -> torch.Tensor:
-    """Gather the input tensor across model parallel group.
-
-    NOTE: We assume that the input tensor is on the same device across
-    all the ranks.
-    """
-    world_size = get_tensor_model_parallel_world_size()
-    # Bypass the function if we are using only 1 GPU.
-    if world_size == 1:
-        return input_
-    assert -input_.dim() <= dim < input_.dim(), (
-        f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
-    if dim < 0:
-        # Convert negative dim to positive.
-        dim += input_.dim()
-    # Allocate output tensor.
-    if get_tensor_model_parallel_rank() == dst:
-        gather_list = [torch.empty_like(input_) for _ in range(world_size)]
-    else:
-        gather_list = None
-    # Gather.
-    torch.distributed.gather(input_,
-                             gather_list,
-                             dst=dst,
-                             group=get_tensor_model_parallel_group())
-    if get_tensor_model_parallel_rank() == dst:
-        output_tensor = torch.cat(gather_list, dim=dim)
-    else:
-        output_tensor = None
-    return output_tensor
-
-
-def broadcast(input_: torch.Tensor,
-              src: int = 0,
-              group: Optional[ProcessGroup] = None):
-    """Broadcast the input tensor."""
-    group = group or torch.distributed.group.WORLD
-    ranks = torch.distributed.get_process_group_ranks(group)
-    assert src in ranks, f"Invalid src rank ({src})"
-
-    # Bypass the function if we are using only 1 GPU.
-    world_size = torch.distributed.get_world_size(group=group)
-    if world_size == 1:
-        return input_
-    # Broadcast.
-    torch.distributed.broadcast(input_, src=src, group=group)
-    return input_
+    """Gather the input tensor across model parallel group."""
+    return get_tp_group().gather(input_, dst, dim)
 
 
-def broadcast_object_list(obj_list: List[Any],
-                          src: int = 0,
-                          group: Optional[ProcessGroup] = None):
-    """Broadcast the input object list."""
-    group = group or torch.distributed.group.WORLD
-    ranks = torch.distributed.get_process_group_ranks(group)
-    assert src in ranks, f"Invalid src rank ({src})"
-
-    # Bypass the function if we are using only 1 GPU.
-    world_size = torch.distributed.get_world_size(group=group)
-    if world_size == 1:
-        return obj_list
-    # Broadcast.
-    torch.distributed.broadcast_object_list(obj_list, src=src, group=group)
-    return obj_list
-
-
-TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])
-
-
-def _split_tensor_dict(
-    tensor_dict: Dict[Any, Union[torch.Tensor, Any]]
-) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]:
-    """Split the tensor dictionary into two parts:
-    1. A list of (key, value) pairs. If the value is a tensor, it is replaced
-         by its metadata.
-    2. A list of tensors.
-    """
-    metadata_list = []
-    tensor_list = []
-    for key, value in tensor_dict.items():
-        if isinstance(value, torch.Tensor):
-            # Note: we cannot use `value.device` here,
-            # because it contains not only the device type but also the device
-            # index (e.g. "cuda:0"). We only need the device type.
-            # receiving side will set the device index.
-            device = "cpu" if value.is_cpu else "cuda"
-            metadata_list.append(
-                (key, TensorMetadata(device, value.dtype, value.size())))
-            tensor_list.append(value)
-        else:
-            metadata_list.append((key, value))
-    return metadata_list, tensor_list
-
-
-def broadcast_tensor_dict(
-    tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None,
-    src: int = 0,
-    group: Optional[ProcessGroup] = None,
-    metadata_group: Optional[ProcessGroup] = None
-) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]:
-    """Broadcast the input tensor dictionary.
-    `group` is used to broadcast the tensors, while `metadata_group` is used
-     to broadcast the metadata of the dict (e.g. dict structure, tensor sizes,
-     dtypes).
-    """
-    # Bypass the function if we are using only 1 GPU.
-    if (not torch.distributed.is_initialized()
-            or torch.distributed.get_world_size(group=group) == 1):
+def broadcast_tensor_dict(tensor_dict: Optional[Dict[Any, Union[torch.Tensor,
+                                                                Any]]] = None,
+                          src: int = 0):
+    if not torch.distributed.is_initialized():
         return tensor_dict
-
-    group = group or torch.distributed.group.WORLD
-    metadata_group = metadata_group or get_cpu_world_group()
-    ranks = torch.distributed.get_process_group_ranks(group)
-    assert src in ranks, f"Invalid src rank ({src})"
-
-    rank = torch.distributed.get_rank()
-    if rank == src:
-        metadata_list: List[Tuple[Any, Any]] = []
-        assert isinstance(
-            tensor_dict,
-            dict), (f"Expecting a dictionary, got {type(tensor_dict)}")
-        metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
-        # `metadata_list` lives in CPU memory.
-        # `broadcast_object_list` involves serialization and deserialization,
-        # all happening on CPU. Therefore, we can use the CPU group.
-        torch.distributed.broadcast_object_list([metadata_list],
-                                                src=src,
-                                                group=metadata_group)
-        async_handles = []
-        for tensor in tensor_list:
-            if tensor.numel() == 0:
-                # Skip broadcasting empty tensors.
-                continue
-            if tensor.is_cpu:
-                # use metadata_group for CPU tensors
-                handle = torch.distributed.broadcast(tensor,
-                                                     src=src,
-                                                     group=metadata_group,
-                                                     async_op=True)
-            else:
-                # use group for GPU tensors
-                handle = torch.distributed.broadcast(tensor,
-                                                     src=src,
-                                                     group=group,
-                                                     async_op=True)
-            async_handles.append(handle)
-        for async_handle in async_handles:
-            async_handle.wait()
-
-    else:
-        recv_metadata_list = [None]
-        torch.distributed.broadcast_object_list(recv_metadata_list,
-                                                src=src,
-                                                group=metadata_group)
-        assert recv_metadata_list[0] is not None
-        tensor_dict = {}
-        async_handles = []
-        for key, value in recv_metadata_list[0]:
-            if isinstance(value, TensorMetadata):
-                tensor = torch.empty(value.size,
-                                     dtype=value.dtype,
-                                     device=value.device)
-                if tensor.numel() == 0:
-                    # Skip broadcasting empty tensors.
-                    tensor_dict[key] = tensor
-                    continue
-                if tensor.is_cpu:
-                    # use metadata_group for CPU tensors
-                    handle = torch.distributed.broadcast(tensor,
-                                                         src=src,
-                                                         group=metadata_group,
-                                                         async_op=True)
-                else:
-                    # use group for GPU tensors
-                    handle = torch.distributed.broadcast(tensor,
-                                                         src=src,
-                                                         group=group,
-                                                         async_op=True)
-                async_handles.append(handle)
-                tensor_dict[key] = tensor
-            else:
-                tensor_dict[key] = value
-        for async_handle in async_handles:
-            async_handle.wait()
-    return tensor_dict
+    return get_tp_group().broadcast_tensor_dict(tensor_dict, src)
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index bbc2284f8..9a2b47594 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -9,8 +9,7 @@ import vllm.envs as envs
 from vllm import _custom_ops as ops
 from vllm.distributed.device_communicators.custom_all_reduce_utils import (
     gpu_p2p_access_check)
-from vllm.distributed.parallel_state import (
-    get_local_rank, get_tensor_model_parallel_cpu_group, is_in_the_same_node)
+from vllm.distributed.parallel_state import is_in_the_same_node
 from vllm.logger import init_logger
 
 try:
@@ -86,8 +85,8 @@ class CustomAllreduce:
 
     # max_size: max supported allreduce size
     def __init__(self,
-                 group: Optional[ProcessGroup] = None,
-                 device: Optional[Union[int, str, torch.device]] = None,
+                 group: ProcessGroup,
+                 device: Union[int, str, torch.device],
                  max_size=8192 * 1024) -> None:
         """
         Args:
@@ -107,7 +106,6 @@ class CustomAllreduce:
             # e.g. in a non-cuda environment
             return
 
-        group = group or get_tensor_model_parallel_cpu_group()
         self.group = group
 
         assert dist.get_backend(group) != dist.Backend.NCCL, (
@@ -134,10 +132,7 @@ class CustomAllreduce:
                 world_size, str(CustomAllreduce._SUPPORTED_WORLD_SIZES))
             return
 
-        if device is None:
-            local_rank = get_local_rank()
-            device = torch.device(f"cuda:{local_rank}")
-        elif isinstance(device, int):
+        if isinstance(device, int):
             device = torch.device(f"cuda:{device}")
         elif isinstance(device, str):
             device = torch.device(device)
diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index 4b89a23df..1fd0058f6 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -11,7 +11,6 @@ import torch.distributed as dist
 import torch.multiprocessing as mp
 
 import vllm.envs as envs
-from vllm.distributed.parallel_state import get_cpu_world_group, get_local_rank
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -162,7 +161,8 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
         f"{VLLM_CONFIG_ROOT}/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
     )
     os.makedirs(os.path.dirname(path), exist_ok=True)
-    if ((not is_distributed or get_local_rank() == 0)
+    from vllm.distributed.parallel_state import get_world_group
+    if ((not is_distributed or get_world_group().local_rank == 0)
             and (not os.path.exists(path))):
         # only the local master process (with local_rank == 0) can
         #  enter this block to calculate the cache
@@ -174,8 +174,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
         with open(path, "w") as f:
             json.dump(cache, f, indent=4)
     if is_distributed:
-        cpu_world_group = get_cpu_world_group()
-        dist.barrier(cpu_world_group)
+        get_world_group().barrier()
     logger.info("reading GPU P2P access cache from %s", path)
     with open(path, "r") as f:
         cache = json.load(f)
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index f5f1de0c7..83eec264b 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -9,7 +9,6 @@ from torch.distributed import ProcessGroup, ReduceOp
 from vllm.distributed.device_communicators.pynccl_wrapper import (
     NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum,
     ncclRedOpTypeEnum, ncclUniqueId)
-from vllm.distributed.parallel_state import get_cpu_world_group, get_local_rank
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -19,8 +18,8 @@ class PyNcclCommunicator:
 
     def __init__(
         self,
-        group: Optional[ProcessGroup] = None,
-        device: Optional[Union[int, str, torch.device]] = None,
+        group: ProcessGroup,
+        device: Union[int, str, torch.device],
         library_path: Optional[str] = None,
     ):
         """
@@ -35,7 +34,6 @@ class PyNcclCommunicator:
         is bind to a unique device.
         """
         assert dist.is_initialized()
-        group = get_cpu_world_group() if group is None else group
         assert dist.get_backend(group) != dist.Backend.NCCL, (
             "PyNcclCommunicator should be attached to a non-NCCL group.")
         self.group = group
@@ -77,10 +75,7 @@ class PyNcclCommunicator:
         byte_list = tensor.tolist()
         for i, byte in enumerate(byte_list):
             self.unique_id.internal[i] = byte
-        if device is None:
-            local_rank = get_local_rank()
-            device = torch.device(f"cuda:{local_rank}")
-        elif isinstance(device, int):
+        if isinstance(device, int):
             device = torch.device(f"cuda:{device}")
         elif isinstance(device, str):
             device = torch.device(device)
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index b6d1eeff0..f6a2fc9b0 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -2,81 +2,518 @@
 # Adapted from
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-"""Tensor and pipeline parallel groups."""
+"""vLLM distributed state.
+It takes over the control of the distributed environment from PyTorch.
+The typical workflow is:
+
+- call `init_distributed_environment` to initialize the distributed environment.
+- call `initialize_model_parallel` or `ensure_model_parallel_initialized` to 
+ initialize the model parallel groups.
+
+- any code dealing with the distributed stuff
+
+- call `destroy_model_parallel` to destroy the model parallel groups.
+- call `destroy_distributed_environment` to destroy the distributed environment.
+
+If you only need to use the distributed environment without model/pipeline
+ parallelism, you can skip the model parallel initialization and destruction
+ steps.
+"""
 import contextlib
+from collections import namedtuple
+from contextlib import contextmanager, nullcontext
+from dataclasses import dataclass
 from multiprocessing import resource_tracker, shared_memory
-from typing import List, Optional
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
-from torch.distributed import ProcessGroup
+from torch.distributed import Backend, ProcessGroup
 
 import vllm.envs as envs
 from vllm.logger import init_logger
 
-logger = init_logger(__name__)
 
-_ENABLE_CUSTOM_ALL_REDUCE = True
+@dataclass
+class GraphCaptureContext:
+    stream: torch.cuda.Stream
 
-# Tensor model parallel group that the current rank belongs to.
-_TP_DEVICE_GROUP: Optional[ProcessGroup] = None
-_TP_CPU_GROUP: Optional[ProcessGroup] = None
-_TP_PYNCCL_COMMUNICATOR = None
-_TP_CA_COMMUNICATOR = None
-# Pipeline model parallel group that the current rank belongs to.
-_PP_DEVICE_GROUP: Optional[ProcessGroup] = None
-_PP_CPU_GROUP: Optional[ProcessGroup] = None
-_PP_PYNCCL_COMMUNICATOR = None
-
-# when people blindly call `torch.distributed.all_reduce` etc,
-# it will use this group. It is initialized with the `backend`
-# parameter of `init_distributed_environment` below.
-# Essentially, this is `torch.distributed.group.WORLD`.
-# We leave a line here to note that this is device-specific.
-# Note that this variable is not safe to use, because when users
-# call `init_distributed_environment` first, and then destroy
-# the process group themselves, this variable will keep a reference to the
-# destroyed process group, which is not useful.
-_DEVICE_WORLD_GROUP = None
-
-# duing `init_distributed_environment`, we will also initialize a
-# group with `gloo` backend, to allow direct coordination between
-# processes through the CPU.
-_CPU_WORLD_GROUP = None
-
-# In summary, after calling `init_distributed_environment`, we will
-# always have two groups: one for device-specific (and is the default)
-# and one for CPU. All processes will be part of both groups.
-
-# A list of global ranks for each pipeline group to ease calculation of the
-# source rank when broadcasting from the first or last pipeline stage.
-_PP_GLOBAL_RANKS: Optional[List[int]] = None
-
-_LOCAL_RANK = -1
 
+TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])
 
-def set_custom_all_reduce(enable: bool):
-    global _ENABLE_CUSTOM_ALL_REDUCE
-    _ENABLE_CUSTOM_ALL_REDUCE = enable
 
+def _split_tensor_dict(
+    tensor_dict: Dict[Any, Union[torch.Tensor, Any]]
+) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]:
+    """Split the tensor dictionary into two parts:
+    1. A list of (key, value) pairs. If the value is a tensor, it is replaced
+         by its metadata.
+    2. A list of tensors.
+    """
+    metadata_list = []
+    tensor_list = []
+    for key, value in tensor_dict.items():
+        if isinstance(value, torch.Tensor):
+            # Note: we cannot use `value.device` here,
+            # because it contains not only the device type but also the device
+            # index (e.g. "cuda:0"). We only need the device type.
+            # receiving side will set the device index.
+            device = "cpu" if value.is_cpu else "cuda"
+            metadata_list.append(
+                (key, TensorMetadata(device, value.dtype, value.size())))
+            tensor_list.append(value)
+        else:
+            metadata_list.append((key, value))
+    return metadata_list, tensor_list
 
-def get_pp_pynccl_communicator():
-    global _PP_PYNCCL_COMMUNICATOR
-    return _PP_PYNCCL_COMMUNICATOR
 
+class GroupCoordinator:
+    """
+    PyTorch ProcessGroup wrapper for a group of processes.
+    PyTorch ProcessGroup is bound to one specific communication backend,
+        e.g. NCCL, Gloo, MPI, etc.
+    GroupCoordinator takes charge of all the communication operations among
+        the processes in the group. It can route the communication to
+        a specific implementation (e.g. switch allreduce implementation
+        based on the tensor size and cuda graph mode).
+    """
 
-def get_tp_pynccl_communicator():
-    global _TP_PYNCCL_COMMUNICATOR
-    return _TP_PYNCCL_COMMUNICATOR
+    # available attributes:
+    rank: int  # global rank
+    ranks: List[int]  # global ranks in the group
+    world_size: int  # size of the group
+    # difference between `local_rank` and `rank_in_group`:
+    # if we have a group of size 4 across two nodes:
+    # Process | Node | Rank | Local Rank | Rank in Group
+    #   0     |   0  |  0   |     0      |       0
+    #   1     |   0  |  1   |     1      |       1
+    #   2     |   1  |  2   |     0      |       2
+    #   3     |   1  |  3   |     1      |       3
+    local_rank: int  # local rank used to assign devices
+    rank_in_group: int  # rank inside the group
+    cpu_group: ProcessGroup  # group for CPU communication
+    device_group: ProcessGroup  # group for device communication
+    use_pynccl: bool  # a hint of whether to use PyNccl
+    use_custom_allreduce: bool  # a hint of whether to use CustomAllreduce
+    # communicators are only created for world size > 1
+    pynccl_comm: Optional[Any]  # PyNccl communicator
+    ca_comm: Optional[Any]  # Custom allreduce communicator
+
+    def __init__(
+        self,
+        group_ranks: List[List[int]],
+        local_rank: int,
+        torch_distributed_backend: Union[str, Backend],
+        use_pynccl: bool,
+        use_custom_allreduce: bool,
+    ):
+
+        self.rank = torch.distributed.get_rank()
+        self.local_rank = local_rank
+        self.device_group = None
+        self.cpu_group = None
+
+        for ranks in group_ranks:
+            device_group = torch.distributed.new_group(
+                ranks, backend=torch_distributed_backend)
+            # a group with `gloo` backend, to allow direct coordination between
+            # processes through the CPU.
+            cpu_group = torch.distributed.new_group(ranks, backend="gloo")
+            if self.rank in ranks:
+                self.ranks = ranks
+                self.world_size = len(ranks)
+                self.rank_in_group = ranks.index(self.rank)
+                self.device_group = device_group
+                self.cpu_group = cpu_group
+
+        assert self.cpu_group is not None
+        assert self.device_group is not None
 
+        if torch.cuda.is_available():
+            self.device = torch.device(f"cuda:{local_rank}")
+        else:
+            self.device = torch.device("cpu")
 
-def get_tp_ca_communicator():
-    global _TP_CA_COMMUNICATOR
-    return _TP_CA_COMMUNICATOR
+        self.use_pynccl = use_pynccl
+        self.use_custom_allreduce = use_custom_allreduce
+
+        # lazy import to avoid documentation build error
+        from vllm.distributed.device_communicators.custom_all_reduce import (
+            CustomAllreduce)
+        from vllm.distributed.device_communicators.pynccl import (
+            PyNcclCommunicator)
+
+        self.pynccl_comm: Optional[PyNcclCommunicator]
+        if use_pynccl and self.world_size > 1:
+            self.pynccl_comm = PyNcclCommunicator(
+                group=self.cpu_group,
+                device=self.device,
+            )
+        else:
+            self.pynccl_comm = None
+
+        self.ca_comm: Optional[CustomAllreduce]
+        if use_custom_allreduce and self.world_size > 1:
+            # Initialize a custom fast all-reduce implementation.
+            self.ca_comm = CustomAllreduce(
+                group=self.cpu_group,
+                device=self.device,
+            )
+        else:
+            self.ca_comm = None
+
+    @property
+    def first_rank(self):
+        """Return the global rank of the first process in the group"""
+        return self.ranks[0]
+
+    @property
+    def last_rank(self):
+        """Return the global rank of the last process in the group"""
+        return self.ranks[-1]
+
+    @property
+    def next_rank(self):
+        """Return the global rank of the process that follows the caller"""
+        rank_in_group = self.rank_in_group
+        world_size = self.world_size
+        return self.ranks[(rank_in_group + 1) % world_size]
+
+    @property
+    def prev_rank(self):
+        """Return the global rank of the process that precedes the caller"""
+        rank_in_group = self.rank_in_group
+        world_size = self.world_size
+        return self.ranks[(rank_in_group - 1) % world_size]
+
+    @contextmanager
+    def graph_capture(
+            self, graph_capture_context: Optional[GraphCaptureContext] = None):
+        if graph_capture_context is None:
+            stream = torch.cuda.Stream()
+            graph_capture_context = GraphCaptureContext(stream)
+        else:
+            stream = graph_capture_context.stream
+
+        ca_comm = self.ca_comm
+        maybe_ca_context = nullcontext(
+        ) if ca_comm is None else ca_comm.capture()
+        with torch.cuda.stream(stream), maybe_ca_context:
+            # In graph mode, we have to be very careful about the collective
+            # operations. The current status is:
+            #     allreduce \ Mode   |  Eager  |  Graph  |
+            # --------------------------------------------
+            # custom allreduce       | enabled | enabled |
+            # PyNccl                 | disabled| enabled |
+            # torch.distributed      | enabled | disabled|
+            #
+            # Note that custom allreduce will have a runtime check, if the
+            #  tensor size is too large, it will fallback to the next
+            #  available option.
+            # In summary: When using CUDA graph, we use
+            #  either custom all-reduce kernel or pynccl. When not using
+            #  CUDA graph, we use either custom all-reduce kernel or
+            #  PyTorch NCCL. We always prioritize using custom all-reduce
+            #  kernel but fall back to PyTorch or pynccl if it is
+            #  disabled or not supported.
+            pynccl_comm = self.pynccl_comm
+            maybe_pynccl_context: Any
+            if not pynccl_comm:
+                maybe_pynccl_context = nullcontext()
+            else:
+                maybe_pynccl_context = pynccl_comm.change_state(
+                    enable=True, stream=torch.cuda.current_stream())
+            with maybe_pynccl_context:
+                yield graph_capture_context
+
+    def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
+        """
+        NOTE: This operation will be applied in-place or out-of-place. 
+        Always assume this function modifies its input, but use the return
+        value as the output.
+        """
+        ca_comm = self.ca_comm
+
+        # Bypass the function if we are using only 1 GPU.
+        if self.world_size == 1:
+            return input_
+        if ca_comm is not None:
+            out = ca_comm.custom_all_reduce(input_)
+            if out is not None:
+                return out
+        pynccl_comm = self.pynccl_comm
+        if (pynccl_comm is not None and not pynccl_comm.disabled):
+            pynccl_comm.all_reduce(input_)
+        else:
+            torch.distributed.all_reduce(input_, group=self.device_group)
+        return input_
+
+    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
+        world_size = self.world_size
+        # Bypass the function if we are using only 1 GPU.
+        if world_size == 1:
+            return input_
+        assert -input_.dim() <= dim < input_.dim(), (
+            f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+        input_size = input_.size()
+        # Allocate output tensor.
+        output_tensor = torch.empty((world_size, ) + input_size,
+                                    dtype=input_.dtype,
+                                    device=input_.device)
+        # All-gather.
+        torch.distributed.all_gather_into_tensor(output_tensor,
+                                                 input_,
+                                                 group=self.device_group)
+        # Reshape
+        output_tensor = output_tensor.movedim(0, dim)
+        output_tensor = output_tensor.reshape(input_size[:dim] +
+                                              (world_size *
+                                               input_size[dim], ) +
+                                              input_size[dim + 1:])
+        return output_tensor
+
+    def gather(self,
+               input_: torch.Tensor,
+               dst: int = 0,
+               dim: int = -1) -> torch.Tensor:
+        """
+        NOTE: We assume that the input tensor is on the same device across
+        all the ranks.
+        NOTE: `dst` is the local rank of the destination rank.
+        """
+        world_size = self.world_size
+        # Bypass the function if we are using only 1 GPU.
+        if world_size == 1:
+            return input_
+        assert -input_.dim() <= dim < input_.dim(), (
+            f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+        # Allocate output tensor.
+        if self.rank_in_group == dst:
+            gather_list = [torch.empty_like(input_) for _ in range(world_size)]
+        else:
+            gather_list = None
+        # Gather.
+        torch.distributed.gather(input_,
+                                 gather_list,
+                                 dst=self.ranks[dst],
+                                 group=self.device_group)
+        if self.rank_in_group == dst:
+            output_tensor = torch.cat(gather_list, dim=dim)
+        else:
+            output_tensor = None
+        return output_tensor
+
+    def broadcast(self, input_: torch.Tensor, src: int = 0):
+        """Broadcast the input tensor.
+        NOTE: `src` is the local rank of the source rank.
+        """
+        assert src < self.world_size, f"Invalid src rank ({src})"
+
+        # Bypass the function if we are using only 1 GPU.
+        if self.world_size == 1:
+            return input_
+        # Broadcast.
+        torch.distributed.broadcast(input_,
+                                    src=self.ranks[src],
+                                    group=self.device_group)
+        return input_
+
+    def broadcast_object_list(self,
+                              obj_list: List[Any],
+                              src: int = 0,
+                              group: Optional[ProcessGroup] = None):
+        """Broadcast the input object list.
+        NOTE: `src` is the local rank of the source rank.
+        """
+        assert src < self.world_size, f"Invalid src rank ({src})"
+
+        # Bypass the function if we are using only 1 GPU.
+        if self.world_size == 1:
+            return obj_list
+        # Broadcast.
+        torch.distributed.broadcast_object_list(obj_list,
+                                                src=self.ranks[src],
+                                                group=self.device_group)
+        return obj_list
+
+    def broadcast_tensor_dict(
+        self,
+        tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None,
+        src: int = 0,
+        group: Optional[ProcessGroup] = None,
+        metadata_group: Optional[ProcessGroup] = None
+    ) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]:
+        """Broadcast the input tensor dictionary.
+        NOTE: `src` is the local rank of the source rank.
+        """
+        # Bypass the function if we are using only 1 GPU.
+        if (not torch.distributed.is_initialized() or self.world_size == 1):
+            return tensor_dict
+
+        group = self.device_group
+        metadata_group = self.cpu_group
+        assert src < self.world_size, f"Invalid src rank ({src})"
+        src = self.ranks[src]
+
+        rank = self.rank
+        if rank == src:
+            metadata_list: List[Tuple[Any, Any]] = []
+            assert isinstance(
+                tensor_dict,
+                dict), (f"Expecting a dictionary, got {type(tensor_dict)}")
+            metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
+            # `metadata_list` lives in CPU memory.
+            # `broadcast_object_list` has serialization & deserialization,
+            # all happening on CPU. Therefore, we can use the CPU group.
+            torch.distributed.broadcast_object_list([metadata_list],
+                                                    src=src,
+                                                    group=metadata_group)
+            async_handles = []
+            for tensor in tensor_list:
+                if tensor.numel() == 0:
+                    # Skip broadcasting empty tensors.
+                    continue
+                if tensor.is_cpu:
+                    # use metadata_group for CPU tensors
+                    handle = torch.distributed.broadcast(tensor,
+                                                         src=src,
+                                                         group=metadata_group,
+                                                         async_op=True)
+                else:
+                    # use group for GPU tensors
+                    handle = torch.distributed.broadcast(tensor,
+                                                         src=src,
+                                                         group=group,
+                                                         async_op=True)
+                async_handles.append(handle)
+            for async_handle in async_handles:
+                async_handle.wait()
+
+        else:
+            recv_metadata_list = [None]
+            torch.distributed.broadcast_object_list(recv_metadata_list,
+                                                    src=src,
+                                                    group=metadata_group)
+            assert recv_metadata_list[0] is not None
+            tensor_dict = {}
+            async_handles = []
+            for key, value in recv_metadata_list[0]:
+                if isinstance(value, TensorMetadata):
+                    tensor = torch.empty(value.size,
+                                         dtype=value.dtype,
+                                         device=value.device)
+                    if tensor.numel() == 0:
+                        # Skip broadcasting empty tensors.
+                        tensor_dict[key] = tensor
+                        continue
+                    if tensor.is_cpu:
+                        # use metadata_group for CPU tensors
+                        handle = torch.distributed.broadcast(
+                            tensor,
+                            src=src,
+                            group=metadata_group,
+                            async_op=True)
+                    else:
+                        # use group for GPU tensors
+                        handle = torch.distributed.broadcast(tensor,
+                                                             src=src,
+                                                             group=group,
+                                                             async_op=True)
+                    async_handles.append(handle)
+                    tensor_dict[key] = tensor
+                else:
+                    tensor_dict[key] = value
+            for async_handle in async_handles:
+                async_handle.wait()
+        return tensor_dict
+
+    def barrier(self):
+        """Barrier synchronization among the group.
+        NOTE: don't use `device_group` here! `barrier` in NCCL is
+        terrible because it is internally a broadcast operation with
+        secretly created GPU tensors. It is easy to mess up the current
+        device. Use the CPU group instead.
+        """
+        torch.distributed.barrier(group=self.cpu_group)
+
+    def destroy(self):
+        if self.device_group is not None:
+            torch.distributed.destroy_process_group(self.device_group)
+            self.device_group = None
+        if self.cpu_group is not None:
+            torch.distributed.destroy_process_group(self.cpu_group)
+            self.cpu_group = None
+        if self.pynccl_comm is not None:
+            self.pynccl_comm = None
+        if self.ca_comm is not None:
+            self.ca_comm = None
+
+
+_WORLD: Optional[GroupCoordinator] = None
+
+
+def get_world_group() -> GroupCoordinator:
+    assert _WORLD is not None, ("world group is not initialized")
+    return _WORLD
+
+
+_TP: Optional[GroupCoordinator] = None
+
+
+def get_tp_group() -> GroupCoordinator:
+    assert _TP is not None, ("tensor model parallel group is not initialized")
+    return _TP
+
+
+# kept for backward compatibility
+get_tensor_model_parallel_group = get_tp_group
+
+_PP: Optional[GroupCoordinator] = None
+
+
+def get_pp_group() -> GroupCoordinator:
+    assert _PP is not None, (
+        "pipeline model parallel group is not initialized")
+    return _PP
 
 
-def get_local_rank():
-    global _LOCAL_RANK
-    return _LOCAL_RANK
+# kept for backward compatibility
+get_pipeline_model_parallel_group = get_pp_group
+
+
+@contextmanager
+def graph_capture():
+    """
+    `graph_capture` is a context manager which should surround the code that
+    is capturing the CUDA graph. Its main purpose is to ensure that the
+    some operations will be run after the graph is captured, before the graph
+    is replayed. It returns a `GraphCaptureContext` object which contains the
+    necessary data for the graph capture. Currently, it only contains the
+    stream that the graph capture is running on. This stream is set to the
+    current CUDA stream when the context manager is entered and reset to the
+    default stream when the context manager is exited. This is to ensure that
+    the graph capture is running on a separate stream from the default stream,
+    in order to explicitly distinguish the kernels to capture
+    from other kernels possibly launched on background in the default stream.
+    """
+    with get_tp_group().graph_capture() as context, get_pp_group(
+    ).graph_capture(context):
+        yield context
+
+
+logger = init_logger(__name__)
+
+_ENABLE_CUSTOM_ALL_REDUCE = True
+
+
+def set_custom_all_reduce(enable: bool):
+    global _ENABLE_CUSTOM_ALL_REDUCE
+    _ENABLE_CUSTOM_ALL_REDUCE = enable
 
 
 def init_distributed_environment(
@@ -100,31 +537,29 @@ def init_distributed_environment(
             init_method=distributed_init_method,
             world_size=world_size,
             rank=rank)
-        global _DEVICE_WORLD_GROUP, _CPU_WORLD_GROUP
-        _DEVICE_WORLD_GROUP = torch.distributed.group.WORLD
+    # set the local rank
+    # local_rank is not available in torch ProcessGroup,
+    # see https://github.com/pytorch/pytorch/issues/122816
+    if local_rank == -1:
+        # local rank not set, this usually happens in single-node
+        # setting, where we can use rank as local rank
+        if distributed_init_method == "env://":
+            local_rank = envs.LOCAL_RANK
+        else:
+            local_rank = rank
+    global _WORLD
+    if _WORLD is None:
         ranks = list(range(torch.distributed.get_world_size()))
-        _CPU_WORLD_GROUP = torch.distributed.new_group(ranks=ranks,
-                                                       backend="gloo")
-        # set the local rank
-        # local_rank is not available in torch ProcessGroup,
-        # see https://github.com/pytorch/pytorch/issues/122816
-        if local_rank == -1:
-            # local rank not set, this usually happens in single-node
-            # setting, where we can use rank as local rank
-            if distributed_init_method == "env://":
-                local_rank = envs.LOCAL_RANK
-            else:
-                local_rank = rank
-        global _LOCAL_RANK
-        _LOCAL_RANK = local_rank
-        # A small all_reduce for warmup.
-        data = torch.zeros(1)
-        if torch.cuda.is_available():
-            data = data.to(device=f"cuda:{local_rank}")
-        torch.distributed.all_reduce(data)
-        if torch.cuda.is_available():
-            torch.cuda.synchronize()
-        del data
+        _WORLD = GroupCoordinator(
+            group_ranks=[ranks],
+            local_rank=local_rank,
+            torch_distributed_backend=backend,
+            use_pynccl=False,
+            use_custom_allreduce=False,
+        )
+    else:
+        assert _WORLD.world_size == torch.distributed.get_world_size(), (
+            "world group already initialized with a different world size")
 
 
 def initialize_model_parallel(
@@ -157,8 +592,8 @@ def initialize_model_parallel(
     # Get world size and rank. Ensure some consistencies.
     assert torch.distributed.is_initialized()
     world_size: int = torch.distributed.get_world_size()
-    # get the backend of _DEVICE_WORLD_GROUP
-    backend = backend or torch.distributed.get_backend()
+    backend = backend or torch.distributed.get_backend(
+        get_world_group().device_group)
 
     if (world_size !=
             tensor_model_parallel_size * pipeline_model_parallel_size):
@@ -167,63 +602,42 @@ def initialize_model_parallel(
             f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
             f"pipeline_model_parallel_size ({pipeline_model_parallel_size})")
 
+    # Build the tensor model-parallel groups.
     num_tensor_model_parallel_groups: int = (world_size //
                                              tensor_model_parallel_size)
-    num_pipeline_model_parallel_groups: int = (world_size //
-                                               pipeline_model_parallel_size)
-    rank = torch.distributed.get_rank()
-
-    # Build the tensor model-parallel groups.
-    global _TP_DEVICE_GROUP, _TP_CPU_GROUP
-    global _TP_PYNCCL_COMMUNICATOR, _TP_CA_COMMUNICATOR
-    assert _TP_DEVICE_GROUP is None, (
-        "tensor model parallel group is already initialized")
+    global _TP
+    assert _TP is None, ("tensor model parallel group is already initialized")
+    group_ranks = []
     for i in range(num_tensor_model_parallel_groups):
         ranks = list(
             range(i * tensor_model_parallel_size,
                   (i + 1) * tensor_model_parallel_size))
-        group = torch.distributed.new_group(ranks, backend=backend)
-        cpu_group = torch.distributed.new_group(ranks, backend="gloo")
-        if rank in ranks:
-            _TP_DEVICE_GROUP = group
-            _TP_CPU_GROUP = cpu_group
-
-    from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
-    if tensor_model_parallel_size > 1:
-        _TP_PYNCCL_COMMUNICATOR = PyNcclCommunicator(
-            group=_TP_CPU_GROUP,
-            device=_LOCAL_RANK,
-        )
-
-    # Initialize a custom fast all-reduce implementation.
-    if _ENABLE_CUSTOM_ALL_REDUCE:
-        from vllm.distributed.device_communicators.custom_all_reduce import (
-            CustomAllreduce)
-        _TP_CA_COMMUNICATOR = CustomAllreduce(
-            group=_TP_CPU_GROUP,
-            device=_LOCAL_RANK,
-        )
+        group_ranks.append(ranks)
+    _TP = GroupCoordinator(
+        group_ranks=group_ranks,
+        local_rank=get_world_group().local_rank,
+        torch_distributed_backend=backend,
+        use_pynccl=True,
+        use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE,
+    )
 
     # Build the pipeline model-parallel groups.
-    global _PP_DEVICE_GROUP, _PP_CPU_GROUP
-    global _PP_PYNCCL_COMMUNICATOR
-    global _PP_GLOBAL_RANKS
-    assert _PP_DEVICE_GROUP is None, (
+    num_pipeline_model_parallel_groups: int = (world_size //
+                                               pipeline_model_parallel_size)
+    global _PP
+    assert _PP is None, (
         "pipeline model parallel group is already initialized")
+    group_ranks = []
     for i in range(num_pipeline_model_parallel_groups):
         ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
-        group = torch.distributed.new_group(ranks, backend=backend)
-        cpu_group = torch.distributed.new_group(ranks, backend="gloo")
-        if rank in ranks:
-            _PP_DEVICE_GROUP = group
-            _PP_CPU_GROUP = cpu_group
-            _PP_GLOBAL_RANKS = ranks
-
-    if pipeline_model_parallel_size > 1:
-        _PP_PYNCCL_COMMUNICATOR = PyNcclCommunicator(
-            group=_PP_CPU_GROUP,
-            device=_LOCAL_RANK,
-        )
+        group_ranks.append(ranks)
+    _PP = GroupCoordinator(
+        group_ranks=group_ranks,
+        local_rank=get_world_group().local_rank,
+        torch_distributed_backend=backend,
+        use_pynccl=True,
+        use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE,
+    )
 
 
 def ensure_model_parallel_initialized(
@@ -235,8 +649,8 @@ def ensure_model_parallel_initialized(
     or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
     values if the model parallel groups are initialized.
     """
-    # get the backend of _DEVICE_WORLD_GROUP
-    backend = backend or torch.distributed.get_backend()
+    backend = backend or torch.distributed.get_backend(
+        get_world_group().device_group)
     if not model_parallel_is_initialized():
         initialize_model_parallel(tensor_model_parallel_size,
                                   pipeline_model_parallel_size, backend)
@@ -247,137 +661,48 @@ def ensure_model_parallel_initialized(
     ), ("tensor parallel group already initialized, but of unexpected size: "
         f"{get_tensor_model_parallel_world_size()=} vs. "
         f"{tensor_model_parallel_size=}")
-    assert (get_pipeline_model_parallel_world_size(
-    ) == pipeline_model_parallel_size), (
+    pp_world_size = get_pp_group().world_size
+    assert (pp_world_size == pipeline_model_parallel_size), (
         "pipeline parallel group already initialized, but of unexpected size: "
-        f"{get_pipeline_model_parallel_world_size()=} vs. "
+        f"{pp_world_size=} vs. "
         f"{pipeline_model_parallel_size=}")
 
 
 def model_parallel_is_initialized():
     """Check if tensor and pipeline parallel groups are initialized."""
-    return (_TP_DEVICE_GROUP is not None and _PP_DEVICE_GROUP is not None)
-
-
-def get_cpu_world_group():
-    """Get the CPU world group."""
-    assert _CPU_WORLD_GROUP is not None, ("CPU world group is not initialized")
-    return _CPU_WORLD_GROUP
-
-
-def get_tensor_model_parallel_group():
-    """Get the tensor model parallel group the caller rank belongs to."""
-    assert _TP_DEVICE_GROUP is not None, (
-        "tensor model parallel group is not initialized")
-    return _TP_DEVICE_GROUP
-
-
-def get_tensor_model_parallel_cpu_group():
-    """Get the tensor model parallel cpu group the caller rank belongs to."""
-    assert _TP_CPU_GROUP is not None, (
-        "tensor model parallel cpu group is not initialized")
-    return _TP_CPU_GROUP
-
-
-def get_pipeline_model_parallel_group():
-    """Get the pipeline model parallel group the caller rank belongs to."""
-    assert _PP_DEVICE_GROUP is not None, (
-        "pipeline model parallel group is not initialized")
-    return _PP_DEVICE_GROUP
-
-
-def get_pipeline_model_parallel_cpu_group():
-    """Get the pipeline model parallel cpu group the caller rank belongs to."""
-    assert _PP_CPU_GROUP is not None, (
-        "pipeline model parallel cpu group is not initialized")
-    return _PP_CPU_GROUP
+    return (_TP is not None and _PP is not None)
 
 
 def get_tensor_model_parallel_world_size():
     """Return world size for the tensor model parallel group."""
-    return torch.distributed.get_world_size(
-        group=get_tensor_model_parallel_group())
-
-
-def get_pipeline_model_parallel_world_size():
-    """Return world size for the pipeline model parallel group."""
-    return torch.distributed.get_world_size(
-        group=get_pipeline_model_parallel_group())
+    return get_tp_group().world_size
 
 
 def get_tensor_model_parallel_rank():
     """Return my rank for the tensor model parallel group."""
-    return torch.distributed.get_rank(group=get_tensor_model_parallel_group())
-
-
-def get_pipeline_model_parallel_rank():
-    """Return my rank for the pipeline model parallel group."""
-    return torch.distributed.get_rank(
-        group=get_pipeline_model_parallel_group())
-
-
-def get_tensor_model_parallel_src_rank():
-    """Calculate the global rank corresponding to the first local rank
-    in the tensor model parallel group."""
-    global_rank = torch.distributed.get_rank()
-    local_world_size = get_tensor_model_parallel_world_size()
-    return (global_rank // local_world_size) * local_world_size
-
-
-def get_pipeline_model_parallel_first_rank():
-    """Return the global rank of the first process in the pipeline for the
-    current tensor parallel group"""
-    assert _PP_GLOBAL_RANKS is not None, (
-        "Pipeline parallel group is not initialized")
-    return _PP_GLOBAL_RANKS[0]
-
-
-def get_pipeline_model_parallel_last_rank():
-    """Return the global rank of the last process in the pipeline for the
-    current tensor parallel group"""
-    assert _PP_GLOBAL_RANKS is not None, (
-        "Pipeline parallel group is not initialized")
-    last_rank_local = get_pipeline_model_parallel_world_size() - 1
-    return _PP_GLOBAL_RANKS[last_rank_local]
-
-
-def get_pipeline_model_parallel_next_rank():
-    """Return the global rank that follows the caller in the pipeline"""
-    assert _PP_GLOBAL_RANKS is not None, (
-        "Pipeline parallel group is not initialized")
-    rank_in_pipeline = get_pipeline_model_parallel_rank()
-    world_size = get_pipeline_model_parallel_world_size()
-    return _PP_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size]
-
-
-def get_pipeline_model_parallel_prev_rank():
-    """Return the global rank that precedes the caller in the pipeline"""
-    assert _PP_GLOBAL_RANKS is not None, (
-        "Pipeline parallel group is not initialized")
-    rank_in_pipeline = get_pipeline_model_parallel_rank()
-    world_size = get_pipeline_model_parallel_world_size()
-    return _PP_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size]
+    return get_tp_group().rank_in_group
 
 
 def destroy_model_parallel():
     """Set the groups to none and destroy them."""
-    global _TP_DEVICE_GROUP
-    if _TP_DEVICE_GROUP:
-        torch.distributed.destroy_process_group(_TP_DEVICE_GROUP)
-    _TP_DEVICE_GROUP = None
-    global _TP_CPU_GROUP
-    if _TP_CPU_GROUP:
-        torch.distributed.destroy_process_group(_TP_CPU_GROUP)
-    _TP_CPU_GROUP = None
-    global _TP_PYNCCL_COMMUNICATOR
-    _TP_PYNCCL_COMMUNICATOR = None
-
-    global _PP_DEVICE_GROUP
-    if _PP_DEVICE_GROUP:
-        torch.distributed.destroy_process_group(_PP_DEVICE_GROUP)
-    _PP_DEVICE_GROUP = None
-    global _PP_GLOBAL_RANKS
-    _PP_GLOBAL_RANKS = None
+    global _TP
+    if _TP:
+        _TP.destroy()
+    _TP = None
+
+    global _PP
+    if _PP:
+        _PP.destroy()
+    _PP = None
+
+
+def destroy_distributed_environment():
+    global _WORLD
+    if _WORLD:
+        _WORLD.destroy()
+    _WORLD = None
+    if torch.distributed.is_initialized():
+        torch.distributed.destroy_process_group()
 
 
 def is_in_the_same_node(pg: ProcessGroup):
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index de616ef1d..476e9ba3b 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -13,7 +13,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, ParallelConfig, SchedulerConfig,
                          VisionLanguageConfig)
 from vllm.distributed import broadcast_tensor_dict
-from vllm.distributed.communication_op import graph_capture
+from vllm.distributed.parallel_state import graph_capture
 from vllm.logger import init_logger
 from vllm.lora.layers import LoRAMapping
 from vllm.lora.request import LoRARequest
-- 
GitLab


From 916d219d62e9e4005e10be23f81d881afdb8d6d0 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Wed, 12 Jun 2024 17:58:12 -0700
Subject: [PATCH 016/376] [ci] Use sccache to build images (#5419)

Signed-off-by: kevin <kevin@anyscale.com>
---
 .buildkite/test-template-aws.j2 |  2 +-
 Dockerfile                      | 22 ++++++++++++++++++++--
 setup.py                        |  2 +-
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2
index 645747ddd..09649b625 100644
--- a/.buildkite/test-template-aws.j2
+++ b/.buildkite/test-template-aws.j2
@@ -7,7 +7,7 @@ steps:
       queue: cpu_queue
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
+      - "docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ."
       - "docker push {{ docker_image }}"
     env:
       DOCKER_BUILDKIT: "1"
diff --git a/Dockerfile b/Dockerfile
index eb96bf3c1..62c401069 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,7 +10,7 @@
 FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev
 
 RUN apt-get update -y \
-    && apt-get install -y python3-pip git
+    && apt-get install -y python3-pip git curl sudo
 
 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@@ -70,10 +70,28 @@ ENV NVCC_THREADS=$nvcc_threads
 # make sure punica kernels are built (for LoRA)
 ENV VLLM_INSTALL_PUNICA_KERNELS=1
 
+ARG USE_SCCACHE
+# if USE_SCCACHE is set, use sccache to speed up compilation
+RUN --mount=type=cache,target=/root/.cache/pip \
+    if [ "$USE_SCCACHE" = "1" ]; then \
+        echo "Installing sccache..." \
+        && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
+        && tar -xzf sccache.tar.gz \
+        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
+        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
+        && export SCCACHE_BUCKET=vllm-build-sccache \
+        && export SCCACHE_REGION=us-west-2 \
+        && sccache --show-stats \
+        && python3 setup.py bdist_wheel --dist-dir=dist \
+        && sccache --show-stats; \
+    fi
+
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
-    python3 setup.py bdist_wheel --dist-dir=dist
+    if [ "$USE_SCCACHE" != "1" ]; then \
+        python3 setup.py bdist_wheel --dist-dir=dist; \
+    fi
 
 # check the size of the wheel, we cannot upload wheels larger than 100MB
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
diff --git a/setup.py b/setup.py
index 12e5c3456..3a41b1a0b 100644
--- a/setup.py
+++ b/setup.py
@@ -140,6 +140,7 @@ class cmake_build_ext(build_ext):
             cmake_args += [
                 '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
                 '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
+                '-DCMAKE_C_COMPILER_LAUNCHER=sccache',
             ]
         elif is_ccache_available():
             cmake_args += [
@@ -171,7 +172,6 @@ class cmake_build_ext(build_ext):
         else:
             # Default build tool to whatever cmake picks.
             build_tool = []
-
         subprocess.check_call(
             ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args],
             cwd=self.build_temp)
-- 
GitLab


From 88407532e7ec2dd3313f6cb3a31d8dd1fa868178 Mon Sep 17 00:00:00 2001
From: "Wang, Yi" <yi.a.wang@intel.com>
Date: Thu, 13 Jun 2024 11:16:41 +0800
Subject: [PATCH 017/376] =?UTF-8?q?[Bugfix]if=20the=20content=20is=20start?=
 =?UTF-8?q?ed=20with=20":"(response=20of=20ping),=20client=20should=20i?=
 =?UTF-8?q?=E2=80=A6=20(#5303)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
---
 benchmarks/backend_request_func.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 58dcc6167..52386b8cd 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -68,9 +68,13 @@ async def async_request_tgi(
                         chunk_bytes = chunk_bytes.strip()
                         if not chunk_bytes:
                             continue
+                        chunk_bytes = chunk_bytes.decode("utf-8")
 
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
-                                              "data:")
+                        #NOTE: Sometimes TGI returns a ping response without
+                        # any data, we should skip it.
+                        if chunk_bytes.startswith(":"):
+                            continue
+                        chunk = remove_prefix(chunk_bytes, "data:")
 
                         data = json.loads(chunk)
                         timestamp = time.perf_counter()
-- 
GitLab


From c2637a613b6140dc16fecd5a1b0f5a9e1d0932ff Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Thu, 13 Jun 2024 10:19:56 -0400
Subject: [PATCH 018/376] [Kernel] `w4a16` support for `compressed-tensors`
 (#5385)

Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
---
 tests/quantization/test_compressed_tensors.py |  27 ++-
 .../compressed_tensors/compressed_tensors.py  |  44 ++++-
 .../compressed_tensors/schemes/__init__.py    |   1 +
 .../schemes/compressed_tensors_w4a16.py       | 168 ++++++++++++++++++
 4 files changed, 230 insertions(+), 10 deletions(-)
 create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py

diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index e6d8218b4..5670498f2 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -3,12 +3,13 @@
 Run `pytest tests/quantization/test_compressed_tensors.py`.
 """
 
+import pytest
 import torch
 
 from vllm import SamplingParams
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
-    CompressedTensorsLinearMethod, CompressedTensorsW8A8DynamicToken,
-    CompressedTensorsW8A8StaticTensor)
+    CompressedTensorsLinearMethod, CompressedTensorsW4A16,
+    CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor)
 
 
 def test_compressed_tensors_w8a8_static_setup(vllm_runner):
@@ -60,3 +61,25 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
         assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
         assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken)
         assert qkv_proj.weight.dtype is torch.int8
+
+
+@pytest.mark.parametrize("w4a16_args", [
+    ("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None),
+    ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128),
+])
+def test_compressed_tensors_w4a16(vllm_runner, w4a16_args):
+    model, strategy, group = w4a16_args
+    with vllm_runner(model) as llm:
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
+        layer = model.model.layers[0]
+
+        qkv_proj = layer.self_attn.qkv_proj
+        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
+        assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16)
+
+        assert qkv_proj.scheme.strategy == strategy
+        assert qkv_proj.scheme.group_size == group
+
+        assert qkv_proj.weight_packed.dtype is torch.int32
+        assert qkv_proj.weight_scale.dtype is torch.float16
+        assert qkv_proj.weight_packed.pack_factor == 8
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index d2b0ce0db..c7f047845 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -7,8 +7,8 @@ from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    CompressedTensorsScheme, CompressedTensorsW8A8DynamicToken,
-    CompressedTensorsW8A8StaticTensor)
+    CompressedTensorsScheme, CompressedTensorsW4A16,
+    CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     QuantizationArgs, QuantizationStrategy, find_first_name_or_class_match)
 
@@ -47,16 +47,27 @@ class CompressedTensorsConfig(QuantizationConfig):
         layer_quant_details: Dict[str, Any] = dict()
         ignore: List[str] = config.get("ignore", None)
 
+        # The quant_config has multiple config_groups, each containing
+        # an input_activations key with details about how the activations are
+        # quantized, a weights key indicating how the weights are quantized,
+        # and a list of targets under the `targets` key, dictating which
+        # layers are impacted by the quantization details. The quantization
+        # details follow the structure defined by the QuantizationArgs
+        # pydantic model, which is used to verify the structure of the
+        # quant_config and also store the details for later use.
         for key, quant_config in config["config_groups"].items():
             targets = quant_config.get("targets")
             for target in targets:
                 layer_quant_details[target] = {}
                 layer_quant_details[target][
-                    "weight"] = QuantizationArgs.parse_obj(
+                    "weights"] = QuantizationArgs.parse_obj(
                         quant_config.get("weights"))
-                layer_quant_details[target][
-                    "input"] = QuantizationArgs.parse_obj(
-                        quant_config.get("input_activations"))
+                try:
+                    layer_quant_details[target][
+                        "input_activations"] = QuantizationArgs.parse_obj(
+                            quant_config.get("input_activations"))
+                except Exception:
+                    layer_quant_details[target]["input_activations"] = None
 
         return cls(layer_quant_details=layer_quant_details, ignore=ignore)
 
@@ -86,8 +97,23 @@ class CompressedTensorsConfig(QuantizationConfig):
 
         return is_8_bits and is_token_tensor and is_symmetric and is_dynamic
 
+    def _is_w4a16(self, weight_quant: BaseModel,
+                  input_quant: BaseModel) -> bool:
+        input_quant_none = input_quant is None
+        is_4_bits = weight_quant.num_bits == 4
+        is_symmetric = weight_quant.symmetric
+        is_static = not weight_quant.dynamic
+
+        return is_4_bits and input_quant_none and is_symmetric and is_static
+
     def _get_schema(self, weight_quant: BaseModel,
                     input_quant: BaseModel) -> "CompressedTensorsScheme":
+
+        if self._is_w4a16(weight_quant, input_quant):
+            return CompressedTensorsW4A16(num_bits=weight_quant.num_bits,
+                                          strategy=weight_quant.strategy,
+                                          group_size=weight_quant.group_size)
+
         if self._is_static_tensor_w8a8(weight_quant, input_quant):
             return CompressedTensorsW8A8StaticTensor()
 
@@ -113,8 +139,9 @@ class CompressedTensorsConfig(QuantizationConfig):
             raise ValueError(
                 f"Could not find quantization details for {layer}.")
 
-        return self._get_schema(weight_quant=layer_quant_details["weight"],
-                                input_quant=layer_quant_details["input"])
+        return self._get_schema(
+            weight_quant=layer_quant_details["weights"],
+            input_quant=layer_quant_details["input_activations"])
 
 
 class CompressedTensorsLinearMethod(LinearMethodBase):
@@ -140,6 +167,7 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
             layer=layer,
             input_size_per_partition=input_size_per_partition,
             output_partition_sizes=output_partition_sizes,
+            input_size=input_size,
             output_size=output_size,
             params_dtype=params_dtype,
             weight_loader=weight_loader)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
index 9a910f061..dc84d0008 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -1,6 +1,7 @@
 from .compressed_tensors_scheme import CompressedTensorsScheme  # noqa: F401
 from .compressed_tensors_unquantized import (  # noqa: F401
     CompressedTensorsUnquantized)
+from .compressed_tensors_w4a16 import CompressedTensorsW4A16  # noqa: F401
 from .compressed_tensors_w8a8_dynamictoken import (  # noqa: F401, E501
     CompressedTensorsW8A8DynamicToken)
 from .compressed_tensors_w8a8_statictensor import (  # noqa: F401, E501
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py
new file mode 100644
index 000000000..90446a5ff
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py
@@ -0,0 +1,168 @@
+from typing import Callable, List, Optional
+
+import torch
+from torch.nn import Parameter
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.gptq_marlin import (
+    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, GPTQMarlinState,
+    marlin_permute_scales)
+from vllm.model_executor.utils import set_weight_attrs
+
+__all__ = ["CompressedTensorsW4A16"]
+
+
+class CompressedTensorsW4A16(CompressedTensorsScheme):
+
+    def __init__(self,
+                 strategy: str,
+                 num_bits: int,
+                 group_size: Optional[int] = None):
+        self.num_bits = num_bits
+        self.strategy = strategy
+        self.group_size = group_size
+
+        if self.strategy == "group" and self.group_size is None:
+            raise ValueError(
+                "group_size must be given when using strategy group")
+
+    def create_weights(self, layer: torch.nn.Module, input_size: int,
+                       output_partition_sizes: List[int],
+                       input_size_per_partition: int,
+                       params_dtype: torch.dtype, weight_loader: Callable,
+                       **kwargs):
+
+        pack_factor = 32 // self.num_bits
+        output_size_per_partition = sum(output_partition_sizes)
+
+        if self.group_size is not None:
+            group_size = self.group_size
+        else:
+            group_size = input_size
+
+        weight_scale_dim = None
+        scales_and_zp_size = input_size // group_size
+
+        if (input_size != input_size_per_partition
+                and self.group_size is not None):
+            weight_scale_dim = 1
+            scales_and_zp_size = input_size_per_partition // group_size
+
+        weight = Parameter(
+            torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+
+        set_weight_attrs(
+            weight, {
+                "input_dim": 1,
+                "output_dim": 0,
+                "packed_dim": 1,
+                "pack_factor": pack_factor
+            })
+        set_weight_attrs(weight, {"weight_loader": weight_loader})
+
+        layer.register_parameter("weight_packed", weight)
+
+        weight_scale = Parameter(
+            torch.empty(
+                output_size_per_partition,
+                scales_and_zp_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+
+        set_weight_attrs(weight_scale, {"weight_loader": weight_loader})
+        set_weight_attrs(weight_scale, {
+            "input_dim": weight_scale_dim,
+            "output_dim": 0
+        })
+        layer.register_parameter("weight_scale", weight_scale)
+
+        # A 2D array defining the original shape of the weights
+        # before packing
+        weight_shape = Parameter(torch.empty(2, dtype=torch.int64),
+                                 requires_grad=False)
+
+        layer.register_parameter("weight_shape", weight_shape)
+        set_weight_attrs(weight_shape, {"weight_loader": weight_loader})
+
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+
+        layer.input_size = input_size
+        layer.marlin_state = GPTQMarlinState.REPACK
+        layer.is_k_full = True
+        layer.group_size = group_size
+
+        max_workspace_size = (
+            output_size_per_partition //
+            GPTQ_MARLIN_MIN_THREAD_N) * GPTQ_MARLIN_MAX_PARALLEL
+
+        workspace = torch.zeros(max_workspace_size,
+                                dtype=torch.int,
+                                requires_grad=False)
+        layer.workspace = workspace
+
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
+        reshaped_x = x.reshape(-1, x.shape[-1])
+
+        size_m = reshaped_x.shape[0]
+        part_size_n = layer.output_size_per_partition
+        part_size_k = layer.input_size_per_partition
+
+        out_shape = x.shape[:-1] + (part_size_n, )
+
+        if layer.marlin_state == GPTQMarlinState.REPACK:
+            layer.marlin_state = GPTQMarlinState.READY
+
+            # Newly generated tensors need to replace existing tensors that are
+            # already registered as parameters by vLLM (and won't be freed)
+            def replace_tensor(name, new_t):
+                # It is important to use resize_() here since it ensures
+                # the same buffer is reused
+                getattr(layer, name).resize_(new_t.shape)
+                getattr(layer, name).copy_(new_t)
+                del new_t
+
+            cur_device = layer.weight_packed.device
+
+            # Reset g_idx related tensors
+            layer.g_idx = Parameter(torch.empty(0,
+                                                dtype=torch.int,
+                                                device=cur_device),
+                                    requires_grad=False)
+            layer.g_idx_sort_indices = Parameter(torch.empty(
+                0, dtype=torch.int, device=cur_device),
+                                                 requires_grad=False)
+
+            # Repack weights
+            marlin_qweight = ops.gptq_marlin_repack(
+                layer.weight_packed.t().contiguous(), layer.g_idx_sort_indices,
+                part_size_k, part_size_n, self.num_bits)
+
+            replace_tensor("weight_packed", marlin_qweight)
+
+            # Permute scales
+            scales_size_k = part_size_k
+            scales_size_n = part_size_n
+
+            marlin_scales = marlin_permute_scales(
+                layer.weight_scale.squeeze().t().contiguous(), scales_size_k,
+                scales_size_n, layer.group_size, self.num_bits)
+            replace_tensor("weight_scale", marlin_scales)
+
+        output = ops.gptq_marlin_gemm(reshaped_x, layer.weight_packed,
+                                      layer.weight_scale, layer.g_idx,
+                                      layer.g_idx_sort_indices,
+                                      layer.workspace, self.num_bits, size_m,
+                                      part_size_n, part_size_k,
+                                      layer.is_k_full)
+        return output.reshape(out_shape)
-- 
GitLab


From 23ec72fa032b3d81a5ea9eb0f7c607f1d6e7949a Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Thu, 13 Jun 2024 11:18:08 -0400
Subject: [PATCH 019/376] [CI/Build][REDO] Add is_quant_method_supported to
 control quantization test configurations (#5466)

---
 tests/models/test_aqlm.py               | 13 ++-----------
 tests/models/test_fp8.py                | 12 ++----------
 tests/models/test_gptq_marlin.py        | 13 ++-----------
 tests/models/test_gptq_marlin_24.py     | 13 ++-----------
 tests/models/test_marlin.py             | 13 ++-----------
 tests/quantization/test_bitsandbytes.py | 10 +++-------
 tests/quantization/test_fp8.py          | 15 +++++----------
 tests/quantization/utils.py             | 14 ++++++++++++++
 8 files changed, 32 insertions(+), 71 deletions(-)
 create mode 100644 tests/quantization/utils.py

diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py
index c4ecf846e..80034a511 100644
--- a/tests/models/test_aqlm.py
+++ b/tests/models/test_aqlm.py
@@ -4,17 +4,8 @@ Run `pytest tests/models/test_aqlm.py`.
 """
 
 import pytest
-import torch
 
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-
-aqlm_not_supported = True
-
-if torch.cuda.is_available():
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    aqlm_not_supported = (capability <
-                          QUANTIZATION_METHODS["aqlm"].get_min_capability())
+from tests.quantization.utils import is_quant_method_supported
 
 # In this test we hardcode prompts and generations for the model so we don't
 # need to require the AQLM package as a dependency
@@ -67,7 +58,7 @@ ground_truth_generations = [
 ]
 
 
-@pytest.mark.skipif(aqlm_not_supported,
+@pytest.mark.skipif(not is_quant_method_supported("aqlm"),
                     reason="AQLM is not supported on this GPU type.")
 @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
 @pytest.mark.parametrize("dtype", ["half"])
diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py
index 61aee0d0a..b24c17cf3 100644
--- a/tests/models/test_fp8.py
+++ b/tests/models/test_fp8.py
@@ -8,8 +8,8 @@ import pytest
 import torch
 from transformers import AutoTokenizer
 
+from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
 
@@ -67,16 +67,8 @@ EXPECTED_STRS_MAP = {
     },
 }
 
-fp8_not_supported = True
 
-if torch.cuda.is_available():
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    fp8_not_supported = (capability <
-                         QUANTIZATION_METHODS["fp8"].get_min_capability())
-
-
-@pytest.mark.skipif(fp8_not_supported,
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                     reason="fp8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)
 @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py
index e957450cc..e30100d9b 100644
--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
@@ -11,9 +11,8 @@ Run `pytest tests/models/test_gptq_marlin.py`.
 import os
 
 import pytest
-import torch
 
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from tests.quantization.utils import is_quant_method_supported
 from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
 
 from .utils import check_logprobs_close
@@ -22,14 +21,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
 
 MAX_MODEL_LEN = 1024
 
-gptq_marlin_not_supported = True
-
-if torch.cuda.is_available():
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    gptq_marlin_not_supported = (
-        capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
-
 MODELS = [
     # act_order==False, group_size=channelwise
     ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
@@ -53,7 +44,7 @@ MODELS = [
 
 
 @pytest.mark.flaky(reruns=3)
-@pytest.mark.skipif(gptq_marlin_not_supported,
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                     reason="gptq_marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half", "bfloat16"])
diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py
index 195c3e5b5..60d9ae2f1 100644
--- a/tests/models/test_gptq_marlin_24.py
+++ b/tests/models/test_gptq_marlin_24.py
@@ -9,18 +9,9 @@ Run `pytest tests/models/test_marlin_24.py`.
 from dataclasses import dataclass
 
 import pytest
-import torch
 
 from tests.models.utils import check_logprobs_close
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-
-marlin_not_supported = True
-
-if torch.cuda.is_available():
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    marlin_not_supported = (
-        capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
+from tests.quantization.utils import is_quant_method_supported
 
 
 @dataclass
@@ -47,7 +38,7 @@ model_pairs = [
 
 
 @pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(marlin_not_supported,
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"),
                     reason="Marlin24 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_pair", model_pairs)
 @pytest.mark.parametrize("dtype", ["half"])
diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py
index 761ba6aa4..e86f6e29d 100644
--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -13,20 +13,11 @@ Run `pytest tests/models/test_marlin.py`.
 from dataclasses import dataclass
 
 import pytest
-import torch
 
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from tests.quantization.utils import is_quant_method_supported
 
 from .utils import check_logprobs_close
 
-marlin_not_supported = True
-
-if torch.cuda.is_available():
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    marlin_not_supported = (
-        capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
-
 
 @dataclass
 class ModelPair:
@@ -45,7 +36,7 @@ model_pairs = [
 
 
 @pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(marlin_not_supported,
+@pytest.mark.skipif(not is_quant_method_supported("marlin"),
                     reason="Marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("model_pair", model_pairs)
 @pytest.mark.parametrize("dtype", ["half"])
diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py
index 31e938d15..953fd9ba9 100644
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -5,16 +5,12 @@ Run `pytest tests/quantization/test_bitsandbytes.py`.
 import pytest
 import torch
 
+from tests.quantization.utils import is_quant_method_supported
 from vllm import SamplingParams
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
-capability = torch.cuda.get_device_capability()
-capability = capability[0] * 10 + capability[1]
 
-
-@pytest.mark.skipif(
-    capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(),
-    reason='bitsandbytes is not supported on this GPU type.')
+@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
+                    reason='bitsandbytes is not supported on this GPU type.')
 def test_load_bnb_model(vllm_runner) -> None:
     with vllm_runner('huggyllama/llama-7b',
                      quantization='bitsandbytes',
diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index 7cb65326c..74d21ead0 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -5,17 +5,13 @@ Run `pytest tests/quantization/test_fp8.py --forked`.
 import pytest
 import torch
 
+from tests.quantization.utils import is_quant_method_supported
 from vllm._custom_ops import scaled_fp8_quant
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
 
-capability = torch.cuda.get_device_capability()
-capability = capability[0] * 10 + capability[1]
 
-
-@pytest.mark.skipif(
-    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
-    reason="FP8 is not supported on this GPU type.")
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
+                    reason="FP8 is not supported on this GPU type.")
 def test_load_fp16_model(vllm_runner) -> None:
     with vllm_runner("facebook/opt-125m", quantization="fp8") as llm:
 
@@ -25,9 +21,8 @@ def test_load_fp16_model(vllm_runner) -> None:
         assert fc1.weight.dtype == torch.float8_e4m3fn
 
 
-@pytest.mark.skipif(
-    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
-    reason="FP8 is not supported on this GPU type.")
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
+                    reason="FP8 is not supported on this GPU type.")
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 def test_scaled_fp8_quant(dtype) -> None:
 
diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py
new file mode 100644
index 000000000..0c92d565d
--- /dev/null
+++ b/tests/quantization/utils.py
@@ -0,0 +1,14 @@
+import torch
+
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+
+def is_quant_method_supported(quant_method: str) -> bool:
+    # Currently, all quantization methods require Nvidia or AMD GPUs
+    if not torch.cuda.is_available():
+        return False
+
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    return (capability <
+            QUANTIZATION_METHODS[quant_method].get_min_capability())
-- 
GitLab


From bd43973522ea17be50e10fbb222a22f673c8067e Mon Sep 17 00:00:00 2001
From: wenyujin333 <wuyou.wuyou@alibaba-inc.com>
Date: Fri, 14 Jun 2024 00:01:10 +0800
Subject: [PATCH 020/376] [Kernel] Tune Qwen2MoE kernel configurations with
 tp2,4 (#5497)

Tune Qwen2-57B-A14B configs based on #4921

Throughput Performance
command: python benchmarks/benchmark_throughput.py --model=Qwen/Qwen2-57B-A14B-Instruct --input-len 1000 --output-len 50 -tp 2

A100 GPU

benchmark	no config	w/ PR
tp=2	10.53 requests/s, 11058.17 tokens/s	12.47 requests/s, 13088.57 tokens/s
tp=4	17.77 requests/s, 18662.95 tokens/s	20.20 requests/s, 21212.32 tokens/s
---
 ...280,device_name=NVIDIA_A100-SXM4-80GB.json | 146 ++++++++++++++++++
 ...280,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++
 ...640,device_name=NVIDIA_A100-SXM4-80GB.json | 146 ++++++++++++++++++
 ...640,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++
 4 files changed, 584 insertions(+)
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json

diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..8cc6c643f
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..d4c9ddd12
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..b2799ed3a
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..b8d3be231
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
-- 
GitLab


From 80aa7e91fcd547a7a1396f71b9bdce18e5c92245 Mon Sep 17 00:00:00 2001
From: "Li, Jiang" <jiang1.li@intel.com>
Date: Fri, 14 Jun 2024 00:33:14 +0800
Subject: [PATCH 021/376] [Hardware][Intel] Optimize CPU backend and add more
 performance tips (#4971)

Co-authored-by: Jianan Gu <jianan.gu@intel.com>
---
 Dockerfile.cpu                                |   8 +-
 README.md                                     |   2 +-
 .../getting_started/cpu-installation.rst      |  23 +++-
 requirements-cpu.txt                          |   2 +-
 vllm/attention/backends/torch_sdpa.py         |  23 +++-
 vllm/attention/ops/ipex_attn.py               | 120 ++++++++++++++++++
 6 files changed, 165 insertions(+), 13 deletions(-)
 create mode 100644 vllm/attention/ops/ipex_attn.py

diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index 403a1cd03..777bb0829 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -3,9 +3,13 @@
 FROM ubuntu:22.04 AS cpu-test-1
 
 RUN apt-get update  -y \
-    && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
+    && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \
     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 
+RUN echo 'export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD' >> ~/.bashrc
+
+RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl
+
 RUN pip install --upgrade pip \
     && pip install wheel packaging ninja "setuptools>=49.4.0" numpy
 
@@ -21,6 +25,6 @@ RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
 
 WORKDIR /workspace/
 
-RUN ln -s /workspace/vllm/tests  && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 
 CMD ["/bin/bash"]
diff --git a/README.md b/README.md
index 57374d279..8e4480ac2 100644
--- a/README.md
+++ b/README.md
@@ -65,7 +65,7 @@ vLLM is flexible and easy to use with:
 - Tensor parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support NVIDIA GPUs and AMD GPUs
+- Support NVIDIA GPUs, AMD GPUs, and Intel CPUs
 - (Experimental) Prefix caching support
 - (Experimental) Multi-lora support
 
diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst
index 5270253ca..a9544e8a5 100644
--- a/docs/source/getting_started/cpu-installation.rst
+++ b/docs/source/getting_started/cpu-installation.rst
@@ -10,6 +10,7 @@ Table of contents:
 #. :ref:`Requirements <cpu_backend_requirements>`
 #. :ref:`Quick start using Dockerfile <cpu_backend_quick_start_dockerfile>`
 #. :ref:`Build from source <build_cpu_backend_from_source>`
+#. :ref:`Intel Extension for PyTorch <ipex_guidance>`
 #. :ref:`Performance tips <cpu_backend_performance_tips>`
 
 .. _cpu_backend_requirements:
@@ -18,7 +19,7 @@ Requirements
 ------------
 
 * OS: Linux
-* Compiler: gcc/g++>=12.3.0 (recommended)
+* Compiler: gcc/g++>=12.3.0 (optional, recommended)
 * Instruction set architecture (ISA) requirement: AVX512 is required.
 
 .. _cpu_backend_quick_start_dockerfile:
@@ -41,7 +42,7 @@ Quick start using Dockerfile
 Build from source
 -----------------
 
-- First, install required compiler. We recommend to use ``gcc/g++ >= 12.3.0`` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
+- First, install recommended compiler. We recommend to use ``gcc/g++ >= 12.3.0`` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
 
 .. code-block:: console
 
@@ -70,6 +71,15 @@ Build from source
     
     - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building.    
 
+.. _ipex_guidance:
+
+Intel Extension for PyTorch
+---------------------------
+
+- `Intel Extension for PyTorch (IPEX) <https://github.com/intel/intel-extension-for-pytorch>`_ extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware.
+
+- IPEX after the ``2.3.0`` can be enabled in the CPU backend by default if it is installed.
+
 .. _cpu_backend_performance_tips:
 
 Performance tips
@@ -77,6 +87,15 @@ Performance tips
 
 - vLLM CPU backend uses environment variable ``VLLM_CPU_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
 
+- We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run:
+
+.. code-block:: console
+
+    $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
+    $ find / -name *libtcmalloc* # find the dynamic link library path
+    $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
+    $ python examples/offline_inference.py # run vLLM
+
 - vLLM CPU backend uses OpenMP for thread-parallel computation. If you want the best performance on CPU, it will be very critical to isolate CPU cores for OpenMP threads with other thread pools (like web-service event-loop), to avoid CPU oversubscription. 
 
 - If using vLLM CPU backend on a bare-metal machine, it is recommended to disable the hyper-threading.
diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index b739642d8..8b7d86e68 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -2,5 +2,5 @@
 -r requirements-common.txt
 
 # Dependencies for x86_64 CPUs
-torch == 2.3.0+cpu
+torch == 2.3.1+cpu
 triton >= 2.2.0  # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index 9b50adec5..4b08cce99 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -8,8 +8,16 @@ from torch.nn.functional import scaled_dot_product_attention
 
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata)
-from vllm.attention.ops.paged_attn import (PagedAttention,
-                                           PagedAttentionMetadata)
+from vllm.attention.ops.paged_attn import PagedAttentionMetadata
+from vllm.utils import is_cpu
+
+if is_cpu():
+    try:
+        from vllm.attention.ops.ipex_attn import PagedAttention
+    except ImportError:
+        from vllm.attention.ops.paged_attn import PagedAttention
+else:
+    from vllm.attention.ops.paged_attn import PagedAttention
 
 
 class TorchSDPABackend(AttentionBackend):
@@ -197,13 +205,14 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
                                          attn_metadata.attn_bias):
                     end = start + seq_len
                     sub_out = scaled_dot_product_attention(
-                        query[:, start:end, :],
-                        key[:, start:end, :],
-                        value[:, start:end, :],
+                        query[None, :, start:end, :],
+                        key[None, :, start:end, :],
+                        value[None, :, start:end, :],
                         attn_mask=mask,
                         dropout_p=0.0,
                         is_causal=not self.need_mask,
-                        scale=self.scale).movedim(query.dim() - 2, 0)
+                        scale=self.scale).squeeze(0).movedim(
+                            query.dim() - 2, 0)
                     output[start:end, :, :] = sub_out
                     start = end
             else:
@@ -248,7 +257,7 @@ def _make_alibi_bias(
 
         num_heads = alibi_slopes.shape[0]
         bias = bias[None, :].repeat((num_heads, 1, 1))
-        bias.mul_(alibi_slopes[:, None, None])
+        bias.mul_(alibi_slopes[:, None, None]).unsqueeze_(0)
         inf_mask = torch.empty(
             (1, seq_len, seq_len),
             dtype=bias.dtype).fill_(-torch.inf).triu_(diagonal=1)
diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py
new file mode 100644
index 000000000..5a5317b65
--- /dev/null
+++ b/vllm/attention/ops/ipex_attn.py
@@ -0,0 +1,120 @@
+from typing import Dict, List, Optional, Tuple
+
+import intel_extension_for_pytorch.llm.modules as ipex_modules
+import torch
+
+from vllm import _custom_ops as ops
+
+
+class PagedAttention:
+
+    @staticmethod
+    def get_supported_head_sizes() -> List[int]:
+        return [64, 80, 96, 112, 128, 256]
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+        *args,
+    ) -> Tuple[int, ...]:
+        return (2, num_blocks, block_size * num_kv_heads * head_size)
+
+    @staticmethod
+    def split_kv_cache(
+        kv_cache: torch.Tensor,
+        num_kv_heads: int,
+        head_size: int,
+        *args,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        num_blocks = kv_cache.shape[1]
+
+        key_cache = kv_cache[0]
+        key_cache = key_cache.view(num_blocks, num_kv_heads, -1, head_size)
+        value_cache = kv_cache[1]
+        value_cache = value_cache.view(num_blocks, num_kv_heads, -1, head_size)
+        return key_cache, value_cache
+
+    @staticmethod
+    def write_to_paged_cache(
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        slot_mapping: torch.Tensor,
+        kv_cache_dtype: str,
+        kv_scale: float,
+        *args,
+    ) -> None:
+        ipex_modules.PagedAttention.reshape_and_cache(
+            key, value, key_cache, value_cache,
+            slot_mapping.flatten().int())
+
+    @staticmethod
+    def forward_decode(
+        query: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        block_tables: torch.Tensor,
+        context_lens: torch.Tensor,
+        max_context_len: int,
+        kv_cache_dtype: str,
+        num_kv_heads: int,
+        scale: float,
+        alibi_slopes: Optional[torch.Tensor],
+        kv_scale: float,
+        *args,
+    ) -> torch.Tensor:
+        output = torch.empty_like(query)
+        block_size = value_cache.shape[2]
+        head_mapping = torch.arange(
+            0,
+            num_kv_heads,
+            device="cpu",
+            dtype=torch.int32,
+        ).view(num_kv_heads,
+               1).repeat_interleave(query.size(1) // num_kv_heads).flatten()
+        ipex_modules.PagedAttention.single_query_cached_kv_attention(
+            output, query.contiguous(), key_cache, value_cache, head_mapping,
+            scale, block_tables, context_lens, block_size, max_context_len,
+            alibi_slopes)
+
+        return output
+
+    @staticmethod
+    def forward_prefix(
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        block_tables: torch.Tensor,
+        subquery_start_loc: torch.Tensor,
+        prompt_lens_tensor: torch.Tensor,
+        context_lens: torch.Tensor,
+        max_subquery_len: int,
+        alibi_slopes: Optional[torch.Tensor],
+        *args,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    @staticmethod
+    def swap_blocks(
+        src_kv_cache: torch.Tensor,
+        dst_kv_cache: torch.Tensor,
+        src_to_dst: Dict[int, int],
+        *args,
+    ) -> None:
+        raise NotImplementedError
+
+    @staticmethod
+    def copy_blocks(
+        kv_caches: List[torch.Tensor],
+        src_to_dists: Dict[int, List[int]],
+        *args,
+    ) -> None:
+        key_caches = [kv_cache[0] for kv_cache in kv_caches]
+        value_caches = [kv_cache[1] for kv_cache in kv_caches]
+        ops.copy_blocks(key_caches, value_caches, src_to_dists)
-- 
GitLab


From a65634d3ae8928284b3923a46bff89731cb1792e Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Thu, 13 Jun 2024 10:18:26 -0700
Subject: [PATCH 022/376] [Docs] Add 4th meetup slides (#5509)

---
 README.md                         | 8 +-------
 docs/source/community/meetups.rst | 1 +
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 8e4480ac2..ce2d8d5fd 100644
--- a/README.md
+++ b/README.md
@@ -23,16 +23,10 @@ If you have cool projects related to vLLM or LLM inference, we would love to see
 This will be a great chance for everyone in the community to get together and learn.
 Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite)
 
-**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**
-
-We are thrilled to announce our fourth vLLM Meetup!
-The vLLM team will share recent updates and roadmap.
-We will also have vLLM collaborators from BentoML and Cloudflare coming up to the stage to discuss their experience in deploying LLMs with vLLM.
-Please register [here](https://lu.ma/agivllm) and join us!
-
 ---
 
 *Latest News* 🔥
+- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
 - [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
 - [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
 - [2024/01] Added ROCm 6.0 support to vLLM.
diff --git a/docs/source/community/meetups.rst b/docs/source/community/meetups.rst
index f37119478..0fde31ef9 100644
--- a/docs/source/community/meetups.rst
+++ b/docs/source/community/meetups.rst
@@ -5,6 +5,7 @@ vLLM Meetups
 
 We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
 
+- `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__
 - `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__
 - `The second vLLM meetup <https://lu.ma/ygxbpzhl>`__, with IBM Research, January 31st 2024. `[Slides] <https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing>`__ `[Video (vLLM Update)] <https://youtu.be/Y0C-DUvEnZQ>`__ `[Video (IBM Research & torch.compile)] <https://youtu.be/m0dMtFLI-dg>`__
 - `The first vLLM meetup <https://lu.ma/first-vllm-meetup>`__, with a16z, October 5th 2023. `[Slides] <https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing>`__
-- 
GitLab


From 03dccc886ef7e5d0dd67512f3e9748ee00c21fb2 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 14 Jun 2024 02:21:39 +0800
Subject: [PATCH 023/376] [Misc] Add vLLM version getter to utils (#5098)

---
 setup.py                              | 2 +-
 vllm/__init__.py                      | 3 ++-
 vllm/engine/llm_engine.py             | 4 ++--
 vllm/entrypoints/openai/api_server.py | 6 +++---
 vllm/entrypoints/openai/run_batch.py  | 4 ++--
 vllm/usage/usage_lib.py               | 4 ++--
 vllm/version.py                       | 1 +
 7 files changed, 13 insertions(+), 11 deletions(-)
 create mode 100644 vllm/version.py

diff --git a/setup.py b/setup.py
index 3a41b1a0b..12a704e08 100644
--- a/setup.py
+++ b/setup.py
@@ -314,7 +314,7 @@ def find_version(filepath: str) -> str:
 
 
 def get_vllm_version() -> str:
-    version = find_version(get_path("vllm", "__init__.py"))
+    version = find_version(get_path("vllm", "version.py"))
 
     if _is_cuda():
         cuda_version = str(get_nvcc_cuda_version())
diff --git a/vllm/__init__.py b/vllm/__init__.py
index 10cc66941..e21705987 100644
--- a/vllm/__init__.py
+++ b/vllm/__init__.py
@@ -12,9 +12,10 @@ from vllm.outputs import (CompletionOutput, EmbeddingOutput,
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 
-__version__ = "0.5.0"
+from .version import __version__
 
 __all__ = [
+    "__version__",
     "LLM",
     "ModelRegistry",
     "PromptStrictInputs",
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index ea7547584..b2f6478cb 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -6,7 +6,6 @@ from typing import Type, TypeVar, Union
 
 from transformers import GenerationConfig, PreTrainedTokenizer
 
-import vllm
 from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig,
                          LoRAConfig, ModelConfig, ParallelConfig,
                          SchedulerConfig, SpeculativeConfig,
@@ -38,6 +37,7 @@ from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup,
 from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
                                   usage_message)
 from vllm.utils import Counter
+from vllm.version import __version__ as VLLM_VERSION
 
 logger = init_logger(__name__)
 _LOCAL_LOGGING_INTERVAL_SEC = 5
@@ -169,7 +169,7 @@ class LLMEngine:
             "enforce_eager=%s, kv_cache_dtype=%s, "
             "quantization_param_path=%s, device_config=%s, "
             "decoding_config=%r, seed=%d, served_model_name=%s)",
-            vllm.__version__,
+            VLLM_VERSION,
             model_config.model,
             speculative_config,
             model_config.tokenizer,
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index e7503b965..ea6275920 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -15,7 +15,6 @@ from fastapi.responses import JSONResponse, Response, StreamingResponse
 from prometheus_client import make_asgi_app
 from starlette.routing import Mount
 
-import vllm
 import vllm.envs as envs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -29,6 +28,7 @@ from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
+from vllm.version import __version__ as VLLM_VERSION
 
 TIMEOUT_KEEP_ALIVE = 5  # seconds
 
@@ -93,7 +93,7 @@ async def show_available_models():
 
 @app.get("/version")
 async def show_version():
-    ver = {"version": vllm.__version__}
+    ver = {"version": VLLM_VERSION}
     return JSONResponse(content=ver)
 
 
@@ -174,7 +174,7 @@ if __name__ == "__main__":
             raise ValueError(f"Invalid middleware {middleware}. "
                              f"Must be a function or a class.")
 
-    logger.info("vLLM API server version %s", vllm.__version__)
+    logger.info("vLLM API server version %s", VLLM_VERSION)
     logger.info("args: %s", args)
 
     if args.served_model_name is not None:
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 731f4f4a4..7a6819c35 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -5,7 +5,6 @@ from io import StringIO
 
 import aiohttp
 
-import vllm
 from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import (BatchRequestInput,
@@ -15,6 +14,7 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import random_uuid
+from vllm.version import __version__ as VLLM_VERSION
 
 logger = init_logger(__name__)
 
@@ -135,7 +135,7 @@ async def main(args):
 if __name__ == "__main__":
     args = parse_args()
 
-    logger.info("vLLM API server version %s", vllm.__version__)
+    logger.info("vLLM API server version %s", VLLM_VERSION)
     logger.info("args: %s", args)
 
     asyncio.run(main(args))
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index 40a954a29..afb3007a5 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -16,6 +16,7 @@ import requests
 import torch
 
 import vllm.envs as envs
+from vllm.version import __version__ as VLLM_VERSION
 
 _config_home = envs.VLLM_CONFIG_ROOT
 _USAGE_STATS_JSON_PATH = os.path.join(_config_home, "vllm/usage_stats.json")
@@ -163,9 +164,8 @@ class UsageMessage:
         ])
 
         # vLLM information
-        import vllm  # delayed import to prevent circular import
         self.context = usage_context.value
-        self.vllm_version = vllm.__version__
+        self.vllm_version = VLLM_VERSION
         self.model_architecture = model_architecture
 
         # Metadata
diff --git a/vllm/version.py b/vllm/version.py
new file mode 100644
index 000000000..3d187266f
--- /dev/null
+++ b/vllm/version.py
@@ -0,0 +1 @@
+__version__ = "0.5.0"
-- 
GitLab


From 39873476f8a1cf97bdf5651b4535ae60358ff15b Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 14 Jun 2024 02:21:53 +0800
Subject: [PATCH 024/376] [CI/Build] Simplify OpenAI server setup in tests
 (#5100)

---
 tests/async_engine/test_openapi_server_ray.py |  31 ++-
 tests/entrypoints/test_openai_embedding.py    | 113 ++++++++++
 tests/entrypoints/test_openai_server.py       | 206 ++++--------------
 tests/entrypoints/test_openai_vision.py       |  35 ++-
 tests/tensorizer_loader/test_tensorizer.py    |  14 +-
 tests/utils.py                                | 124 ++++++++---
 6 files changed, 285 insertions(+), 238 deletions(-)
 create mode 100644 tests/entrypoints/test_openai_embedding.py

diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py
index c25875bd1..cc05d79e5 100644
--- a/tests/async_engine/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server_ray.py
@@ -4,16 +4,22 @@ import pytest
 # and debugging.
 import ray
 
-from ..utils import ServerRunner
+from ..utils import VLLM_PATH, RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "facebook/opt-125m"
 
 
 @pytest.fixture(scope="module")
-def server():
-    ray.init()
-    server_runner = ServerRunner.remote([
+def ray_ctx():
+    ray.init(runtime_env={"working_dir": VLLM_PATH})
+    yield
+    ray.shutdown()
+
+
+@pytest.fixture(scope="module")
+def server(ray_ctx):
+    return RemoteOpenAIServer([
         "--model",
         MODEL_NAME,
         # use half precision for speed and memory savings in CI environment
@@ -24,22 +30,15 @@ def server():
         "--enforce-eager",
         "--engine-use-ray"
     ])
-    ray.get(server_runner.ready.remote())
-    yield server_runner
-    ray.shutdown()
 
 
 @pytest.fixture(scope="module")
-def client():
-    client = openai.AsyncOpenAI(
-        base_url="http://localhost:8000/v1",
-        api_key="token-abc123",
-    )
-    yield client
+def client(server):
+    return server.get_async_client()
 
 
 @pytest.mark.asyncio
-async def test_check_models(server, client: openai.AsyncOpenAI):
+async def test_check_models(client: openai.AsyncOpenAI):
     models = await client.models.list()
     models = models.data
     served_model = models[0]
@@ -48,7 +47,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-async def test_single_completion(server, client: openai.AsyncOpenAI):
+async def test_single_completion(client: openai.AsyncOpenAI):
     completion = await client.completions.create(model=MODEL_NAME,
                                                  prompt="Hello, my name is",
                                                  max_tokens=5,
@@ -72,7 +71,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-async def test_single_chat_session(server, client: openai.AsyncOpenAI):
+async def test_single_chat_session(client: openai.AsyncOpenAI):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
diff --git a/tests/entrypoints/test_openai_embedding.py b/tests/entrypoints/test_openai_embedding.py
new file mode 100644
index 000000000..2496d2ac3
--- /dev/null
+++ b/tests/entrypoints/test_openai_embedding.py
@@ -0,0 +1,113 @@
+import openai
+import pytest
+import ray
+
+from ..utils import VLLM_PATH, RemoteOpenAIServer
+
+EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
+
+pytestmark = pytest.mark.openai
+
+
+@pytest.fixture(scope="module")
+def ray_ctx():
+    ray.init(runtime_env={"working_dir": VLLM_PATH})
+    yield
+    ray.shutdown()
+
+
+@pytest.fixture(scope="module")
+def embedding_server(ray_ctx):
+    return RemoteOpenAIServer([
+        "--model",
+        EMBEDDING_MODEL_NAME,
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--enforce-eager",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+    ])
+
+
+@pytest.mark.asyncio
+@pytest.fixture(scope="module")
+def embedding_client(embedding_server):
+    return embedding_server.get_async_client()
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [EMBEDDING_MODEL_NAME],
+)
+async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
+                                model_name: str):
+    input_texts = [
+        "The chef prepared a delicious meal.",
+    ]
+
+    # test single embedding
+    embeddings = await embedding_client.embeddings.create(
+        model=model_name,
+        input=input_texts,
+        encoding_format="float",
+    )
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding) == 4096
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == 9
+    assert embeddings.usage.total_tokens == 9
+
+    # test using token IDs
+    input_tokens = [1, 1, 1, 1, 1]
+    embeddings = await embedding_client.embeddings.create(
+        model=model_name,
+        input=input_tokens,
+        encoding_format="float",
+    )
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding) == 4096
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == 5
+    assert embeddings.usage.total_tokens == 5
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [EMBEDDING_MODEL_NAME],
+)
+async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
+                               model_name: str):
+    # test List[str]
+    input_texts = [
+        "The cat sat on the mat.", "A feline was resting on a rug.",
+        "Stars twinkle brightly in the night sky."
+    ]
+    embeddings = await embedding_client.embeddings.create(
+        model=model_name,
+        input=input_texts,
+        encoding_format="float",
+    )
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 3
+    assert len(embeddings.data[0].embedding) == 4096
+
+    # test List[List[int]]
+    input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
+                    [25, 32, 64, 77]]
+    embeddings = await embedding_client.embeddings.create(
+        model=model_name,
+        input=input_tokens,
+        encoding_format="float",
+    )
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 4
+    assert len(embeddings.data[0].embedding) == 4096
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == 17
+    assert embeddings.usage.total_tokens == 17
diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index fdf704705..2d7e3044d 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -15,11 +15,10 @@ from openai import BadRequestError
 
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from ..utils import ServerRunner
+from ..utils import VLLM_PATH, RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
 # technically this needs Mistral-7B-v0.1 as base, but we're not testing
 # generation quality here
 LORA_NAME = "typeof/zephyr-7b-beta-lora"
@@ -80,9 +79,15 @@ def zephyr_lora_files():
 
 
 @pytest.fixture(scope="module")
-def server(zephyr_lora_files):
-    ray.init()
-    server_runner = ServerRunner.remote([
+def ray_ctx():
+    ray.init(runtime_env={"working_dir": VLLM_PATH})
+    yield
+    ray.shutdown()
+
+
+@pytest.fixture(scope="module")
+def server(zephyr_lora_files, ray_ctx):
+    return RemoteOpenAIServer([
         "--model",
         MODEL_NAME,
         # use half precision for speed and memory savings in CI environment
@@ -91,8 +96,6 @@ def server(zephyr_lora_files):
         "--max-model-len",
         "8192",
         "--enforce-eager",
-        "--gpu-memory-utilization",
-        "0.75",
         # lora config below
         "--enable-lora",
         "--lora-modules",
@@ -105,43 +108,14 @@ def server(zephyr_lora_files):
         "--max-num-seqs",
         "128",
     ])
-    ray.get(server_runner.ready.remote())
-    yield server_runner
-    ray.shutdown()
 
 
 @pytest.fixture(scope="module")
-def embedding_server(zephyr_lora_files):
-    ray.shutdown()
-    ray.init()
-    server_runner = ServerRunner.remote([
-        "--model",
-        EMBEDDING_MODEL_NAME,
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "bfloat16",
-        "--enforce-eager",
-        "--gpu-memory-utilization",
-        "0.75",
-        "--max-model-len",
-        "8192",
-    ])
-    ray.get(server_runner.ready.remote())
-    yield server_runner
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def client():
-    client = openai.AsyncOpenAI(
-        base_url="http://localhost:8000/v1",
-        api_key="token-abc123",
-    )
-    yield client
+def client(server):
+    return server.get_async_client()
 
 
-@pytest.mark.asyncio
-async def test_check_models(server, client: openai.AsyncOpenAI):
+async def test_check_models(client: openai.AsyncOpenAI):
     models = await client.models.list()
     models = models.data
     served_model = models[0]
@@ -158,8 +132,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI):
     "model_name",
     [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
 )
-async def test_single_completion(server, client: openai.AsyncOpenAI,
-                                 model_name: str):
+async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
     completion = await client.completions.create(model=model_name,
                                                  prompt="Hello, my name is",
                                                  max_tokens=5,
@@ -190,8 +163,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
 )
-async def test_no_logprobs(server, client: openai.AsyncOpenAI,
-                           model_name: str):
+async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
     # test using token IDs
     completion = await client.completions.create(
         model=MODEL_NAME,
@@ -210,8 +182,7 @@ async def test_no_logprobs(server, client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME, "zephyr-lora"],
 )
-async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
-                             model_name: str):
+async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
     # test using token IDs
     completion = await client.completions.create(
         model=MODEL_NAME,
@@ -232,8 +203,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME, "zephyr-lora"],
 )
-async def test_some_logprobs(server, client: openai.AsyncOpenAI,
-                             model_name: str):
+async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
     # test using token IDs
     completion = await client.completions.create(
         model=MODEL_NAME,
@@ -254,7 +224,7 @@ async def test_some_logprobs(server, client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME, "zephyr-lora"],
 )
-async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
+async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
                                             model_name: str):
 
     with pytest.raises(
@@ -300,8 +270,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
 )
-async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI,
-                                model_name: str):
+async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -326,8 +295,7 @@ async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME, "zephyr-lora"],
 )
-async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
-                                  model_name: str):
+async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -354,8 +322,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME, "zephyr-lora"],
 )
-async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
-                                  model_name: str):
+async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -382,7 +349,7 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME, "zephyr-lora"],
 )
-async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
+async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
                                       model_name: str):
     messages = [{
         "role": "system",
@@ -425,7 +392,7 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME, "zephyr-lora"],
 )
-async def test_single_chat_session(server, client: openai.AsyncOpenAI,
+async def test_single_chat_session(client: openai.AsyncOpenAI,
                                    model_name: str):
     messages = [{
         "role": "system",
@@ -470,7 +437,7 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME, "zephyr-lora"],
 )
-async def test_completion_streaming(server, client: openai.AsyncOpenAI,
+async def test_completion_streaming(client: openai.AsyncOpenAI,
                                     model_name: str):
     prompt = "What is an LLM?"
 
@@ -505,8 +472,7 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME, "zephyr-lora"],
 )
-async def test_chat_streaming(server, client: openai.AsyncOpenAI,
-                              model_name: str):
+async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -555,8 +521,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
     "model_name",
     ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
 )
-async def test_chat_completion_stream_options(server,
-                                              client: openai.AsyncOpenAI,
+async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
                                               model_name: str):
     messages = [{
         "role": "system",
@@ -626,7 +591,7 @@ async def test_chat_completion_stream_options(server,
     "model_name",
     ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
 )
-async def test_completion_stream_options(server, client: openai.AsyncOpenAI,
+async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                          model_name: str):
     prompt = "What is the capital of France?"
 
@@ -688,8 +653,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME, "zephyr-lora"],
 )
-async def test_batch_completions(server, client: openai.AsyncOpenAI,
-                                 model_name: str):
+async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
     # test simple list
     batch = await client.completions.create(
         model=model_name,
@@ -737,7 +701,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-async def test_logits_bias(server, client: openai.AsyncOpenAI):
+async def test_logits_bias(client: openai.AsyncOpenAI):
     prompt = "Hello, my name is"
     max_tokens = 5
     tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
@@ -786,7 +750,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
-async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
+async def test_guided_json_completion(client: openai.AsyncOpenAI,
                                       guided_decoding_backend: str):
     completion = await client.completions.create(
         model=MODEL_NAME,
@@ -808,7 +772,7 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
-async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
+async def test_guided_json_chat(client: openai.AsyncOpenAI,
                                 guided_decoding_backend: str):
     messages = [{
         "role": "system",
@@ -855,7 +819,7 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
-async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
+async def test_guided_regex_completion(client: openai.AsyncOpenAI,
                                        guided_decoding_backend: str):
     completion = await client.completions.create(
         model=MODEL_NAME,
@@ -875,7 +839,7 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
-async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
+async def test_guided_regex_chat(client: openai.AsyncOpenAI,
                                  guided_decoding_backend: str):
     messages = [{
         "role": "system",
@@ -913,7 +877,7 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
-async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
+async def test_guided_choice_completion(client: openai.AsyncOpenAI,
                                         guided_decoding_backend: str):
     completion = await client.completions.create(
         model=MODEL_NAME,
@@ -933,7 +897,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
-async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
+async def test_guided_choice_chat(client: openai.AsyncOpenAI,
                                   guided_decoding_backend: str):
     messages = [{
         "role": "system",
@@ -972,7 +936,7 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
-async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
+async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
                                           guided_decoding_backend: str):
     with pytest.raises(openai.BadRequestError):
         _ = await client.completions.create(
@@ -1008,7 +972,7 @@ async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
-async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
+async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
                                            guided_decoding_backend: str):
     messages = [{
         "role": "system",
@@ -1040,7 +1004,7 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
-async def test_named_tool_use(server, client: openai.AsyncOpenAI,
+async def test_named_tool_use(client: openai.AsyncOpenAI,
                               guided_decoding_backend: str):
     messages = [{
         "role": "system",
@@ -1131,7 +1095,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
 async def test_required_tool_use_not_yet_supported(
-        server, client: openai.AsyncOpenAI, guided_decoding_backend: str):
+        client: openai.AsyncOpenAI, guided_decoding_backend: str):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -1177,7 +1141,7 @@ async def test_required_tool_use_not_yet_supported(
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
 async def test_inconsistent_tool_choice_and_tools(
-        server, client: openai.AsyncOpenAI, guided_decoding_backend: str):
+        client: openai.AsyncOpenAI, guided_decoding_backend: str):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -1223,7 +1187,7 @@ async def test_inconsistent_tool_choice_and_tools(
 
 
 @pytest.mark.asyncio
-async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
+async def test_response_format_json_object(client: openai.AsyncOpenAI):
     for _ in range(2):
         resp = await client.chat.completions.create(
             model=MODEL_NAME,
@@ -1243,7 +1207,7 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-async def test_extra_fields(server, client: openai.AsyncOpenAI):
+async def test_extra_fields(client: openai.AsyncOpenAI):
     with pytest.raises(BadRequestError) as exc_info:
         await client.chat.completions.create(
             model=MODEL_NAME,
@@ -1259,7 +1223,7 @@ async def test_extra_fields(server, client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-async def test_complex_message_content(server, client: openai.AsyncOpenAI):
+async def test_complex_message_content(client: openai.AsyncOpenAI):
     resp = await client.chat.completions.create(
         model=MODEL_NAME,
         messages=[{
@@ -1279,7 +1243,7 @@ async def test_complex_message_content(server, client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-async def test_custom_role(server, client: openai.AsyncOpenAI):
+async def test_custom_role(client: openai.AsyncOpenAI):
     # Not sure how the model handles custom roles so we just check that
     # both string and complex message content are handled in the same way
 
@@ -1310,7 +1274,7 @@ async def test_custom_role(server, client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-async def test_guided_grammar(server, client: openai.AsyncOpenAI):
+async def test_guided_grammar(client: openai.AsyncOpenAI):
     simple_sql_grammar = """
 start: select_statement
 
@@ -1351,7 +1315,7 @@ number: "1" | "2"
     [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
 )
 @pytest.mark.parametrize("logprobs_arg", [1, 0])
-async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
+async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
                                        model_name: str, logprobs_arg: int):
     tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
     # test using text and token IDs
@@ -1380,7 +1344,7 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-async def test_long_seed(server, client: openai.AsyncOpenAI):
+async def test_long_seed(client: openai.AsyncOpenAI):
     for seed in [
             torch.iinfo(torch.long).min - 1,
             torch.iinfo(torch.long).max + 1
@@ -1399,81 +1363,5 @@ async def test_long_seed(server, client: openai.AsyncOpenAI):
                 or "less_than_equal" in exc_info.value.message)
 
 
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [EMBEDDING_MODEL_NAME],
-)
-async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
-                                model_name: str):
-    input_texts = [
-        "The chef prepared a delicious meal.",
-    ]
-
-    # test single embedding
-    embeddings = await client.embeddings.create(
-        model=model_name,
-        input=input_texts,
-        encoding_format="float",
-    )
-    assert embeddings.id is not None
-    assert len(embeddings.data) == 1
-    assert len(embeddings.data[0].embedding) == 4096
-    assert embeddings.usage.completion_tokens == 0
-    assert embeddings.usage.prompt_tokens == 9
-    assert embeddings.usage.total_tokens == 9
-
-    # test using token IDs
-    input_tokens = [1, 1, 1, 1, 1]
-    embeddings = await client.embeddings.create(
-        model=model_name,
-        input=input_tokens,
-        encoding_format="float",
-    )
-    assert embeddings.id is not None
-    assert len(embeddings.data) == 1
-    assert len(embeddings.data[0].embedding) == 4096
-    assert embeddings.usage.completion_tokens == 0
-    assert embeddings.usage.prompt_tokens == 5
-    assert embeddings.usage.total_tokens == 5
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [EMBEDDING_MODEL_NAME],
-)
-async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI,
-                               model_name: str):
-    # test List[str]
-    input_texts = [
-        "The cat sat on the mat.", "A feline was resting on a rug.",
-        "Stars twinkle brightly in the night sky."
-    ]
-    embeddings = await client.embeddings.create(
-        model=model_name,
-        input=input_texts,
-        encoding_format="float",
-    )
-    assert embeddings.id is not None
-    assert len(embeddings.data) == 3
-    assert len(embeddings.data[0].embedding) == 4096
-
-    # test List[List[int]]
-    input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
-                    [25, 32, 64, 77]]
-    embeddings = await client.embeddings.create(
-        model=model_name,
-        input=input_tokens,
-        encoding_format="float",
-    )
-    assert embeddings.id is not None
-    assert len(embeddings.data) == 4
-    assert len(embeddings.data[0].embedding) == 4096
-    assert embeddings.usage.completion_tokens == 0
-    assert embeddings.usage.prompt_tokens == 17
-    assert embeddings.usage.total_tokens == 17
-
-
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/tests/entrypoints/test_openai_vision.py b/tests/entrypoints/test_openai_vision.py
index cc03b04e0..03dc5d116 100644
--- a/tests/entrypoints/test_openai_vision.py
+++ b/tests/entrypoints/test_openai_vision.py
@@ -8,7 +8,7 @@ import ray
 
 from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64
 
-from ..utils import ServerRunner
+from ..utils import VLLM_PATH, RemoteOpenAIServer
 
 MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
 LLAVA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent /
@@ -25,10 +25,16 @@ TEST_IMAGE_URLS = [
 pytestmark = pytest.mark.openai
 
 
+@pytest.fixture(scope="module")
+def ray_ctx():
+    ray.init(runtime_env={"working_dir": VLLM_PATH})
+    yield
+    ray.shutdown()
+
+
 @pytest.fixture(scope="module")
 def server():
-    ray.init()
-    server_runner = ServerRunner.remote([
+    return RemoteOpenAIServer([
         "--model",
         MODEL_NAME,
         "--dtype",
@@ -47,18 +53,11 @@ def server():
         "--chat-template",
         str(LLAVA_CHAT_TEMPLATE),
     ])
-    ray.get(server_runner.ready.remote())
-    yield server_runner
-    ray.shutdown()
 
 
-@pytest.fixture(scope="session")
-def client():
-    client = openai.AsyncOpenAI(
-        base_url="http://localhost:8000/v1",
-        api_key="token-abc123",
-    )
-    yield client
+@pytest.fixture(scope="module")
+def client(server):
+    return server.get_async_client()
 
 
 @pytest_asyncio.fixture(scope="session")
@@ -73,7 +72,7 @@ async def base64_encoded_image() -> Dict[str, str]:
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
-async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
+async def test_single_chat_session_image(client: openai.AsyncOpenAI,
                                          model_name: str, image_url: str):
     messages = [{
         "role":
@@ -126,7 +125,7 @@ async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
 async def test_single_chat_session_image_base64encoded(
-        server, client: openai.AsyncOpenAI, model_name: str, image_url: str,
+        client: openai.AsyncOpenAI, model_name: str, image_url: str,
         base64_encoded_image: Dict[str, str]):
 
     messages = [{
@@ -180,7 +179,7 @@ async def test_single_chat_session_image_base64encoded(
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
-async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
+async def test_chat_streaming_image(client: openai.AsyncOpenAI,
                                     model_name: str, image_url: str):
     messages = [{
         "role":
@@ -237,8 +236,8 @@ async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
-async def test_multi_image_input(server, client: openai.AsyncOpenAI,
-                                 model_name: str, image_url: str):
+async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
+                                 image_url: str):
 
     messages = [{
         "role":
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index 9656cf5f4..c8f86133f 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -22,11 +22,12 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
                                                          tensorize_vllm_model)
 
 from ..conftest import VllmRunner, cleanup
-from ..utils import ServerRunner
+from ..utils import RemoteOpenAIServer
 
 # yapf conflicts with isort for this docstring
 
 
+
 prompts = [
     "Hello, my name is",
     "The president of the United States is",
@@ -216,18 +217,13 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
     openai_args = [
         "--model", model_ref, "--dtype", "float16", "--load-format",
         "tensorizer", "--model-loader-extra-config",
-        json.dumps(model_loader_extra_config), "--port", "8000"
+        json.dumps(model_loader_extra_config),
     ]
 
-    server = ServerRunner.remote(openai_args)
-
-    assert ray.get(server.ready.remote())
+    server = RemoteOpenAIServer(openai_args)
     print("Server ready.")
 
-    client = openai.OpenAI(
-        base_url="http://localhost:8000/v1",
-        api_key="token-abc123",
-    )
+    client = server.get_client()
     completion = client.completions.create(model=model_ref,
                                            prompt="Hello, my name is",
                                            max_tokens=5,
diff --git a/tests/utils.py b/tests/utils.py
index cc8b86276..c84364d20 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -4,57 +4,109 @@ import sys
 import time
 import warnings
 from contextlib import contextmanager
+from typing import List
 
+import openai
 import ray
 import requests
 
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
+from vllm.entrypoints.openai.cli_args import make_arg_parser
 from vllm.utils import get_open_port
 
 # Path to root of repository so that utilities can be imported by ray workers
 VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))
 
 
-@ray.remote(num_gpus=1)
-class ServerRunner:
+class RemoteOpenAIServer:
+    DUMMY_API_KEY = "token-abc123"  # vLLM's OpenAI server does not need API key
     MAX_SERVER_START_WAIT_S = 600  # wait for server to start for 60 seconds
 
-    def __init__(self, args):
-        env = os.environ.copy()
-        env["PYTHONUNBUFFERED"] = "1"
-        self.proc = subprocess.Popen(
-            [sys.executable, "-m", "vllm.entrypoints.openai.api_server"] +
-            args,
-            env=env,
-            stdout=sys.stdout,
-            stderr=sys.stderr,
+    @ray.remote(num_gpus=1)
+    class _RemoteRunner:
+
+        def __init__(self, cli_args: List[str], *, wait_url: str,
+                     wait_timeout: float) -> None:
+            env = os.environ.copy()
+            env["PYTHONUNBUFFERED"] = "1"
+            self.proc = subprocess.Popen(
+                [
+                    sys.executable, "-m", "vllm.entrypoints.openai.api_server",
+                    *cli_args
+                ],
+                env=env,
+                stdout=sys.stdout,
+                stderr=sys.stderr,
+            )
+
+            self._wait_for_server(url=wait_url, timeout=wait_timeout)
+
+        def ready(self):
+            return True
+
+        def _wait_for_server(self, *, url: str, timeout: float):
+            # run health check
+            start = time.time()
+            while True:
+                try:
+                    if requests.get(url).status_code == 200:
+                        break
+                except Exception as err:
+                    if self.proc.poll() is not None:
+                        raise RuntimeError(
+                            "Server exited unexpectedly.") from err
+
+                    time.sleep(0.5)
+                    if time.time() - start > timeout:
+                        raise RuntimeError(
+                            "Server failed to start in time.") from err
+
+        def __del__(self):
+            if hasattr(self, "proc"):
+                self.proc.terminate()
+
+    def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None:
+        if auto_port:
+            if "-p" in cli_args or "--port" in cli_args:
+                raise ValueError("You have manually specified the port"
+                                 "when `auto_port=True`.")
+
+            cli_args = cli_args + ["--port", str(get_open_port())]
+
+        parser = make_arg_parser()
+        args = parser.parse_args(cli_args)
+        self.host = str(args.host or 'localhost')
+        self.port = int(args.port)
+
+        self._runner = self._RemoteRunner.remote(
+            cli_args,
+            wait_url=self.url_for("health"),
+            wait_timeout=self.MAX_SERVER_START_WAIT_S)
+
+        self._wait_until_ready()
+
+    @property
+    def url_root(self) -> str:
+        return f"http://{self.host}:{self.port}"
+
+    def url_for(self, *parts: str) -> str:
+        return self.url_root + "/" + "/".join(parts)
+
+    def _wait_until_ready(self) -> None:
+        ray.get(self._runner.ready.remote())
+
+    def get_client(self):
+        return openai.OpenAI(
+            base_url=self.url_for("v1"),
+            api_key=self.DUMMY_API_KEY,
+        )
+
+    def get_async_client(self):
+        return openai.AsyncOpenAI(
+            base_url=self.url_for("v1"),
+            api_key=self.DUMMY_API_KEY,
         )
-        self._wait_for_server()
-
-    def ready(self):
-        return True
-
-    def _wait_for_server(self):
-        # run health check
-        start = time.time()
-        while True:
-            try:
-                if requests.get(
-                        "http://localhost:8000/health").status_code == 200:
-                    break
-            except Exception as err:
-                if self.proc.poll() is not None:
-                    raise RuntimeError("Server exited unexpectedly.") from err
-
-                time.sleep(0.5)
-                if time.time() - start > self.MAX_SERVER_START_WAIT_S:
-                    raise RuntimeError(
-                        "Server failed to start in time.") from err
-
-    def __del__(self):
-        if hasattr(self, "proc"):
-            self.proc.terminate()
 
 
 def init_test_distributed_environment(
-- 
GitLab


From 0ce7b952f8eafdb13a7b6de3af53157c7aae98d4 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 14 Jun 2024 02:22:07 +0800
Subject: [PATCH 025/376] [Doc] Update LLaVA docs (#5437)

Co-authored-by: Roger Wang <ywang@roblox.com>
---
 docs/source/models/vlm.rst               |  4 +--
 vllm/model_executor/models/llava.py      | 29 +++++++++++---------
 vllm/model_executor/models/llava_next.py | 34 ++++++++----------------
 3 files changed, 29 insertions(+), 38 deletions(-)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 5ab4157cb..70ac82e20 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -20,9 +20,9 @@ The following :ref:`engine arguments <engine_args>` are specific to VLMs:
     Currently, the support for vision language models on vLLM has the following limitations:
 
     * Only single image input is supported per text prompt.
-    * Dynamic ``image_input_shape`` is not supported: the input image will be resized to the static ``image_input_shape``. This means model output might not exactly match the HuggingFace implementation.
+    * Dynamic ``image_input_shape`` is not supported: the input image will be resized to the static ``image_input_shape``. This means our LLaVA-NeXT output may not exactly match the huggingface implementation.
 
-    We are continuously improving user & developer experience for VLMs. Please raise an issue on GitHub if you have any feedback or feature requests.
+    We are continuously improving user & developer experience for VLMs. Please `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_ if you have any feedback or feature requests.
 
 Offline Batched Inference
 -------------------------
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 67b32a088..39355b9d3 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -227,7 +227,7 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
         attn_metadata: AttentionMetadata,
         **kwargs: object,
     ) -> SamplerOutput:
-        """Run forward pass for Llava 1.5.
+        """Run forward pass for LLaVA-1.5.
 
         One key thing to understand is the `input_ids` already accounts for the
         positions of the to-be-inserted image embeddings.
@@ -247,22 +247,25 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
         This way, the `positions` and `attn_metadata` are consistent
         with the `input_ids`.
 
-        The model takes two types of image inputs:
-        PIXEL_VALUES and IMAGE_FEATURES.
-        The following shows how each maps to huggingface implementation.
-        PIXEL_VALUES:
-        - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L353
-        IMAGE_FEATURES:
-        - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L430
-        before going through the multi modal projector.
+        This model has two modes of image inputs:
+        `PIXEL_VALUES` and `IMAGE_FEATURES`.
 
         Args:
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
-            pixel_values: For PIXEL_VALUES, expects a batch with shape
-                [1, 3, 336, 336].
-            image_features: For IMAGE_FEATURES, expects a batch with shape
-                [1, 576, 1024].
+            pixel_values: The pixels in each input image.
+                Expects a batch with shape `[1, 3, 336, 336]`.
+                (Only applicable to `PIXEL_VALUES` mode)
+            image_features: The image features for each input image outputted by
+                the vision tower before passing to the multi-modal projector.
+                Expects a batch with shape `[1, 576, 1024]`.
+                (Only applicable to `IMAGE_FEATURES` mode)
+
+        See also:
+            Each input maps to huggingface implementation, as follows:
+
+            - `pixel_values`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava/modeling_llava.py#L360
+            - `image_features`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava/modeling_llava.py#L437
         """
         image_input = self._parse_and_validate_image_input(**kwargs)
 
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 57cbd1e4a..0ab9afea9 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -108,15 +108,6 @@ def _image_pixel_processor(
 @MULTIMODAL_REGISTRY.register_image_pixel_input(_image_pixel_processor)
 @MULTIMODAL_REGISTRY.register_dummy_data(_get_dummy_image_data)
 class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
-    """
-    Args to `forward()`:
-        input_ids: Flattened (concatenated) input_ids corresponding to a
-            batch.
-        pixel_values: For PIXEL_VALUES, expects a batch with shape
-            [1, num_patches, 3, 336, 336].
-        image_features: For IMAGE_FEATURES, expects a batch with shape
-            [1, num_patches, 1176, 1024].
-    """
 
     def __init__(self,
                  config: LlavaNextConfig,
@@ -355,7 +346,7 @@ class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
         attn_metadata: AttentionMetadata,
         **kwargs: object,
     ) -> SamplerOutput:
-        """Run forward pass for Llava 1.5.
+        """Run forward pass for LlaVA-NeXT.
 
         One key thing to understand is the `input_ids` already accounts for the
         positions of the to-be-inserted image embeddings.
@@ -375,22 +366,19 @@ class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
         This way, the `positions` and `attn_metadata` are consistent
         with the `input_ids`.
 
-        The model takes two types of image inputs:
-        PIXEL_VALUES and IMAGE_FEATURES.
-        The following shows how each maps to huggingface implementation.
-        PIXEL_VALUES:
-        - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L353
-        IMAGE_FEATURES:
-        - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L430
-        before going through the multi modal projector.
-
         Args:
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
-            pixel_values: For PIXEL_VALUES, expects a batch with shape
-                [1, 3, 336, 336].
-            image_features: For IMAGE_FEATURES, expects a batch with shape
-                [1, 576, 1024].
+            pixel_values: The pixels in each grid patch for each input image.
+                Expects a batch with shape `[1, num_patches, 3, 336, 336]`.
+            image_sizes: The original `(width, height)` for each input image.
+                Expects a batch with shape `[1, 2]`.
+
+        See also:
+            Each input maps to huggingface implementation, as follows:
+
+            - `pixel_values`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava_next/modeling_llava_next.py#L690
+            - `image_sizes`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava_next/modeling_llava_next.py#L691
         """
         image_input = self._parse_and_validate_image_input(**kwargs)
 
-- 
GitLab


From 85657b56071b7c21586d88389c6e817f11c69e04 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Thu, 13 Jun 2024 14:22:19 -0400
Subject: [PATCH 026/376] [Kernel] Factor out epilogues from cutlass kernels
 (#5391)

Co-authored-by: Michael Goin <michael@neuralmagic.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: zifeitong <zifei.tong@parasail.io>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
---
 CMakeLists.txt                                |   8 +-
 .../cutlass_benchmarks/w8a8_benchmarks.py     |   6 +-
 csrc/ops.h                                    |   6 +-
 .../{scaled_mm_dq_c2x.cu => scaled_mm_c2x.cu} | 199 ++++++++++--------
 .../{scaled_mm_dq_c3x.cu => scaled_mm_c3x.cu} | 194 +++++++++--------
 ...aled_mm_dq_entry.cu => scaled_mm_entry.cu} |  48 ++---
 csrc/torch_bindings.cpp                       |   8 +-
 tests/kernels/test_cutlass.py                 |  18 +-
 vllm/_custom_ops.py                           |   9 +-
 .../compressed_tensors_w8a8_dynamictoken.py   |   4 +-
 .../compressed_tensors_w8a8_statictensor.py   |   4 +-
 .../model_executor/layers/quantization/fp8.py |   2 +-
 12 files changed, 274 insertions(+), 232 deletions(-)
 rename csrc/quantization/cutlass_w8a8/{scaled_mm_dq_c2x.cu => scaled_mm_c2x.cu} (71%)
 rename csrc/quantization/cutlass_w8a8/{scaled_mm_dq_c3x.cu => scaled_mm_c3x.cu} (66%)
 rename csrc/quantization/cutlass_w8a8/{scaled_mm_dq_entry.cu => scaled_mm_entry.cu} (50%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ad6736c47..aa15b632c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -179,9 +179,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/quantization/gptq_marlin/gptq_marlin.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
     "csrc/custom_all_reduce.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu")
+    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
 
   #
   # The CUTLASS kernels for Hopper require sm90a to be enabled.
@@ -189,7 +189,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
     set_source_files_properties(
-          "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
+          "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
           PROPERTIES
           COMPILE_FLAGS
           "-gencode arch=compute_90a,code=sm_90a")
diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
index 6de56f618..182105f0b 100644
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -76,11 +76,7 @@ def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
 def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
                  scale_b: torch.tensor,
                  out_dtype: torch.dtype) -> torch.tensor:
-    return ops.cutlass_scaled_mm_dq(a,
-                                    b,
-                                    scale_a,
-                                    scale_b,
-                                    out_dtype=out_dtype)
+    return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)
 
 
 # bench
diff --git a/csrc/ops.h b/csrc/ops.h
index 0c270a78c..9e2e977fa 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -90,9 +90,9 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
                                  int64_t size_k, int64_t size_n,
                                  int64_t num_bits);
 
-void cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a,
-                          torch::Tensor const& b, torch::Tensor const& a_scales,
-                          torch::Tensor const& b_scales);
+void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
+                       torch::Tensor const& b, torch::Tensor const& a_scales,
+                       torch::Tensor const& b_scales);
 
 #endif
 
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
similarity index 71%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu
rename to csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
index 23a8b4070..7651268dc 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
@@ -29,21 +29,14 @@
 using namespace cute;
 
 /*
-   This defines a quantized GEMM operation with dequantized output, similar to
-   torch._scaled_mm. It is defined using the CUTLASS 2.x API, and is used for
+   This file defines quantized GEMM operations using the CUTLASS 2.x API, for
    NVIDIA GPUs with SM versions prior to sm90 (Hopper).
 
-   A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or
-   per-row. B can be quantized per-tensor or per-column.
-   Any combination of per-tensor and per-row or column is supported.
-   A and B must have symmetric quantization (zero point == 0).
-
-   So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
-   scales are applied elementwise with numpy-style broadcasting.
-
-   ScaleA and ScaleB define the epilogue functions that apply the scales for
-   the A and B operands respectively. These scales may be either per-tensor or
-   per row or column.
+   Epilogue functions can be defined to post-process the output before it is
+   written to GPU memory.
+   Epilogues must contain a public type named EVTCompute of type Sm80EVT,
+   as well as a static prepare_args function that constructs an
+   EVTCompute::Arguments struct.
 */
 
 namespace {
@@ -83,27 +76,25 @@ struct enable_sm89_to_sm90 : Kernel {
   }
 };
 
-template <typename Arch, template <typename> typename ArchGuard,
-          typename ElementAB_, typename ElementD_, typename TileShape,
-          typename WarpShape, typename InstructionShape, int32_t MainLoopStages>
-struct cutlass_2x_gemm {
-  using ElementAB = ElementAB_;
-  using ElementD = ElementD_;
-
-  using ElementAcc =
-      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
-                                float>::type;
+/*
+   This epilogue function defines a quantized GEMM operation similar to
+   torch._scaled_mm.
 
-  using Operator =
-      typename std::conditional<std::is_same_v<ElementAB, int8_t>,
-                                cutlass::arch::OpMultiplyAddSaturate,
-                                cutlass::arch::OpMultiplyAdd>::type;
+   A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or
+   per-row. B can be quantized per-tensor or per-column.
+   Any combination of per-tensor and per-row or column is supported.
+   A and B must have symmetric quantization (zero point == 0).
 
-  using OutputTileThreadMap =
-      cutlass::epilogue::threadblock::OutputTileThreadLayout<
-          TileShape, WarpShape, float, 4, 1 /* epilogue stages */
-          >;
+   So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
+   scales are applied elementwise with numpy-style broadcasting.
 
+   ScaleA and ScaleB define the epilogue functions that apply the scales for
+   the A and B operands respectively. These scales may be either per-tensor or
+   per row or column.
+*/
+template <typename ElementD, typename OutputTileThreadMap>
+struct ScaledEpilogue {
+ private:
   using Accum = cutlass::epilogue::threadblock::VisitorAccFetch;
 
   using ScaleA = cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast<
@@ -123,14 +114,56 @@ struct cutlass_2x_gemm {
       cutlass::multiplies, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;
 
-  using EVTCompute1 =
+ public:
+  using EVTCompute =
       cutlass::epilogue::threadblock::Sm80EVT<Compute1, ScaleA, EVTCompute0>;
+  using ArgumentType = typename EVTCompute::Arguments;
+
+  static ArgumentType prepare_args(torch::Tensor const& a_scales,
+                                   torch::Tensor const& b_scales) {
+    using ScaleAArgs = typename ScaleA::Arguments;
+    using ScaleBArgs = typename ScaleB::Arguments;
+
+    ScaleBArgs b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
+    ScaleAArgs a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
+
+    typename EVTCompute0::Arguments evt0_compute_args{b_args};
+
+    typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args};
+    return evt_compute_args;
+  }
+};
+
+template <typename Arch, template <typename> typename ArchGuard,
+          typename ElementAB_, typename ElementD_,
+          template <typename, typename> typename Epilogue_, typename TileShape,
+          typename WarpShape, typename InstructionShape, int32_t MainLoopStages>
+struct cutlass_2x_gemm {
+  using ElementAB = ElementAB_;
+  using ElementD = ElementD_;
+
+  using ElementAcc =
+      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
+                                float>::type;
+
+  using Operator =
+      typename std::conditional<std::is_same_v<ElementAB, int8_t>,
+                                cutlass::arch::OpMultiplyAddSaturate,
+                                cutlass::arch::OpMultiplyAdd>::type;
+
+  using OutputTileThreadMap =
+      cutlass::epilogue::threadblock::OutputTileThreadLayout<
+          TileShape, WarpShape, float, 4, 1 /* epilogue stages */
+          >;
+
+  using Epilogue = Epilogue_<ElementD, OutputTileThreadMap>;
+  using EVTCompute = typename Epilogue::EVTCompute;
 
   using D = cutlass::epilogue::threadblock::VisitorAuxStore<
       OutputTileThreadMap, ElementD, cutlass::FloatRoundStyle::round_to_nearest,
       Stride<int64_t, Int<1>, Int<0>>>;
 
-  using EVTD = cutlass::epilogue::threadblock::Sm80EVT<D, EVTCompute1>;
+  using EVTD = cutlass::epilogue::threadblock::Sm80EVT<D, EVTCompute>;
 
   // clang-format off
   using RowMajor = typename cutlass::layout::RowMajor;
@@ -153,11 +186,10 @@ struct cutlass_2x_gemm {
   using Op = cutlass::gemm::device::GemmUniversalAdapter<KernelType>;
 };
 
-template <typename Gemm>
-void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a,
-                                     torch::Tensor const& b,
-                                     torch::Tensor const& a_scales,
-                                     torch::Tensor const& b_scales) {
+template <typename Gemm, typename... EpilogueArgs>
+void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
+                         torch::Tensor const& b,
+                         EpilogueArgs&&... epilogue_params) {
   using ElementAB = typename Gemm::ElementAB;
   using ElementD = typename Gemm::ElementD;
 
@@ -177,23 +209,14 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a,
   auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
   auto c_ptr = static_cast<ElementD*>(out.data_ptr());
 
-  auto a_scales_ptr = a_scales.data_ptr<float>();
-  auto b_scales_ptr = b_scales.data_ptr<float>();
-
-  using ScaleAArgs = typename Gemm::ScaleA::Arguments;
-  using ScaleBArgs = typename Gemm::ScaleB::Arguments;
-
-  ScaleBArgs b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
-  ScaleAArgs a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
-
-  typename Gemm::EVTCompute0::Arguments evt0_compute_args{b_args};
-
-  typename Gemm::EVTCompute1::Arguments evt1_compute_args{a_args,
-                                                          evt0_compute_args};
   typename Gemm::D::Arguments d_args{c_ptr, c_stride};
 
+  using Epilogue = typename Gemm::Epilogue;
+  auto evt_args =
+      Epilogue::prepare_args(std::forward<EpilogueArgs>(epilogue_params)...);
+
   typename Gemm::EVTD::Arguments epilogue_args{
-      evt1_compute_args,
+      evt_args,
       d_args,
   };
 
@@ -229,10 +252,10 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a,
 
 }  // namespace
 
-void cutlass_scaled_mm_dq_sm75(torch::Tensor& out, torch::Tensor const& a,
-                               torch::Tensor const& b,
-                               torch::Tensor const& a_scales,
-                               torch::Tensor const& b_scales) {
+void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a,
+                            torch::Tensor const& b,
+                            torch::Tensor const& a_scales,
+                            torch::Tensor const& b_scales) {
   TORCH_CHECK(a.dtype() == torch::kInt8);
   TORCH_CHECK(b.dtype() == torch::kInt8);
   TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
@@ -243,23 +266,23 @@ void cutlass_scaled_mm_dq_sm75(torch::Tensor& out, torch::Tensor const& a,
   using InstructionShape = typename cutlass::gemm::GemmShape<8, 8, 16>;
 
   if (out.dtype() == torch::kBFloat16) {
-    return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
+    return cutlass_gemm_caller<cutlass_2x_gemm<
         cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::bfloat16_t,
-        TileShape, WarpShape, InstructionShape, 2>>(out, a, b, a_scales,
-                                                    b_scales);
+        ScaledEpilogue, TileShape, WarpShape, InstructionShape, 2>>(
+        out, a, b, a_scales, b_scales);
   } else {
     TORCH_CHECK(out.dtype() == torch::kFloat16);
-    return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
+    return cutlass_gemm_caller<cutlass_2x_gemm<
         cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::half_t,
-        TileShape, WarpShape, InstructionShape, 2>>(out, a, b, a_scales,
-                                                    b_scales);
+        ScaledEpilogue, TileShape, WarpShape, InstructionShape, 2>>(
+        out, a, b, a_scales, b_scales);
   }
 }
 
-void cutlass_scaled_mm_dq_sm80(torch::Tensor& out, torch::Tensor const& a,
-                               torch::Tensor const& b,
-                               torch::Tensor const& a_scales,
-                               torch::Tensor const& b_scales) {
+void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a,
+                            torch::Tensor const& b,
+                            torch::Tensor const& a_scales,
+                            torch::Tensor const& b_scales) {
   TORCH_CHECK(a.dtype() == torch::kInt8);
   TORCH_CHECK(b.dtype() == torch::kInt8);
   TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
@@ -270,23 +293,23 @@ void cutlass_scaled_mm_dq_sm80(torch::Tensor& out, torch::Tensor const& a,
   using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
 
   if (out.dtype() == torch::kBFloat16) {
-    return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
+    return cutlass_gemm_caller<cutlass_2x_gemm<
         cutlass::arch::Sm80, enable_sm80_to_sm89, int8_t, cutlass::bfloat16_t,
-        TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
-                                                    b_scales);
+        ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>(
+        out, a, b, a_scales, b_scales);
   } else {
     TORCH_CHECK(out.dtype() == torch::kFloat16);
-    return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
+    return cutlass_gemm_caller<cutlass_2x_gemm<
         cutlass::arch::Sm80, enable_sm80_to_sm89, int8_t, cutlass::half_t,
-        TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
-                                                    b_scales);
+        ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>(
+        out, a, b, a_scales, b_scales);
   }
 }
 
-void cutlass_scaled_mm_dq_sm89(torch::Tensor& out, torch::Tensor const& a,
-                               torch::Tensor const& b,
-                               torch::Tensor const& a_scales,
-                               torch::Tensor const& b_scales) {
+void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a,
+                            torch::Tensor const& b,
+                            torch::Tensor const& a_scales,
+                            torch::Tensor const& b_scales) {
   using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
   using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
   using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
@@ -298,32 +321,32 @@ void cutlass_scaled_mm_dq_sm89(torch::Tensor& out, torch::Tensor const& a,
     TORCH_CHECK(b.dtype() == torch::kInt8);
 
     if (out.dtype() == torch::kBFloat16) {
-      return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
+      return cutlass_gemm_caller<cutlass_2x_gemm<
           cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::bfloat16_t,
-          TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
-                                                      b_scales);
+          ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>(
+          out, a, b, a_scales, b_scales);
     } else {
       assert(out.dtype() == torch::kFloat16);
-      return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
+      return cutlass_gemm_caller<cutlass_2x_gemm<
           cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::half_t,
-          TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
-                                                      b_scales);
+          ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>(
+          out, a, b, a_scales, b_scales);
     }
   } else {
     TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
     TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
 
     if (out.dtype() == torch::kBFloat16) {
-      return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
+      return cutlass_gemm_caller<cutlass_2x_gemm<
           cutlass::arch::Sm89, enable_sm89_to_sm90, cutlass::float_e4m3_t,
-          cutlass::bfloat16_t, TileShape, WarpShape, InstructionShape, 5>>(
-          out, a, b, a_scales, b_scales);
+          cutlass::bfloat16_t, ScaledEpilogue, TileShape, WarpShape,
+          InstructionShape, 5>>(out, a, b, a_scales, b_scales);
     } else {
       TORCH_CHECK(out.dtype() == torch::kFloat16);
-      return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
+      return cutlass_gemm_caller<cutlass_2x_gemm<
           cutlass::arch::Sm89, enable_sm89_to_sm90, cutlass::float_e4m3_t,
-          cutlass::half_t, TileShape, WarpShape, InstructionShape, 5>>(
-          out, a, b, a_scales, b_scales);
+          cutlass::half_t, ScaledEpilogue, TileShape, WarpShape,
+          InstructionShape, 5>>(out, a, b, a_scales, b_scales);
     }
   }
 }
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
similarity index 66%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu
rename to csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
index a99802153..f1a2b73ff 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
@@ -32,21 +32,14 @@
 using namespace cute;
 
 /*
-   This defines a quantized GEMM operation with dequantized output, similar to
-   torch._scaled_mm. It is defined using the CUTLASS 3.x API, and is used for
+   This file defines quantized GEMM operations using the CUTLASS 3.x API, for
    NVIDIA GPUs with sm90a (Hopper) or later.
 
-   A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or
-   per-row. B can be quantized per-tensor or per-column.
-   Any combination of per-tensor and per-row or column is supported.
-   A and B must have symmetric quantization (zero point == 0).
-
-   So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
-   scales are applied elementwise with numpy-style broadcasting.
-
-   ScaleA and ScaleB define the epilogue functions that apply the scales for
-   the A and B operands respectively. These scales may be either per-tensor or
-   per row or column.
+   Epilogue functions can be defined to post-process the output before it is
+   written to GPU memory.
+   Epilogues must contain a public type named EVTCompute of type Sm90EVT,
+   as well as a static prepare_args function that constructs an
+   EVTCompute::Arguments struct.
 */
 
 namespace {
@@ -71,21 +64,25 @@ struct enable_sm90_or_later : Kernel {
   }
 };
 
-template <typename ElementAB_, typename ElementD_, typename TileShape,
-          typename ClusterShape, typename KernelSchedule,
-          typename EpilogueSchedule>
-struct cutlass_3x_gemm {
-  using ElementAB = ElementAB_;
-  using ElementD = ElementD_;
-  using ElementAcc =
-      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
-                                float>::type;
+/*
+   This epilogue function defines a quantized GEMM operation similar to
+   torch.scaled_mm_.
 
-  using EpilogueDescriptor =
-      cutlass::epilogue::collective::detail::EpilogueDescriptor<
-          TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD,
-          ElementD, EpilogueSchedule>;
+   A and B may be both either int8 or fp8_e4m3. A can be
+   quantized per-tensor or per-row. B can be quantized per-tensor or per-column.
+   Any combination of per-tensor and per-row or column is supported.
+   A and B must have symmetric quantization (zero point == 0).
 
+   So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
+   scales are applied elementwise with numpy-style broadcasting.
+
+   ScaleA and ScaleB define the epilogue functions that apply the scales for
+   the A and B operands respectively. These scales may be either per-tensor or
+   per row or column.
+*/
+template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
+struct ScaledEpilogue {
+ private:
   using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
 
   using ScaleA = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast<
@@ -111,19 +108,53 @@ struct cutlass_3x_gemm {
       cutlass::multiplies, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;
 
-  using EVTCompute1 =
+ public:
+  using EVTCompute =
       cutlass::epilogue::fusion::Sm90EVT<Compute1, ScaleA, EVTCompute0>;
+  using ArgumentType = typename EVTCompute::Arguments;
+
+  static ArgumentType prepare_args(torch::Tensor const& a_scales,
+                                   torch::Tensor const& b_scales) {
+    using ScaleA_Args = typename ScaleA::Arguments;
+    using ScaleB_Args = typename ScaleB::Arguments;
+
+    ScaleA_Args a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
+    ScaleB_Args b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
+
+    return ArgumentType{a_args, {b_args}};
+  }
+};
+
+template <typename ElementAB_, typename ElementD_,
+          template <typename, typename, typename> typename Epilogue_,
+          typename TileShape, typename ClusterShape, typename KernelSchedule,
+          typename EpilogueSchedule>
+struct cutlass_3x_gemm {
+  using ElementAB = ElementAB_;
+  using ElementD = ElementD_;
+  using ElementAcc =
+      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
+                                float>::type;
+
+  using EpilogueDescriptor =
+      cutlass::epilogue::collective::detail::EpilogueDescriptor<
+          TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD,
+          ElementD, EpilogueSchedule>;
+
+  using Epilogue = Epilogue_<ElementAcc, ElementD, EpilogueDescriptor>;
 
   using StrideD = Stride<int64_t, Int<1>, Int<0>>;
   using ElementC = void;
   using StrideC = StrideD;
 
+  using EVTCompute = typename Epilogue::EVTCompute;
+
   using CollectiveEpilogue =
       typename cutlass::epilogue::collective::CollectiveBuilder<
           cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape,
           ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
           ElementAcc, float, ElementC, StrideC, 4, ElementD, StrideD, 4,
-          EpilogueSchedule, EVTCompute1>::CollectiveOp;
+          EpilogueSchedule, EVTCompute>::CollectiveOp;
 
   static constexpr size_t CEStorageSize =
       sizeof(typename CollectiveEpilogue::SharedStorage);
@@ -148,11 +179,10 @@ struct cutlass_3x_gemm {
   struct GemmKernel : public KernelType {};
 };
 
-template <typename Gemm>
-void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a,
-                                     torch::Tensor const& b,
-                                     torch::Tensor const& a_scales,
-                                     torch::Tensor const& b_scales) {
+template <typename Gemm, typename... EpilogueArgs>
+void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
+                         torch::Tensor const& b,
+                         EpilogueArgs&&... epilogue_params) {
   using ElementAB = typename Gemm::ElementAB;
   using ElementD = typename Gemm::ElementD;
 
@@ -182,19 +212,13 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a,
 
   auto c_ptr = static_cast<ElementD*>(out.data_ptr());
   typename GemmKernel::EpilogueArguments epilogue_args{
-      {}, c_ptr, c_stride, c_ptr, c_stride};
+      Gemm::Epilogue::prepare_args(
+          std::forward<EpilogueArgs>(epilogue_params)...),
+      c_ptr, c_stride, c_ptr, c_stride};
 
   typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm,
                                       prob_shape, mainloop_args, epilogue_args};
 
-  using ScaleA_Args = typename Gemm::ScaleA::Arguments;
-  using ScaleB_Args = typename Gemm::ScaleB::Arguments;
-
-  ScaleA_Args a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
-  ScaleB_Args b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
-
-  args.epilogue.thread = {a_args, {b_args}};
-
   // Launch the CUTLASS GEMM kernel.
   using GemmOp = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
   GemmOp gemm_op;
@@ -209,7 +233,8 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a,
   CUTLASS_CHECK(status);
 }
 
-template <typename InType, typename OutType, int32_t M>
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue, int32_t M>
 struct sm90_fp8_config {
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule =
@@ -219,12 +244,13 @@ struct sm90_fp8_config {
   using ClusterShape = Shape<_2, _1, _1>;
 
   using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, TileShape, ClusterShape, KernelSchedule,
-                      EpilogueSchedule>;
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
 };
 
-template <typename InType, typename OutType>
-struct sm90_fp8_config<InType, OutType, 128> {
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config<InType, OutType, Epilogue, 128> {
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule =
       cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
@@ -233,12 +259,13 @@ struct sm90_fp8_config<InType, OutType, 128> {
   using ClusterShape = Shape<_2, _1, _1>;
 
   using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, TileShape, ClusterShape, KernelSchedule,
-                      EpilogueSchedule>;
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
 };
 
-template <typename InType, typename OutType>
-struct sm90_fp8_config<InType, OutType, 64> {
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config<InType, OutType, Epilogue, 64> {
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule =
       cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
@@ -247,30 +274,28 @@ struct sm90_fp8_config<InType, OutType, 64> {
   using ClusterShape = Shape<_1, _8, _1>;
 
   using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, TileShape, ClusterShape, KernelSchedule,
-                      EpilogueSchedule>;
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
 };
 
 }  // namespace
 
-template <typename InType, typename OutType>
-void cutlass_scaled_mm_dq_sm90_fp8_dispatch(torch::Tensor& out,
-                                            torch::Tensor const& a,
-                                            torch::Tensor const& b,
-                                            torch::Tensor const& a_scales,
-                                            torch::Tensor const& b_scales) {
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a,
+                                    torch::Tensor const& b,
+                                    EpilogueArgs&&... args) {
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
   TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
-  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
-  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
 
   using Cutlass3xGemmDefault =
-      typename sm90_fp8_config<InType, OutType, 0>::Cutlass3xGemm;
+      typename sm90_fp8_config<InType, OutType, Epilogue, 0>::Cutlass3xGemm;
   using Cutlass3xGemmM64 =
-      typename sm90_fp8_config<InType, OutType, 64>::Cutlass3xGemm;
+      typename sm90_fp8_config<InType, OutType, Epilogue, 64>::Cutlass3xGemm;
   using Cutlass3xGemmM128 =
-      typename sm90_fp8_config<InType, OutType, 128>::Cutlass3xGemm;
+      typename sm90_fp8_config<InType, OutType, Epilogue, 128>::Cutlass3xGemm;
 
   uint32_t const m = a.size(0);
   uint32_t const mp2 =
@@ -278,23 +303,23 @@ void cutlass_scaled_mm_dq_sm90_fp8_dispatch(torch::Tensor& out,
 
   if (mp2 <= 64) {
     // m in [1, 64]
-    return cutlass_scaled_mm_dq_dispatcher<Cutlass3xGemmM64>(
-        out, a, b, a_scales, b_scales);
+    return cutlass_gemm_caller<Cutlass3xGemmM64>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
   } else if (mp2 <= 128) {
     // m in (64, 128]
-    return cutlass_scaled_mm_dq_dispatcher<Cutlass3xGemmM128>(
-        out, a, b, a_scales, b_scales);
+    return cutlass_gemm_caller<Cutlass3xGemmM128>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
   } else {
     // m in (128, inf)
-    return cutlass_scaled_mm_dq_dispatcher<Cutlass3xGemmDefault>(
-        out, a, b, a_scales, b_scales);
+    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
   }
 }
 
-void cutlass_scaled_mm_dq_sm90(torch::Tensor& out, torch::Tensor const& a,
-                               torch::Tensor const& b,
-                               torch::Tensor const& a_scales,
-                               torch::Tensor const& b_scales) {
+void cutlass_scaled_mm_sm90(torch::Tensor& out, torch::Tensor const& a,
+                            torch::Tensor const& b,
+                            torch::Tensor const& a_scales,
+                            torch::Tensor const& b_scales) {
   TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
   TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
 
@@ -308,16 +333,15 @@ void cutlass_scaled_mm_dq_sm90(torch::Tensor& out, torch::Tensor const& a,
     using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
 
     if (out.dtype() == torch::kBFloat16) {
-      return cutlass_scaled_mm_dq_dispatcher<
-          cutlass_3x_gemm<int8_t, cutlass::bfloat16_t, TileShape, ClusterShape,
-                          KernelSchedule, EpilogueSchedule>>(
-          out, a, b, a_scales, b_scales);
+      return cutlass_gemm_caller<cutlass_3x_gemm<
+          int8_t, cutlass::bfloat16_t, ScaledEpilogue, TileShape, ClusterShape,
+          KernelSchedule, EpilogueSchedule>>(out, a, b, a_scales, b_scales);
     } else {
       TORCH_CHECK(out.dtype() == torch::kFloat16);
 
-      return cutlass_scaled_mm_dq_dispatcher<
-          cutlass_3x_gemm<int8_t, cutlass::half_t, TileShape, ClusterShape,
-                          KernelSchedule, EpilogueSchedule>>(
+      return cutlass_gemm_caller<
+          cutlass_3x_gemm<int8_t, cutlass::half_t, ScaledEpilogue, TileShape,
+                          ClusterShape, KernelSchedule, EpilogueSchedule>>(
           out, a, b, a_scales, b_scales);
     }
   } else {
@@ -325,13 +349,13 @@ void cutlass_scaled_mm_dq_sm90(torch::Tensor& out, torch::Tensor const& a,
     TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
 
     if (out.dtype() == torch::kBFloat16) {
-      return cutlass_scaled_mm_dq_sm90_fp8_dispatch<cutlass::float_e4m3_t,
-                                                    cutlass::bfloat16_t>(
+      return cutlass_gemm_sm90_fp8_dispatch<
+          cutlass::float_e4m3_t, cutlass::bfloat16_t, ScaledEpilogue>(
           out, a, b, a_scales, b_scales);
     } else {
       TORCH_CHECK(out.dtype() == torch::kFloat16);
-      return cutlass_scaled_mm_dq_sm90_fp8_dispatch<cutlass::float_e4m3_t,
-                                                    cutlass::half_t>(
+      return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
+                                            cutlass::half_t, ScaledEpilogue>(
           out, a, b, a_scales, b_scales);
     }
   }
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
similarity index 50%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu
rename to csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
index 423e64a49..687f8efd8 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -3,31 +3,31 @@
 #include <c10/cuda/CUDAGuard.h>
 #include <torch/all.h>
 
-void cutlass_scaled_mm_dq_sm75(torch::Tensor& c, torch::Tensor const& a,
-                               torch::Tensor const& b,
-                               torch::Tensor const& a_scales,
-                               torch::Tensor const& b_scales);
+void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a,
+                            torch::Tensor const& b,
+                            torch::Tensor const& a_scales,
+                            torch::Tensor const& b_scales);
 
-void cutlass_scaled_mm_dq_sm80(torch::Tensor& c, torch::Tensor const& a,
-                               torch::Tensor const& b,
-                               torch::Tensor const& a_scales,
-                               torch::Tensor const& b_scales);
+void cutlass_scaled_mm_sm80(torch::Tensor& c, torch::Tensor const& a,
+                            torch::Tensor const& b,
+                            torch::Tensor const& a_scales,
+                            torch::Tensor const& b_scales);
 
-void cutlass_scaled_mm_dq_sm89(torch::Tensor& c, torch::Tensor const& a,
-                               torch::Tensor const& b,
-                               torch::Tensor const& a_scales,
-                               torch::Tensor const& b_scales);
+void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a,
+                            torch::Tensor const& b,
+                            torch::Tensor const& a_scales,
+                            torch::Tensor const& b_scales);
 
 #if defined CUDA_VERSION && CUDA_VERSION >= 12000
-void cutlass_scaled_mm_dq_sm90(torch::Tensor& c, torch::Tensor const& a,
-                               torch::Tensor const& b,
-                               torch::Tensor const& a_scales,
-                               torch::Tensor const& b_scales);
+void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
+                            torch::Tensor const& b,
+                            torch::Tensor const& a_scales,
+                            torch::Tensor const& b_scales);
 #endif
 
-void cutlass_scaled_mm_dq(torch::Tensor& c, torch::Tensor const& a,
-                          torch::Tensor const& b, torch::Tensor const& a_scales,
-                          torch::Tensor const& b_scales) {
+void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
+                       torch::Tensor const& b, torch::Tensor const& a_scales,
+                       torch::Tensor const& b_scales) {
   int32_t major_capability;
   int32_t minor_capability;
   cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
@@ -57,19 +57,19 @@ void cutlass_scaled_mm_dq(torch::Tensor& c, torch::Tensor const& a,
 
     // Guard against compilation issues for sm90 kernels
 #if defined CUDA_VERSION && CUDA_VERSION >= 12000
-    cutlass_scaled_mm_dq_sm90(c, a, b, a_scales, b_scales);
+    cutlass_scaled_mm_sm90(c, a, b, a_scales, b_scales);
 #else
-    cutlass_scaled_mm_dq_sm80(c, a, b, a_scales, b_scales);
+    cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales);
 #endif
   } else if (version_num == 89) {
     // Ada Lovelace
-    cutlass_scaled_mm_dq_sm89(c, a, b, a_scales, b_scales);
+    cutlass_scaled_mm_sm89(c, a, b, a_scales, b_scales);
   } else if (version_num >= 80) {
     // Ampere
-    cutlass_scaled_mm_dq_sm80(c, a, b, a_scales, b_scales);
+    cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales);
   } else {
     // Turing
     TORCH_CHECK(version_num >= 75);
-    cutlass_scaled_mm_dq_sm75(c, a, b, a_scales, b_scales);
+    cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales);
   }
 }
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index df2603544..867bf4389 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -136,10 +136,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
   // quantization.
   ops.def(
-      "cutlass_scaled_mm_dq(Tensor! out, Tensor a,"
-      "                     Tensor b, Tensor a_scales,"
-      "                     Tensor b_scales) -> ()");
-  ops.impl("cutlass_scaled_mm_dq", torch::kCUDA, &cutlass_scaled_mm_dq);
+      "cutlass_scaled_mm(Tensor! out, Tensor a,"
+      "                  Tensor b, Tensor a_scales,"
+      "                  Tensor b_scales) -> ()");
+  ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm);
 #endif
 
   // Quantized GEMM for GPTQ.
diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py
index 079d9650c..777138ace 100644
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -47,7 +47,7 @@ def cutlass_fp8_gemm_helper(m: int,
     scale_b = (torch.randn(
         (1, n_b_scales), device=device, dtype=torch.float32) / 10)
 
-    out = ops.cutlass_scaled_mm_dq(a, b, scale_a, scale_b, out_dtype)
+    out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype)
     baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
                         scale_b * b.to(dtype=torch.float32)).to(out_dtype)
 
@@ -74,7 +74,7 @@ def cutlass_int8_gemm_helper(m: int,
     scale_b = (torch.randn(
         (1, n_b_scales), device=device, dtype=torch.float32) / 10)
 
-    out = ops.cutlass_scaled_mm_dq(a, b, scale_a, scale_b, out_dtype)
+    out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype)
     baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
                         scale_b *
                         b.to(dtype=torch.float32)).to(dtype=out_dtype)
@@ -180,11 +180,11 @@ def test_cutlass_subset():
     scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
     scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
 
-    out = ops.cutlass_scaled_mm_dq(a,
-                                   b,
-                                   scale_a,
-                                   scale_b,
-                                   out_dtype=torch.bfloat16)
+    out = ops.cutlass_scaled_mm(a,
+                                b,
+                                scale_a,
+                                scale_b,
+                                out_dtype=torch.bfloat16)
     baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
                         scale_b *
                         b.to(dtype=torch.float32)).to(dtype=torch.bfloat16)
@@ -203,8 +203,8 @@ class CutlassLayer(torch.nn.Module):
         self.out_dtype = out_dtype
 
     def forward(self, a):
-        return ops.cutlass_scaled_mm_dq(a, self.b, self.scale_a, self.scale_b,
-                                        self.out_dtype)
+        return ops.cutlass_scaled_mm(a, self.b, self.scale_a, self.scale_b,
+                                     self.out_dtype)
 
 
 @pytest.mark.parametrize("per_act_token", [True, False])
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 955086be1..2f84b8bde 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -212,9 +212,9 @@ def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
 
 
 # cutlass
-def cutlass_scaled_mm_dq(a: torch.Tensor, b: torch.Tensor,
-                         scale_a: torch.Tensor, scale_b: torch.Tensor,
-                         out_dtype: Type[torch.dtype]) -> torch.Tensor:
+def cutlass_scaled_mm(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+                      scale_b: torch.Tensor,
+                      out_dtype: Type[torch.dtype]) -> torch.Tensor:
     assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
     assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
 
@@ -222,8 +222,7 @@ def cutlass_scaled_mm_dq(a: torch.Tensor, b: torch.Tensor,
     n = b.shape[1]
     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
 
-    torch.ops._C.cutlass_scaled_mm_dq(out, a, b, scale_a, scale_b)
-
+    torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b)
     return out
 
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
index 25b707cae..9bb7bf447 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
@@ -81,5 +81,5 @@ class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme):
         weight_scale = layer.weight_scale
 
         x_q, input_scales = custom_ops.scaled_int8_quant(x)
-        return custom_ops.cutlass_scaled_mm_dq(x_q, weight.t(), input_scales,
-                                               weight_scale, x.dtype)
+        return custom_ops.cutlass_scaled_mm(x_q, weight.t(), input_scales,
+                                            weight_scale, x.dtype)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
index 7559fc0f9..88c15c5c2 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
@@ -99,5 +99,5 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme):
         # Input quantize
         x_q, _ = custom_ops.scaled_int8_quant(x, act_scale)
 
-        return custom_ops.cutlass_scaled_mm_dq(x_q, weight.t(), act_scale,
-                                               weight_scale, x.dtype)
+        return custom_ops.cutlass_scaled_mm(x_q, weight.t(), act_scale,
+                                            weight_scale, x.dtype)
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 0cf2bd927..e89fd6581 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -261,7 +261,7 @@ class Fp8LinearMethod(LinearMethodBase):
             qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale)
 
             # Fused GEMM_DQ
-            output = ops.cutlass_scaled_mm_dq(
+            output = ops.cutlass_scaled_mm(
                 qinput,
                 layer.weight,
                 out_dtype=x.dtype,
-- 
GitLab


From 30299a41fa78c7bf485aca7ef8ad584ca340a64d Mon Sep 17 00:00:00 2001
From: Cody Yu <hao.yu.cody@gmail.com>
Date: Thu, 13 Jun 2024 11:22:30 -0700
Subject: [PATCH 027/376] [MISC] Remove FP8 warning (#5472)

Co-authored-by: Philipp Moritz <pcmoritz@gmail.com>
---
 vllm/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/config.py b/vllm/config.py
index 2513d43ce..76c10d464 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -212,7 +212,7 @@ class ModelConfig:
                     f"{self.quantization} quantization is currently not "
                     f"supported in ROCm.")
             if (self.quantization
-                    not in ["marlin", "gptq_marlin_24", "gptq_marlin"]):
+                    not in ("fp8", "marlin", "gptq_marlin_24", "gptq_marlin")):
                 logger.warning(
                     "%s quantization is not fully "
                     "optimized yet. The speed can be slower than "
-- 
GitLab


From a8fda4f66131e211ac1e64f6b1d74123e0347a1c Mon Sep 17 00:00:00 2001
From: Antoni Baum <antoni.baum@protonmail.com>
Date: Thu, 13 Jun 2024 11:22:41 -0700
Subject: [PATCH 028/376] Seperate dev requirements into lint and test (#5474)

---
 Dockerfile            |  2 ++
 requirements-dev.txt  | 40 ++++------------------------------------
 requirements-lint.txt | 14 ++++++++++++++
 requirements-test.txt | 22 ++++++++++++++++++++++
 4 files changed, 42 insertions(+), 36 deletions(-)
 create mode 100644 requirements-lint.txt
 create mode 100644 requirements-test.txt

diff --git a/Dockerfile b/Dockerfile
index 62c401069..72894e7ca 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -27,6 +27,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements-cuda.txt
 
 # install development dependencies
+COPY requirements-lint.txt requirements-lint.txt
+COPY requirements-test.txt requirements-test.txt
 COPY requirements-dev.txt requirements-dev.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements-dev.txt
diff --git a/requirements-dev.txt b/requirements-dev.txt
index b380ef205..421aa2e79 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,37 +1,5 @@
-# formatting
-yapf==0.32.0
-toml==0.10.2
-tomli==2.0.1
-ruff==0.1.5
-codespell==2.3.0
-isort==5.13.2
-clang-format==18.1.5
+-r requirements-lint.txt
+-r requirements-test.txt
 
-# type checking
-mypy==1.9.0
-types-PyYAML
-types-requests
-types-setuptools
-
-# testing
-pytest
-tensorizer>=2.9.0
-pytest-forked
-pytest-asyncio
-pytest-rerunfailures
-pytest-shard
-
-# testing utils
-awscli
-einops # required for MPT
-httpx
-peft
-requests
-ray
-sentence-transformers # required for embedding
-
-# Benchmarking
-aiohttp
-
-# quantization
-bitsandbytes==0.42.0
+# Avoid adding requirements directly to this file.
+# Instead, modify the two files referenced above.
diff --git a/requirements-lint.txt b/requirements-lint.txt
new file mode 100644
index 000000000..bd34227d3
--- /dev/null
+++ b/requirements-lint.txt
@@ -0,0 +1,14 @@
+# formatting
+yapf==0.32.0
+toml==0.10.2
+tomli==2.0.1
+ruff==0.1.5
+codespell==2.3.0
+isort==5.13.2
+clang-format==18.1.5
+
+# type checking
+mypy==1.9.0
+types-PyYAML
+types-requests
+types-setuptools
diff --git a/requirements-test.txt b/requirements-test.txt
new file mode 100644
index 000000000..8b68e0e93
--- /dev/null
+++ b/requirements-test.txt
@@ -0,0 +1,22 @@
+# testing
+pytest
+tensorizer>=2.9.0
+pytest-forked
+pytest-asyncio
+pytest-rerunfailures
+pytest-shard
+
+# testing utils
+awscli
+einops # required for MPT
+httpx
+peft
+requests
+ray
+sentence-transformers # required for embedding
+
+# Benchmarking
+aiohttp
+
+# quantization
+bitsandbytes==0.42.0
-- 
GitLab


From 6b0511a57bdba85efe2b4d5588dd16280c8fdc78 Mon Sep 17 00:00:00 2001
From: Antoni Baum <antoni.baum@protonmail.com>
Date: Thu, 13 Jun 2024 11:22:50 -0700
Subject: [PATCH 029/376] Revert "[Core] Remove unnecessary copies in flash
 attn backend" (#5478)

---
 vllm/attention/backends/flash_attn.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 8c64c2bfd..300bab728 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -317,7 +317,7 @@ class FlashAttentionImpl(AttentionImpl):
                 # normal attention
                 # When block_tables are not filled, it means q and k are the
                 # prompt, and they have the same length.
-                flash_attn_varlen_func(
+                out = flash_attn_varlen_func(
                     q=query,
                     k=key,
                     v=value,
@@ -329,13 +329,14 @@ class FlashAttentionImpl(AttentionImpl):
                     causal=True,
                     window_size=self.sliding_window,
                     alibi_slopes=self.alibi_slopes,
-                    out=output[:num_prefill_tokens],
                 )
+                assert output[:num_prefill_tokens].shape == out.shape
+                output[:num_prefill_tokens] = out
             else:
                 # prefix-enabled attention
                 assert prefill_meta.seq_lens is not None
                 max_seq_len = max(prefill_meta.seq_lens)
-                flash_attn_varlen_func(
+                output[:num_prefill_tokens] = flash_attn_varlen_func(
                     q=query,
                     k=key_cache,
                     v=value_cache,
@@ -347,12 +348,11 @@ class FlashAttentionImpl(AttentionImpl):
                     causal=True,
                     alibi_slopes=self.alibi_slopes,
                     block_table=prefill_meta.block_tables,
-                    out=output[:num_prefill_tokens],
                 )
 
         if decode_meta := attn_metadata.decode_metadata:
             # Decoding run.
-            flash_attn_with_kvcache(
+            output[num_prefill_tokens:] = flash_attn_with_kvcache(
                 decode_query.unsqueeze(1),
                 key_cache,
                 value_cache,
@@ -361,8 +361,7 @@ class FlashAttentionImpl(AttentionImpl):
                 softmax_scale=self.scale,
                 causal=True,
                 alibi_slopes=self.alibi_slopes,
-                out=output[num_prefill_tokens:].unsqueeze(1),
-            )
+            ).squeeze(1)
 
         # Reshape the output tensor.
         return output.view(num_tokens, hidden_size)
-- 
GitLab


From 1696efe6c91a82e1aca5b49f4bc7899802115981 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 13 Jun 2024 12:09:16 -0700
Subject: [PATCH 030/376] [misc] fix format.sh (#5511)

---
 format.sh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/format.sh b/format.sh
index 6057b69af..2fd6af03b 100755
--- a/format.sh
+++ b/format.sh
@@ -36,12 +36,12 @@ tool_version_check() {
     fi
 }
 
-tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)"
-tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)"
-tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)"
-tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-dev.txt | cut -d'=' -f3)"
-tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-dev.txt | cut -d'=' -f3)"
-tool_version_check "clang-format" "$CLANGFORMAT_VERSION" "$(grep clang-format requirements-dev.txt | cut -d'=' -f3)"
+tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-lint.txt | cut -d'=' -f3)"
+tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-lint.txt | cut -d'=' -f3)"
+tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-lint.txt | cut -d'=' -f3)"
+tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-lint.txt | cut -d'=' -f3)"
+tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-lint.txt | cut -d'=' -f3)"
+tool_version_check "clang-format" "$CLANGFORMAT_VERSION" "$(grep clang-format requirements-lint.txt | cut -d'=' -f3)"
 
 YAPF_FLAGS=(
     '--recursive'
-- 
GitLab


From 33e3b372429232cea44266d866906effaa705a10 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Thu, 13 Jun 2024 16:37:48 -0400
Subject: [PATCH 031/376] [CI/Build] Disable test_fp8.py (#5508)

---
 tests/models/test_fp8.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py
index b24c17cf3..2b5609188 100644
--- a/tests/models/test_fp8.py
+++ b/tests/models/test_fp8.py
@@ -68,6 +68,14 @@ EXPECTED_STRS_MAP = {
 }
 
 
+# This test compares against golden strings for exact match since
+# there is no baseline implementation to compare against
+# and is unstable w.r.t specifics of the fp8 implementation or
+# the hardware being run on.
+# Disabled to prevent it from breaking the build
+@pytest.mark.skip(
+    reason=
+    "Prevent unstable test based on golden strings from breaking the build.")
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
                     reason="fp8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)
-- 
GitLab


From e38042d4af1ddb390c3dd9340250de25bee37c62 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Thu, 13 Jun 2024 16:38:05 -0400
Subject: [PATCH 032/376] [Kernel] Disable CUTLASS kernels for fp8 (#5505)

---
 vllm/model_executor/layers/quantization/fp8.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index e89fd6581..bc08bfcc3 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -257,7 +257,9 @@ class Fp8LinearMethod(LinearMethodBase):
         #   If dynamic, layer.input_scale is None and x_scale computed from x.
         #   If static, layer.input_scale is scalar and x_scale is input_scale.
 
-        if bias is None and self.cutlass_fp8_supported:
+        # Temporarily disable CUTLASS kernels due to an illegal memory access
+        #if  bias is None and self.cutlass_fp8_supported:
+        if False:
             qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale)
 
             # Fused GEMM_DQ
-- 
GitLab


From 50eed24d252965a81ce50b64fd387d60fb1f4f6e Mon Sep 17 00:00:00 2001
From: Antoni Baum <antoni.baum@protonmail.com>
Date: Thu, 13 Jun 2024 16:06:49 -0700
Subject: [PATCH 033/376] Add `cuda_device_count_stateless` (#5473)

---
 .buildkite/test-pipeline.yaml                 |  1 +
 tests/conftest.py                             | 17 ++-------
 tests/distributed/test_utils.py               | 31 ++++++++++++++++
 vllm/config.py                                |  6 ++--
 .../device_communicators/custom_all_reduce.py |  3 +-
 .../custom_all_reduce_utils.py                |  3 +-
 vllm/executor/multiproc_gpu_executor.py       |  6 ++--
 vllm/utils.py                                 | 35 +++++++++++++++++++
 8 files changed, 79 insertions(+), 23 deletions(-)
 create mode 100644 tests/distributed/test_utils.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 6b12d19ba..6a2932db9 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -48,6 +48,7 @@ steps:
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - pytest -v -s spec_decode/e2e/test_integration_dist.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
 
 - label: Distributed Tests (Multiple Groups)
   #mirror_hardwares: [amd]
diff --git a/tests/conftest.py b/tests/conftest.py
index 29a4f126f..18aea3702 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,8 +1,6 @@
 import contextlib
 import gc
 import os
-import subprocess
-import sys
 from typing import Any, Dict, List, Optional, Tuple, TypeVar
 
 import pytest
@@ -22,7 +20,7 @@ from vllm.logger import init_logger
 from vllm.multimodal import MultiModalData
 from vllm.multimodal.image import ImageFeatureData, ImagePixelData
 from vllm.sequence import SampleLogprobs
-from vllm.utils import is_cpu
+from vllm.utils import cuda_device_count_stateless, is_cpu
 
 logger = init_logger(__name__)
 
@@ -539,15 +537,4 @@ def num_gpus_available():
     """Get number of GPUs without initializing the CUDA context
     in current process."""
 
-    try:
-        out = subprocess.run([
-            sys.executable, "-c",
-            "import torch; print(torch.cuda.device_count())"
-        ],
-                             capture_output=True,
-                             check=True,
-                             text=True)
-    except subprocess.CalledProcessError as e:
-        logger.warning("Failed to get number of GPUs.", exc_info=e)
-        return 0
-    return int(out.stdout.strip())
+    return cuda_device_count_stateless()
diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py
new file mode 100644
index 000000000..b7ec59c7a
--- /dev/null
+++ b/tests/distributed/test_utils.py
@@ -0,0 +1,31 @@
+import os
+
+import ray
+
+from vllm.utils import cuda_device_count_stateless
+
+
+@ray.remote
+class _CUDADeviceCountStatelessTestActor():
+
+    def get_count(self):
+        return cuda_device_count_stateless()
+
+    def set_cuda_visible_devices(self, cuda_visible_devices: str):
+        os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
+
+    def get_cuda_visible_devices(self):
+        return os.environ["CUDA_VISIBLE_DEVICES"]
+
+
+def test_cuda_device_count_stateless():
+    """Test that cuda_device_count_stateless changes return value if
+    CUDA_VISIBLE_DEVICES is changed."""
+
+    actor = _CUDADeviceCountStatelessTestActor.options(num_gpus=2).remote()
+    assert ray.get(actor.get_cuda_visible_devices.remote()) == "0,1"
+    assert ray.get(actor.get_count.remote()) == 2
+    ray.get(actor.set_cuda_visible_devices.remote("0"))
+    assert ray.get(actor.get_count.remote()) == 1
+    ray.get(actor.set_cuda_visible_devices.remote(""))
+    assert ray.get(actor.get_count.remote()) == 0
diff --git a/vllm/config.py b/vllm/config.py
index 76c10d464..d9e4a619e 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -11,7 +11,8 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.model_executor.models import ModelRegistry
 from vllm.transformers_utils.config import get_config, get_hf_text_config
-from vllm.utils import get_cpu_memory, is_cpu, is_hip, is_neuron, is_tpu
+from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu,
+                        is_hip, is_neuron, is_tpu)
 
 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
@@ -605,12 +606,11 @@ class ParallelConfig:
         if self.distributed_executor_backend is None and self.world_size > 1:
             # We use multiprocessing by default if world_size fits on the
             # current node and we aren't in a ray placement group.
-            from torch.cuda import device_count
 
             from vllm.executor import ray_utils
             backend = "mp"
             ray_found = ray_utils.ray is not None
-            if device_count() < self.world_size:
+            if cuda_device_count_stateless() < self.world_size:
                 if not ray_found:
                     raise ValueError("Unable to load Ray which is "
                                      "required for multi-node inference")
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index 9a2b47594..b0cb21a02 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -11,6 +11,7 @@ from vllm.distributed.device_communicators.custom_all_reduce_utils import (
     gpu_p2p_access_check)
 from vllm.distributed.parallel_state import is_in_the_same_node
 from vllm.logger import init_logger
+from vllm.utils import cuda_device_count_stateless
 
 try:
     import pynvml
@@ -144,7 +145,7 @@ class CustomAllreduce:
         if cuda_visible_devices:
             device_ids = list(map(int, cuda_visible_devices.split(",")))
         else:
-            device_ids = list(range(torch.cuda.device_count()))
+            device_ids = list(range(cuda_device_count_stateless()))
 
         physical_device_id = device_ids[device.index]
         tensor = torch.tensor([physical_device_id],
diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index 1fd0058f6..c9573edb0 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -12,6 +12,7 @@ import torch.multiprocessing as mp
 
 import vllm.envs as envs
 from vllm.logger import init_logger
+from vllm.utils import cuda_device_count_stateless
 
 logger = init_logger(__name__)
 
@@ -152,7 +153,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
 
     is_distributed = dist.is_initialized()
 
-    num_dev = torch.cuda.device_count()
+    num_dev = cuda_device_count_stateless()
     cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
     if cuda_visible_devices is None:
         cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py
index 99c9e5203..8385e56f8 100644
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/multiproc_gpu_executor.py
@@ -9,7 +9,8 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
                                                   ResultHandler, WorkerMonitor)
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+from vllm.utils import (cuda_device_count_stateless,
+                        get_distributed_init_method, get_ip, get_open_port,
                         get_vllm_instance_id, make_async)
 
 logger = init_logger(__name__)
@@ -33,8 +34,7 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
         # Disable torch async compiling which won't work with daemonic processes
         os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 
-        from torch.cuda import device_count
-        assert world_size <= device_count(), (
+        assert world_size <= cuda_device_count_stateless(), (
             "please set tensor_parallel_size to less than max local gpu count")
 
         distributed_init_method = get_distributed_init_method(
diff --git a/vllm/utils.py b/vllm/utils.py
index af585929d..b5c42605b 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -693,3 +693,38 @@ def deprecate_kwargs(
         return inner  # type: ignore
 
     return wrapper
+
+
+@lru_cache(maxsize=8)
+def _cuda_device_count_stateless(
+        cuda_visible_devices: Optional[str] = None) -> int:
+    # Note: cuda_visible_devices is not used, but we keep it as an argument for
+    # LRU Cache purposes.
+
+    # Code below is based on
+    # https://github.com/pytorch/pytorch/blob/
+    # c1cd946818442aca8c7f812b16d187ce1586c3bc/
+    # torch/cuda/__init__.py#L831C1-L831C17
+    import torch.cuda
+    import torch.version
+
+    if not torch.cuda._is_compiled():
+        return 0
+    # bypass _device_count_nvml() if rocm (not supported)
+    nvml_count = -1 if torch.version.hip else torch.cuda._device_count_nvml()
+    r = torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count
+    return r
+
+
+def cuda_device_count_stateless() -> int:
+    """Get number of CUDA devices, caching based on the value of
+    CUDA_VISIBLE_DEVICES at the time of call.
+    
+    This should be used instead of torch.cuda.device_count()
+    unless CUDA_VISIBLE_DEVICES has already been set to the desired
+    value."""
+
+    # This can be removed and simply replaced with torch.cuda.get_device_count
+    # after https://github.com/pytorch/pytorch/pull/122815 is released.
+
+    return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
-- 
GitLab


From cd9c0d65d98f86fbd2235ee41b80107097a57f77 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= <jiefu@tencent.com>
Date: Fri, 14 Jun 2024 07:22:24 +0800
Subject: [PATCH 034/376] [Hardware][Intel] Support CPU inference with AVX2 ISA
 (#5452)

---
 cmake/cpu_extension.cmake |   6 +-
 csrc/cpu/cpu_types.hpp    | 165 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 169 insertions(+), 2 deletions(-)

diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 61d484383..a644e5b6a 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -33,6 +33,7 @@ function (find_isa CPUINFO TARGET OUT)
     endif()
 endfunction()
 
+find_isa(${CPUINFO} "avx2" AVX2_FOUND)
 find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
 
 if (AVX512_FOUND)
@@ -53,8 +54,11 @@ if (AVX512_FOUND)
     else()
         message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
     endif()
+elseif (AVX2_FOUND)
+    list(APPEND CXX_COMPILE_FLAGS "-mavx2")
+    message(WARNING "vLLM CPU backend using AVX2 ISA")
 else()
-    message(FATAL_ERROR "vLLM CPU backend requires AVX512 ISA support.")
+    message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 ISA support.")
 endif()
 
 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp
index 034c406a5..d7621aaae 100644
--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
@@ -5,6 +5,10 @@
 #include <immintrin.h>
 #include <torch/all.h>
 
+#ifndef __AVX2__
+static_assert(false, "AVX2 must be supported for the current implementation.");
+#endif
+
 namespace vec_op {
 
 // FIXME: FP16 is not fully supported in Torch-CPU
@@ -104,6 +108,7 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
   void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
 };
 
+#ifdef __AVX512F__
 struct BF16Vec32 : public Vec<BF16Vec32> {
   constexpr static int VEC_ELEM_NUM = 32;
 
@@ -123,6 +128,34 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
 
   void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; }
 };
+#else
+struct BF16Vec32 : public Vec<BF16Vec32> {
+  constexpr static int VEC_ELEM_NUM = 32;
+
+  __m256i reg_low;
+  __m256i reg_high;
+
+  explicit BF16Vec32(const void *ptr)
+      : reg_low(_mm256_loadu_si256((__m256i const *)ptr)),
+        reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {}
+
+  explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low),
+                                                  reg_high(high) {}
+
+  explicit BF16Vec32(BF16Vec8 &vec8_data)
+      : reg_low((__m256i)_mm256_inserti32x4(
+                _mm256_castsi128_si256((__m128i)vec8_data.reg),
+                                       (__m128i)vec8_data.reg, 1)),
+        reg_high((__m256i)_mm256_inserti32x4(
+                _mm256_castsi128_si256((__m128i)vec8_data.reg),
+                                       (__m128i)vec8_data.reg, 1)) {}
+
+  void save(void *ptr) const {
+    *reinterpret_cast<__m256i *>(ptr) = reg_low;
+    *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high;
+  }
+};
+#endif
 
 struct FP32Vec4 : public Vec<FP32Vec4> {
   constexpr static int VEC_ELEM_NUM = 4;
@@ -226,6 +259,7 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
   void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
 };
 
+#ifdef __AVX512F__
 struct FP32Vec16 : public Vec<FP32Vec16> {
   constexpr static int VEC_ELEM_NUM = 16;
   union AliasReg {
@@ -290,6 +324,114 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 
   void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
 };
+#else
+struct FP32Vec16 : public Vec<FP32Vec16> {
+  constexpr static int VEC_ELEM_NUM = 16;
+
+  union AliasReg {
+    __m256 reg;
+    float values[8];
+  };
+
+  __m256 reg_low;
+  __m256 reg_high;
+
+  explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)),
+                                reg_high(_mm256_set1_ps(v)) {}
+
+  explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)),
+                         reg_high(_mm256_set1_ps(0.0)) {}
+
+  explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)),
+                                         reg_high(_mm256_loadu_ps(ptr + 8)) {}
+
+  explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {}
+
+  explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low),
+                                              reg_high(data.reg_high) {}
+
+  explicit FP32Vec16(const FP32Vec4 &data)
+      : reg_low((__m256)_mm256_inserti128_si256(
+                _mm256_castsi128_si256((__m128i)data.reg),
+                                       (__m128i)data.reg, 1)),
+        reg_high((__m256)_mm256_inserti128_si256(
+                 _mm256_castsi128_si256((__m128i)data.reg),
+                                       (__m128i)data.reg, 1)) {}
+
+  explicit FP32Vec16(const FP32Vec8 &data)
+      : reg_low(data.reg), reg_high(data.reg) {}
+
+  explicit FP32Vec16(const BF16Vec16 &v) {
+    __m128i low = _mm256_extractf128_si256(v.reg, 0);
+    __m128i high = _mm256_extractf128_si256(v.reg, 1);
+
+    __m256i v_low_epi32 = _mm256_cvtepu16_epi32(low);
+    __m256i v_high_epi32 = _mm256_cvtepu16_epi32(high);
+
+    __m256i v_low_shifted = _mm256_bslli_epi128(v_low_epi32, 2);
+    __m256i v_high_shifted = _mm256_bslli_epi128(v_high_epi32, 2);
+
+    reg_low = _mm256_castsi256_ps(v_low_shifted);
+    reg_high = _mm256_castsi256_ps(v_high_shifted);
+  }
+
+  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
+
+  FP32Vec16 operator*(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low),
+                     _mm256_mul_ps(reg_high, b.reg_high));
+  }
+
+  FP32Vec16 operator+(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low),
+                     _mm256_add_ps(reg_high, b.reg_high));
+  }
+
+  FP32Vec16 operator-(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low),
+                     _mm256_sub_ps(reg_high, b.reg_high));
+  }
+
+  FP32Vec16 operator/(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low),
+                     _mm256_div_ps(reg_high, b.reg_high));
+  }
+
+  float reduce_sum() const {
+    FP32Vec8 low = FP32Vec8(reg_low);
+    FP32Vec8 high = FP32Vec8(reg_high);
+    return low.reduce_sum() + high.reduce_sum();
+  }
+
+  template <int group_size> float reduce_sub_sum(int idx) {
+    float sum = 0.0;
+    static_assert(VEC_ELEM_NUM % group_size == 0);
+    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
+    uint32_t mask = base_mask << (idx * group_size);
+
+    AliasReg ar;
+
+    auto func = [&sum, &mask, &ar](int i) {
+      int flag = mask & 0x1;
+      mask = mask >> 1;
+      if (flag != 0) sum += ar.values[i];
+    };
+
+    ar.reg = reg_low;
+    unroll_loop<int, 8>(func);
+
+    ar.reg = reg_high;
+    unroll_loop<int, 8>(func);
+
+    return sum;
+  }
+
+  void save(float *ptr) const {
+    _mm256_storeu_ps(ptr, reg_low);
+    _mm256_storeu_ps(ptr + 8, reg_high);
+  }
+};
+#endif
 
 template <typename T> struct VecType { using vec_type = void; };
 
@@ -336,6 +478,7 @@ template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
   *ptr = *(v_ptr + 1);
 }
 
+#ifdef __AVX512F__
 inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
     : reg(_mm256_cvtepi32_epi16(
           _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {}
@@ -343,7 +486,27 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
 inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
     : reg(_mm512_cvtepi32_epi16(
           _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {}
-#endif
+#else
+namespace{
+__m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) {
+  __m256i ai = _mm256_castps_si256(a);
+  ai = _mm256_srli_epi32(ai, 16);
+  ai = _mm256_packus_epi32(ai, ai);
+  ai = _mm256_permute4x64_epi64(ai, 0b00111001);
+  return _mm256_extracti128_si256(ai, 0);
+}
+}
+
+inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
+    : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {}
+
+inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) {
+  BF16Vec8 low = BF16Vec8(FP32Vec8(v.reg_low));
+  BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high));
+  reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1);
+}
+#endif // __AVX512F__
+#endif // __AVX512BF16__
 
 inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); }
 
-- 
GitLab


From 55d6361b13ae6328de809f57a69b719c1600040a Mon Sep 17 00:00:00 2001
From: "Allen.Dou" <allen.dou@hotmail.com>
Date: Fri, 14 Jun 2024 10:02:53 +0800
Subject: [PATCH 035/376] [Misc] Fix arg names in quantizer script (#5507)

---
 examples/fp8/quantizer/quantize.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/fp8/quantizer/quantize.py b/examples/fp8/quantizer/quantize.py
index cee13b4c9..15f1a06b1 100644
--- a/examples/fp8/quantizer/quantize.py
+++ b/examples/fp8/quantizer/quantize.py
@@ -332,7 +332,7 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("--model_dir",
+    parser.add_argument("--model-dir",
                         help="Specify where the HuggingFace model is",
                         required=True)
     parser.add_argument("--device", default="cuda")
@@ -346,19 +346,19 @@ if __name__ == "__main__":
             "full_prec"
         ],
     )
-    parser.add_argument("--batch_size",
+    parser.add_argument("--batch-size",
                         help="Batch size for calibration.",
                         type=int,
                         default=1)
-    parser.add_argument("--calib_size",
+    parser.add_argument("--calib-size",
                         help="Number of samples for calibration.",
                         type=int,
                         default=512)
-    parser.add_argument("--output_dir", default="exported_model")
-    parser.add_argument("--tp_size", type=int, default=1)
-    parser.add_argument("--pp_size", type=int, default=1)
-    parser.add_argument("--awq_block_size", type=int, default=128)
-    parser.add_argument("--kv_cache_dtype",
+    parser.add_argument("--output-dir", default="exported_model")
+    parser.add_argument("--tp-size", type=int, default=1)
+    parser.add_argument("--pp-size", type=int, default=1)
+    parser.add_argument("--awq-block-size", type=int, default=128)
+    parser.add_argument("--kv-cache-dtype",
                         help="KV Cache dtype.",
                         default=None,
                         choices=["int8", "fp8", None])
-- 
GitLab


From 0f0d8bc065f3608e7657a9696f5d2d7c0d6722d1 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Thu, 13 Jun 2024 21:42:06 -0500
Subject: [PATCH 036/376] bump version to v0.5.0.post1 (#5522)

---
 vllm/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/version.py b/vllm/version.py
index 3d187266f..2b33ffcf5 100644
--- a/vllm/version.py
+++ b/vllm/version.py
@@ -1 +1 @@
-__version__ = "0.5.0"
+__version__ = "0.5.0.post1"
-- 
GitLab


From 319ad7f1d386699e94f629341c9988a926821f24 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 13 Jun 2024 22:36:20 -0700
Subject: [PATCH 037/376] [CI/Build][Misc] Add CI that benchmarks vllm
 performance on those PRs with `perf-benchmarks` label (#5073)

Co-authored-by: simon-mo <simon.mo@hey.com>
---
 .buildkite/nightly-benchmarks/README.md       |  98 +++++
 .../benchmark-pipeline.yaml                   |  61 +++
 .../nightly-benchmarks/kickoff-pipeline.sh    |   3 +-
 .../nightly-benchmarks/latency-tests.json     |  32 ++
 .../run-benchmarks-suite.sh                   | 358 ++++++++++++++++++
 .buildkite/nightly-benchmarks/sample.yaml     |  39 --
 .../convert-results-json-to-markdown.py       | 155 ++++++++
 .../scripts/wait-for-image.sh                 |  17 +
 .../nightly-benchmarks/serving-tests.json     |  59 +++
 .../nightly-benchmarks/throughput-tests.json  |  35 ++
 benchmarks/benchmark_latency.py               |  25 ++
 benchmarks/benchmark_serving.py               |  11 +
 benchmarks/benchmark_throughput.py            |  28 +-
 13 files changed, 880 insertions(+), 41 deletions(-)
 create mode 100644 .buildkite/nightly-benchmarks/README.md
 create mode 100644 .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
 create mode 100644 .buildkite/nightly-benchmarks/latency-tests.json
 create mode 100644 .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
 delete mode 100644 .buildkite/nightly-benchmarks/sample.yaml
 create mode 100644 .buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
 create mode 100644 .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
 create mode 100644 .buildkite/nightly-benchmarks/serving-tests.json
 create mode 100644 .buildkite/nightly-benchmarks/throughput-tests.json

diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
new file mode 100644
index 000000000..6a18be947
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -0,0 +1,98 @@
+# vLLM benchmark suite
+
+## Introduction
+
+This directory contains the performance benchmarking CI for vllm.
+The goal is to help developers know the impact of their PRs on the performance of vllm.
+
+This benchmark will be *triggered* upon:
+- A PR being merged into vllm.
+- Every commit for those PRs with `perf-benchmarks` label.
+
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for more GPUs is comming later), with different models.
+
+**Benchmarking Duration**: about 1hr.
+
+## Configuring the workload for the quick benchmark
+
+The workload of the quick benchmark contains two parts: latency tests in `latency-tests.json`, throughput tests in `throughput-tests.json` and serving tests in `serving-tests.json`.
+
+### Latency test
+
+Here is an example of one test inside `latency-tests.json`:
+
+```json
+[
+    ...
+    {
+        "test_name": "latency_llama8B_tp1",
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    },
+    ...
+]
+```
+
+In this example:
+-  The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
+-  The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-benchmarks-suite.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+
+Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
+
+WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
+
+
+### Throughput test
+The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
+
+The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
+
+### Serving test
+We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
+
+```
+[
+    ...
+    {
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    ...
+]
+```
+
+Inside this example:
+- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
+- The `server-parameters` includes the command line arguments for vLLM server.
+- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
+- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
+
+The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
+
+WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
+
+## Visualizing the results
+The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table.
+You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
+If you do not see the table, please wait till the benchmark finish running.
+The JSON file is also attached within each buildkite job for further analysis.
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
new file mode 100644
index 000000000..8f12748b6
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -0,0 +1,61 @@
+steps:
+  - label: "Wait for container to be ready"
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          containers:
+          - image: badouralix/curl-jq
+            command:
+            - sh
+            - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+  - wait
+  - label: "A100 Benchmark"
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          containers:
+          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+            command:
+            - bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+  # - label: "H100: NVIDIA SMI"
+  #   agents:
+  #     queue: H100
+  #   plugins:
+  #   - docker#v5.11.0:
+  #       image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+  #       command:
+  #       - bash
+  #       - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+  #       mount-buildkite-agent: true
+  #       propagate-environment: true
+  #       propagate-uid-gid: false
+  #       ipc: host
+  #       gpus: all
+  #       environment:
+  #       - VLLM_USAGE_SOURCE
+  #       - HF_TOKEN
+
diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
index d3bf3b729..15d411feb 100755
--- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
+++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
@@ -1,5 +1,6 @@
 #!/usr/bin/env bash
 
+# NOTE(simon): this script runs inside a buildkite agent with CPU only access.
 set -euo pipefail
 
 # Install system packages
@@ -23,4 +24,4 @@ if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
 fi
 
 # Upload sample.yaml
-buildkite-agent pipeline upload .buildkite/nightly-benchmarks/sample.yaml
+buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
diff --git a/.buildkite/nightly-benchmarks/latency-tests.json b/.buildkite/nightly-benchmarks/latency-tests.json
new file mode 100644
index 000000000..294a8c439
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/latency-tests.json
@@ -0,0 +1,32 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    },
+    {
+        "test_name": "latency_llama70B_tp4",
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15
+        }
+    },
+    {
+        "test_name": "latency_mixtral8x7B_tp2",
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15
+        }
+    }
+]
diff --git a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
new file mode 100644
index 000000000..6cff6917f
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
@@ -0,0 +1,358 @@
+#!/bin/bash
+
+# This script should be run inside the CI process
+# This script assumes that we are already inside the vllm/ directory
+# Benchmarking results will be available inside vllm/benchmarks/results/
+
+# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
+# and we still want to see other benchmarking results even when mixtral crashes.
+set -o pipefail
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+  echo "GPU type is $gpu_type"
+}
+
+check_hf_token() {
+  # check if HF_TOKEN is available and valid
+  if [[ -z "$HF_TOKEN" ]]; then
+    echo "Error: HF_TOKEN is not set."
+    exit 1
+  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
+    echo "Error: HF_TOKEN does not start with 'hf_'."
+    exit 1
+  else
+    echo "HF_TOKEN is set and valid."
+  fi
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  # wait for vllm server to start
+  # return 1 if vllm server crashes
+  timeout 1200 bash -c '
+    until curl localhost:8000/v1/completions; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+kill_gpu_processes() {
+  # kill all processes on GPU.
+  pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
+  if [ -z "$pids" ]; then
+      echo "No GPU processes found."
+  else
+      for pid in $pids; do
+          kill -9 "$pid"
+          echo "Killed process with PID: $pid"
+      done
+
+      echo "All GPU processes have been killed."
+  fi
+
+  # waiting for GPU processes to be fully killed
+  sleep 10
+
+  # remove vllm config file
+  rm -rf ~/.config/vllm
+
+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+run_latency_tests() {
+  # run latency tests using `benchmark_latency.py`
+  # $1: a json file specifying latency test cases
+
+  local latency_test_file
+  latency_test_file=$1
+
+  # Iterate over latency tests
+  jq -c '.[]' "$latency_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^latency_ ]]; then
+      echo "In latency-test.json, test_name must start with \"latency_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get arguments
+    latency_params=$(echo "$params" | jq -r '.parameters')
+    latency_args=$(json2args "$latency_params")
+
+    # check if there is enough GPU to run the test
+    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
+      continue
+    fi
+
+    latency_command="python3 benchmark_latency.py \
+      --output-json $RESULTS_FOLDER/${test_name}.json \
+      $latency_args"
+
+    echo "Running test case $test_name"
+    echo "Latency command: $latency_command"
+
+    # recoding benchmarking command ang GPU command
+    jq_output=$(jq -n \
+      --arg latency "$latency_command" \
+      --arg gpu "$gpu_type" \
+      '{
+        latency_command: $latency,
+        gpu_type: $gpu
+      }')
+    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
+
+    # run the benchmark
+    eval "$latency_command"
+
+    kill_gpu_processes
+
+  done
+}
+
+
+run_throughput_tests() {
+  # run throughput tests using `benchmark_throughput.py`
+  # $1: a json file specifying throughput test cases
+
+  local throughput_test_file
+  throughput_test_file=$1
+
+  # Iterate over throughput tests
+  jq -c '.[]' "$throughput_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^throughput_ ]]; then
+      echo "In throughput-test.json, test_name must start with \"throughput_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get arguments
+    throughput_params=$(echo "$params" | jq -r '.parameters')
+    throughput_args=$(json2args "$throughput_params")
+
+    # check if there is enough GPU to run the test
+    tp=$(echo $throughput_params | jq -r '.tensor_parallel_size')
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
+      continue
+    fi
+
+    throughput_command="python3 benchmark_throughput.py \
+      --output-json $RESULTS_FOLDER/${test_name}.json \
+      $throughput_args"
+
+    echo "Running test case $test_name"
+    echo "Throughput command: $throughput_command"
+    # recoding benchmarking command ang GPU command
+    jq_output=$(jq -n \
+      --arg command "$throughput_command" \
+      --arg gpu "$gpu_type" \
+      '{
+        throughput_command: $command,
+        gpu_type: $gpu
+      }')
+    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
+
+    # run the benchmark
+    eval "$throughput_command"
+
+    kill_gpu_processes
+
+  done
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^serving_ ]]; then
+      echo "In serving-test.json, test_name must start with \"serving_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.server_parameters')
+    client_params=$(echo "$params" | jq -r '.client_parameters')
+    server_args=$(json2args "$server_params")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
+      continue
+    fi
+
+    # check if server model and client model is aligned
+    server_model=$(echo "$server_params" | jq -r '.model')
+    client_model=$(echo "$client_params" | jq -r '.model')
+    if [[ $server_model != "$client_model" ]]; then
+      echo "Server model and client model must be the same. Skip testcase $testname."
+      continue
+    fi
+
+    server_command="python3 \
+      -m vllm.entrypoints.openai.api_server \
+      $server_args"
+
+    # run the server
+    echo "Running test case $test_name"
+    echo "Server command: $server_command"
+    eval "$server_command" &
+
+    # wait until the server is alive
+    wait_for_server
+    if [ $? -eq 0 ]; then
+      echo ""
+      echo "vllm server is up and running."
+    else
+      echo ""
+      echo "vllm failed to start within the timeout period."
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 benchmark_serving.py \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu
+        }')
+      echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up
+    kill_gpu_processes
+  done
+}
+
+main() {
+  check_gpus
+  check_hf_token
+
+  # dependencies
+  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+  (which jq) || (apt-get update && apt-get -y install jq)
+
+  # get the current IP address, required by benchmark_serving.py
+  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+  # turn of the reporting of the status of each request, to clean up the terminal output
+  export VLLM_LOG_LEVEL="WARNING"
+
+  # prepare for benchmarking
+  cd benchmarks || exit 1
+  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+  # benchmarking
+  run_serving_tests $QUICK_BENCHMARK_ROOT/serving-tests.json
+  run_latency_tests $QUICK_BENCHMARK_ROOT/latency-tests.json
+  run_throughput_tests $QUICK_BENCHMARK_ROOT/throughput-tests.json
+
+
+  # postprocess benchmarking results
+  pip install tabulate pandas
+  python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
+
+  upload_to_buildkite
+}
+
+main "$@"
diff --git a/.buildkite/nightly-benchmarks/sample.yaml b/.buildkite/nightly-benchmarks/sample.yaml
deleted file mode 100644
index 50e6e8207..000000000
--- a/.buildkite/nightly-benchmarks/sample.yaml
+++ /dev/null
@@ -1,39 +0,0 @@
-steps:
-  # NOTE(simon): You can create separate blocks for different jobs
-  - label: "A100: NVIDIA SMI"
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          containers:
-          # - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT
-          # TODO(simon): check latest main branch or use the PR image.
-          - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
-            command:
-            - bash -c 'nvidia-smi && nvidia-smi topo -m && pwd && ls'
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-  # TODO(simon): bring H100 online
-  # - label: "H100: NVIDIA SMI"
-  #   agents:
-  #     queue: H100
-  #   plugins:
-  #   - docker#v5.11.0:
-  #       image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
-  #       command:
-  #       - bash -c 'nvidia-smi && nvidia-smi topo -m'
-  #       propagate-environment: true
-  #       ipc: host
-  #       gpus: all
-
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
new file mode 100644
index 000000000..75cff8434
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -0,0 +1,155 @@
+import json
+from pathlib import Path
+
+import pandas as pd
+from tabulate import tabulate
+
+results_folder = Path("results/")
+
+# latency results and the keys that will be printed into markdown
+latency_results = []
+latency_column_mapping = {
+    "test_name": "Test name",
+    "gpu_type": "GPU",
+    "avg_latency": "Average latency (s)",
+    "P10": "P10 (s)",
+    "P25": "P25 (s)",
+    "P50": "P50 (s)",
+    "P75": "P75 (s)",
+    "P90": "P90 (s)",
+}
+
+# thoughput tests and the keys that will be printed into markdown
+throughput_results = []
+throughput_results_column_mapping = {
+    "test_name": "Test name",
+    "gpu_type": "GPU",
+    "num_requests": "# of req.",
+    "total_num_tokens": "Total # of tokens",
+    "elapsed_time": "Elapsed time (s)",
+    "requests_per_second": "Tput (req/s)",
+    "tokens_per_second": "Tput (tok/s)",
+}
+
+# serving results and the keys that will be printed into markdown
+serving_results = []
+serving_column_mapping = {
+    "test_name": "Test name",
+    "gpu_type": "GPU",
+    "completed": "# of req.",
+    "request_throughput": "Tput (req/s)",
+    "input_throughput": "Input Tput (tok/s)",
+    "output_throughput": "Output Tput (tok/s)",
+    "mean_ttft_ms": "Mean TTFT (ms)",
+    # do not say TTFT again to avoid the table getting too wide
+    "median_ttft_ms": "Median",
+    "p99_ttft_ms": "P99",
+    "mean_tpot_ms": "Mean TPOT (ms)",
+    "median_tpot_ms": "Median",
+    "p99_tpot_ms": "P99",
+    "mean_itl_ms": "Mean ITL (ms)",
+    "median_itl_ms": "Median",
+    "p99_itl_ms": "P99",
+}
+
+for test_file in results_folder.glob("*.json"):
+
+    with open(test_file, "r") as f:
+        raw_result = json.loads(f.read())
+
+    if "serving" in str(test_file):
+        # this result is generated via `benchmark_serving.py`
+
+        # attach the benchmarking command to raw_result
+        with open(test_file.with_suffix(".commands"), "r") as f:
+            command = json.loads(f.read())
+        raw_result.update(command)
+
+        # update the test name of this result
+        raw_result.update({"test_name": test_file.stem})
+
+        # add the result to raw_result
+        serving_results.append(raw_result)
+        continue
+
+    elif "latency" in f.name:
+        # this result is generated via `benchmark_latency.py`
+
+        # attach the benchmarking command to raw_result
+        with open(test_file.with_suffix(".commands"), "r") as f:
+            command = json.loads(f.read())
+        raw_result.update(command)
+
+        # update the test name of this result
+        raw_result.update({"test_name": test_file.stem})
+
+        # get different percentiles
+        for perc in [10, 25, 50, 75, 90]:
+            raw_result.update(
+                {f"P{perc}": raw_result["percentiles"][str(perc)]})
+
+        # add the result to raw_result
+        latency_results.append(raw_result)
+        continue
+
+    elif "throughput" in f.name:
+        # this result is generated via `benchmark_throughput.py`
+
+        # attach the benchmarking command to raw_result
+        with open(test_file.with_suffix(".commands"), "r") as f:
+            command = json.loads(f.read())
+        raw_result.update(command)
+
+        # update the test name of this result
+        raw_result.update({"test_name": test_file.stem})
+
+        # add the result to raw_result
+        throughput_results.append(raw_result)
+        continue
+
+    print(f"Skipping {test_file}")
+
+latency_results = pd.DataFrame.from_dict(latency_results)
+serving_results = pd.DataFrame.from_dict(serving_results)
+throughput_results = pd.DataFrame.from_dict(throughput_results)
+
+# remapping the key, for visualization purpose
+if not latency_results.empty:
+    latency_results = latency_results[list(
+        latency_column_mapping.keys())].rename(columns=latency_column_mapping)
+if not serving_results.empty:
+    serving_results = serving_results[list(
+        serving_column_mapping.keys())].rename(columns=serving_column_mapping)
+if not throughput_results.empty:
+    throughput_results = throughput_results[list(
+        throughput_results_column_mapping.keys())].rename(
+            columns=throughput_results_column_mapping)
+
+# get markdown tables
+latency_md_table = tabulate(latency_results,
+                            headers='keys',
+                            tablefmt='pipe',
+                            showindex=False)
+serving_md_table = tabulate(serving_results,
+                            headers='keys',
+                            tablefmt='pipe',
+                            showindex=False)
+throughput_md_table = tabulate(throughput_results,
+                               headers='keys',
+                               tablefmt='pipe',
+                               showindex=False)
+
+# document the result
+with open(results_folder / "benchmark_results.md", "w") as f:
+    if not latency_results.empty:
+        f.write("## Latency tests\n")
+        f.write(latency_md_table)
+        f.write("\n")
+    if not throughput_results.empty:
+        f.write("## Throughput tests\n")
+        f.write(throughput_md_table)
+        f.write("\n")
+    if not serving_results.empty:
+        f.write("## Serving tests\n")
+        f.write(serving_md_table)
+        f.write("\n")
diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
new file mode 100644
index 000000000..c785e6a0d
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
+URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
+
+retries=0
+while [ $retries -lt 1000 ]; do
+    if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
+        exit 0
+    fi
+
+    echo "Waiting for image to be available..."
+
+    retries=$((retries + 1))
+    sleep 5
+done
+
+exit 1
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/serving-tests.json b/.buildkite/nightly-benchmarks/serving-tests.json
new file mode 100644
index 000000000..bb6746612
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/serving-tests.json
@@ -0,0 +1,59 @@
+[
+    {
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama70B_tp4_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_mixtral8x7B_tp2_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    }
+]
diff --git a/.buildkite/nightly-benchmarks/throughput-tests.json b/.buildkite/nightly-benchmarks/throughput-tests.json
new file mode 100644
index 000000000..db4f908d7
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/throughput-tests.json
@@ -0,0 +1,35 @@
+[
+    {
+        "test_name": "throughput_llama8B_tp1",
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    },
+    {
+        "test_name": "throughput_llama70B_tp4",
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    },
+    {
+        "test_name": "throughput_mixtral8x7B_tp2",
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    }
+]
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 17edb7515..9937f8333 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -10,6 +10,7 @@ import torch
 from tqdm import tqdm
 
 from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptStrictInputs
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
@@ -37,6 +38,7 @@ def main(args: argparse.Namespace):
               download_dir=args.download_dir,
               block_size=args.block_size,
               gpu_memory_utilization=args.gpu_memory_utilization,
+              load_format=args.load_format,
               distributed_executor_backend=args.distributed_executor_backend)
 
     sampling_params = SamplingParams(
@@ -222,6 +224,29 @@ if __name__ == '__main__':
                         help='the fraction of GPU memory to be used for '
                         'the model executor, which can range from 0 to 1.'
                         'If unspecified, will use the default value of 0.9.')
+    parser.add_argument(
+        '--load-format',
+        type=str,
+        default=EngineArgs.load_format,
+        choices=[
+            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
+            'bitsandbytes'
+        ],
+        help='The format of the model weights to load.\n\n'
+        '* "auto" will try to load the weights in the safetensors format '
+        'and fall back to the pytorch bin format if safetensors format '
+        'is not available.\n'
+        '* "pt" will load the weights in the pytorch bin format.\n'
+        '* "safetensors" will load the weights in the safetensors format.\n'
+        '* "npcache" will load the weights in pytorch format and store '
+        'a numpy cache to speed up the loading.\n'
+        '* "dummy" will initialize the weights with random values, '
+        'which is mainly for profiling.\n'
+        '* "tensorizer" will load the weights using tensorizer from '
+        'CoreWeave. See the Tensorize vLLM Model script in the Examples'
+        'section for more information.\n'
+        '* "bitsandbytes" will load the weights using bitsandbytes '
+        'quantization.\n')
     parser.add_argument(
         '--distributed-executor-backend',
         choices=['ray', 'mp'],
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 4112a3272..df32b366c 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -499,6 +499,8 @@ def main(args: argparse.Namespace):
         # Save to file
         base_model_id = model_id.split("/")[-1]
         file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"  #noqa
+        if args.result_filename:
+            file_name = args.result_filename
         if args.result_dir:
             file_name = os.path.join(args.result_dir, file_name)
         with open(file_name, "w") as outfile:
@@ -639,6 +641,15 @@ if __name__ == "__main__":
         help="Specify directory to save benchmark json results."
         "If not specified, results are saved in the current directory.",
     )
+    parser.add_argument(
+        "--result-filename",
+        type=str,
+        default=None,
+        help="Specify the filename to save benchmark json results."
+        "If not specified, results will be saved in "
+        "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
+        " format.",
+    )
 
     args = parser.parse_args()
     main(args)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 07b2f8541..463d9973d 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -10,6 +10,7 @@ from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizerBase)
 
+from vllm.engine.arg_utils import EngineArgs
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
 
@@ -81,6 +82,7 @@ def run_vllm(
     distributed_executor_backend: Optional[str],
     gpu_memory_utilization: float = 0.9,
     download_dir: Optional[str] = None,
+    load_format: str = EngineArgs.load_format,
 ) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(
@@ -102,6 +104,7 @@ def run_vllm(
         enable_chunked_prefill=enable_chunked_prefill,
         max_num_batched_tokens=max_num_batched_tokens,
         distributed_executor_backend=distributed_executor_backend,
+        load_format=load_format,
     )
 
     # Add the requests to the engine.
@@ -228,7 +231,7 @@ def main(args: argparse.Namespace):
             args.quantization_param_path, args.device,
             args.enable_prefix_caching, args.enable_chunked_prefill,
             args.max_num_batched_tokens, args.distributed_executor_backend,
-            args.gpu_memory_utilization, args.download_dir)
+            args.gpu_memory_utilization, args.download_dir, args.load_format)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -377,6 +380,29 @@ if __name__ == "__main__":
         help='Backend to use for distributed serving. When more than 1 GPU '
         'is used, will be automatically set to "ray" if installed '
         'or "mp" (multiprocessing) otherwise.')
+    parser.add_argument(
+        '--load-format',
+        type=str,
+        default=EngineArgs.load_format,
+        choices=[
+            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
+            'bitsandbytes'
+        ],
+        help='The format of the model weights to load.\n\n'
+        '* "auto" will try to load the weights in the safetensors format '
+        'and fall back to the pytorch bin format if safetensors format '
+        'is not available.\n'
+        '* "pt" will load the weights in the pytorch bin format.\n'
+        '* "safetensors" will load the weights in the safetensors format.\n'
+        '* "npcache" will load the weights in pytorch format and store '
+        'a numpy cache to speed up the loading.\n'
+        '* "dummy" will initialize the weights with random values, '
+        'which is mainly for profiling.\n'
+        '* "tensorizer" will load the weights using tensorizer from '
+        'CoreWeave. See the Tensorize vLLM Model script in the Examples'
+        'section for more information.\n'
+        '* "bitsandbytes" will load the weights using bitsandbytes '
+        'quantization.\n')
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model
-- 
GitLab


From d47af2bc0208d50ed36ae877876c1d2eafdc933a Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 15 Jun 2024 00:27:30 +0800
Subject: [PATCH 038/376] [CI/Build] Disable LLaVA-NeXT CPU test (#5529)

---
 .buildkite/run-cpu-test.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 6a86bc0eb..5f9ca5d75 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -19,6 +19,5 @@ docker exec cpu-test bash -c "python3 examples/offline_inference.py"
 # Run basic model test
 docker exec cpu-test bash -c "cd tests;
   pip install pytest Pillow protobuf
-  bash ../.buildkite/download-images.sh
   cd ../
-  pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
+  pytest -v -s tests/models -m \"not llava\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
-- 
GitLab


From 703475f6c2771600acc27eba76f6a750f54aae50 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Fri, 14 Jun 2024 12:30:15 -0400
Subject: [PATCH 039/376] [Kernel] Fix CUTLASS 3.x custom broadcast load
 epilogue (#5516)

---
 .../quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp | 2 +-
 vllm/model_executor/layers/quantization/fp8.py                | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp
index 8f38bbf50..877a9f5b9 100644
--- a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp
+++ b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp
@@ -153,7 +153,7 @@ struct Sm90RowOrScalarBroadcast {
 
     CUTLASS_DEVICE void
     begin(uint64_t* full_mbarrier_ptr, int load_iteration, bool issue_tma_load) {
-      if (params.ptr_row == nullptr) {
+      if (!params.row_broadcast) {
         return;
       }
 
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index bc08bfcc3..e89fd6581 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -257,9 +257,7 @@ class Fp8LinearMethod(LinearMethodBase):
         #   If dynamic, layer.input_scale is None and x_scale computed from x.
         #   If static, layer.input_scale is scalar and x_scale is input_scale.
 
-        # Temporarily disable CUTLASS kernels due to an illegal memory access
-        #if  bias is None and self.cutlass_fp8_supported:
-        if False:
+        if bias is None and self.cutlass_fp8_supported:
             qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale)
 
             # Fused GEMM_DQ
-- 
GitLab


From d74674bbd978fad7f27a252650249bc2550f3e92 Mon Sep 17 00:00:00 2001
From: "Allen.Dou" <allen.dou@hotmail.com>
Date: Sat, 15 Jun 2024 00:47:44 +0800
Subject: [PATCH 040/376] [Misc] Fix arg names (#5524)

---
 benchmarks/kernels/benchmark_paged_attention.py | 2 +-
 examples/aqlm_example.py                        | 2 +-
 examples/fp8/extract_scales.py                  | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index e6f4e9e6b..a5355f4c1 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -165,7 +165,7 @@ if __name__ == '__main__':
                         choices=["v1", "v2"],
                         default="v2")
     parser.add_argument("--batch-size", type=int, default=8)
-    parser.add_argument("--seq_len", type=int, default=4096)
+    parser.add_argument("--seq-len", type=int, default=4096)
     parser.add_argument("--num-query-heads", type=int, default=64)
     parser.add_argument("--num-kv-heads", type=int, default=8)
     parser.add_argument("--head-size",
diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py
index e7c17fa03..3a63003ab 100644
--- a/examples/aqlm_example.py
+++ b/examples/aqlm_example.py
@@ -17,7 +17,7 @@ def main():
                         type=int,
                         default=0,
                         help='known good models by index, [0-4]')
-    parser.add_argument('--tensor_parallel_size',
+    parser.add_argument('--tensor-parallel-size',
                         '-t',
                         type=int,
                         default=1,
diff --git a/examples/fp8/extract_scales.py b/examples/fp8/extract_scales.py
index 1eb961a5a..e007a3bc0 100644
--- a/examples/fp8/extract_scales.py
+++ b/examples/fp8/extract_scales.py
@@ -327,7 +327,7 @@ if __name__ == "__main__":
         "--quantization-param-path <filename>). This is only used "
         "if the KV cache dtype is FP8 and on ROCm (AMD GPU).")
     parser.add_argument(
-        "--quantized_model",
+        "--quantized-model",
         help="Specify the directory containing a single quantized HF model. "
         "It is expected that the quantization format is FP8_E4M3, for use "
         "on ROCm (AMD GPU).",
@@ -339,18 +339,18 @@ if __name__ == "__main__":
         choices=["auto", "safetensors", "npz", "pt"],
         default="auto")
     parser.add_argument(
-        "--output_dir",
+        "--output-dir",
         help="Optionally specify the output directory. By default the "
         "KV cache scaling factors will be saved in the model directory, "
         "however you can override this behavior here.",
         default=None)
     parser.add_argument(
-        "--output_name",
+        "--output-name",
         help="Optionally specify the output filename.",
         # TODO: Change this once additional scaling factors are enabled
         default="kv_cache_scales.json")
     parser.add_argument(
-        "--tp_size",
+        "--tp-size",
         help="Optionally specify the tensor-parallel (TP) size that the "
         "quantized model should correspond to. If specified, during KV "
         "cache scaling factor extraction the observed TP size will be "
-- 
GitLab


From 15985680e2278610e873cc07ec72fa514ace72e9 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Fri, 14 Jun 2024 13:01:46 -0400
Subject: [PATCH 041/376] [ Misc ] Rs/compressed tensors cleanup (#5432)

Co-authored-by: mgoin <michael@neuralmagic.com>
Co-authored-by: Dipika Sikka <dipikasikka1@gmail.com>
---
 .../compressed_tensors/compressed_tensors.py  |  2 +-
 .../schemes/compressed_tensors_w4a16.py       | 21 +++++++++++--------
 .../compressed_tensors_w8a8_dynamictoken.py   | 18 +++++++---------
 .../compressed_tensors_w8a8_statictensor.py   | 16 --------------
 4 files changed, 21 insertions(+), 36 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index c7f047845..e134a26ef 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -26,7 +26,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         return []
 
     def get_supported_act_dtypes(cls) -> List[torch.dtype]:
-        return [torch.float16]
+        return [torch.float16, torch.bfloat16]
 
     # Need to figure it out
     def get_min_capability(self) -> int:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py
index 90446a5ff..373458cff 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py
@@ -64,10 +64,9 @@ class CompressedTensorsW4A16(CompressedTensorsScheme):
                 "input_dim": 1,
                 "output_dim": 0,
                 "packed_dim": 1,
-                "pack_factor": pack_factor
+                "pack_factor": pack_factor,
+                "weight_loader": weight_loader
             })
-        set_weight_attrs(weight, {"weight_loader": weight_loader})
-
         layer.register_parameter("weight_packed", weight)
 
         weight_scale = Parameter(
@@ -79,11 +78,12 @@ class CompressedTensorsW4A16(CompressedTensorsScheme):
             requires_grad=False,
         )
 
-        set_weight_attrs(weight_scale, {"weight_loader": weight_loader})
-        set_weight_attrs(weight_scale, {
-            "input_dim": weight_scale_dim,
-            "output_dim": 0
-        })
+        set_weight_attrs(
+            weight_scale, {
+                "weight_loader": weight_loader,
+                "input_dim": weight_scale_dim,
+                "output_dim": 0
+            })
         layer.register_parameter("weight_scale", weight_scale)
 
         # A 2D array defining the original shape of the weights
@@ -92,7 +92,10 @@ class CompressedTensorsW4A16(CompressedTensorsScheme):
                                  requires_grad=False)
 
         layer.register_parameter("weight_shape", weight_shape)
-        set_weight_attrs(weight_shape, {"weight_loader": weight_loader})
+        set_weight_attrs(weight_shape, {
+            "weight_loader": weight_loader,
+            "ignore_warning": True,
+        })
 
         layer.input_size_per_partition = input_size_per_partition
         layer.output_size_per_partition = output_size_per_partition
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
index 9bb7bf447..d514d7b28 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
@@ -48,9 +48,6 @@ class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme):
         weight_scale_dim = sum(
             output_partition_sizes) if is_tensor_partitioned else 1
 
-        weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8),
-                                      requires_grad=False)
-
         weight_scale = Parameter(torch.empty(weight_scale_dim,
                                              dtype=torch.float32),
                                  requires_grad=False)
@@ -61,21 +58,22 @@ class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme):
                            requires_grad=False)
 
         layer.register_parameter("weight", weight)
-        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
-        set_weight_attrs(weight, {"weight_loader": weight_loader})
-        set_weight_attrs(weight, {"logical_widths": output_partition_sizes})
+        set_weight_attrs(
+            weight, {
+                "input_dim": 1,
+                "output_dim": 0,
+                "weight_loader": weight_loader,
+                "logical_widths": output_partition_sizes
+            })
 
         layer.register_parameter("weight_scale", weight_scale)
-        set_weight_attrs(weight_scale, {"weight_loader": weight_loader})
         set_weight_attrs(
             weight_scale, {
+                "weight_loader": weight_loader,
                 "shard_splitter": self.scales_shard_splitter,
                 "logical_widths": output_partition_sizes
             })
 
-        layer.register_parameter("weight_zero_point", weight_zero_point)
-        set_weight_attrs(weight_zero_point, {"weight_loader": weight_loader})
-
     def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
         weight = layer.weight
         weight_scale = layer.weight_scale
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
index 88c15c5c2..414e17a06 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
@@ -39,22 +39,16 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme):
                        params_dtype: torch.dtype, weight_loader: Callable,
                        **kwargs):
 
-        # TODO: remove zero_point parameters once the configs given remove them
-
         is_tensor_partitioned = len(output_partition_sizes) != 1
         weight_scale_dim = sum(
             output_partition_sizes) if is_tensor_partitioned else 1
 
         input_scale = Parameter(torch.empty(1, dtype=torch.float32),
                                 requires_grad=False)
-        input_zero_point = Parameter(torch.empty(1, dtype=torch.int8),
-                                     requires_grad=False)
 
         weight_scale = Parameter(torch.empty(weight_scale_dim,
                                              dtype=torch.float32),
                                  requires_grad=False)
-        weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8),
-                                      requires_grad=False)
 
         weight = Parameter(torch.empty(sum(output_partition_sizes),
                                        input_size_per_partition,
@@ -72,11 +66,6 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme):
             "weight_loader": weight_loader,
             "ignore_warning": True,
         })
-        layer.register_parameter("input_zero_point", input_zero_point)
-        set_weight_attrs(input_zero_point, {
-            "weight_loader": weight_loader,
-            "ignore_warning": True,
-        })
         layer.register_parameter("weight_scale", weight_scale)
         set_weight_attrs(
             weight_scale, {
@@ -85,11 +74,6 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme):
                 "logical_widths": output_partition_sizes,
                 "ignore_warning": True,
             })
-        layer.register_parameter("weight_zero_point", weight_zero_point)
-        set_weight_attrs(weight_zero_point, {
-            "weight_loader": weight_loader,
-            "ignore_warning": True
-        })
 
     def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
         weight = layer.weight
-- 
GitLab


From 348616ac4b72e2acc6e9a60ae94cf0f7fc29ac31 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Fri, 14 Jun 2024 13:02:00 -0400
Subject: [PATCH 042/376] [Kernel] Suppress mma.sp warning on CUDA 12.5 and
 later (#5401)

---
 csrc/quantization/marlin/sparse/common/mma.h | 74 +++++++++++---------
 1 file changed, 42 insertions(+), 32 deletions(-)

diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h
index 45ab67a78..8a6c65338 100644
--- a/csrc/quantization/marlin/sparse/common/mma.h
+++ b/csrc/quantization/marlin/sparse/common/mma.h
@@ -20,6 +20,19 @@
 
 namespace marlin_24 {
 
+// On CUDA earlier than 12.5, the ordered_metadata version of this instruction
+// is not supported. On later versions of CUDA the version without ordered
+// metadata results in the following warning:
+//  | Advisory: Modifier ‘.sp::ordered_metadata’ should be used on instruction
+//  | ‘mma’ instead of modifier ‘.sp’ as it is expected to have substantially
+//  | reduced performance on some future architectures
+#if defined CUDA_VERSION && CUDA_VERSION >= 12500
+  #define MMA_SP_INST \
+    "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
+#else
+  #define MMA_SP_INST "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
+#endif
+
 // m16n8k32 sparse tensor core mma instruction with fp16 inputs and fp32
 // output/accumulation.
 __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1,
@@ -29,41 +42,38 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1,
   const uint32_t* a1 = reinterpret_cast<const uint32_t*>(&a_frag1);
   const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
   const uint32_t* e = reinterpret_cast<const uint32_t*>(&frag_m);
+
   float* c = reinterpret_cast<float*>(&frag_c);
   if (psel == 0) {
-    asm volatile(
-        "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
-        "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
-        "{%12,%13,%14,%15}, %16, 0x0;\n"
-        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-        : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]), "r"(b[2]),
-          "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]),
-          "r"(e[0]));
-    asm volatile(
-        "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
-        "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
-        "{%12,%13,%14,%15}, %16, 0x0;\n"
-        : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
-        : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), "r"(b[3]),
-          "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), "f"(c[6]), "f"(c[7]),
-          "r"(e[0]));
+    asm volatile(MMA_SP_INST
+                 "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
+                 "{%12,%13,%14,%15}, %16, 0x0;\n"
+                 : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+                 : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]),
+                   "r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]),
+                   "f"(c[2]), "f"(c[3]), "r"(e[0]));
+    asm volatile(MMA_SP_INST
+                 "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
+                 "{%12,%13,%14,%15}, %16, 0x0;\n"
+                 : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
+                 : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]),
+                   "r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]),
+                   "f"(c[6]), "f"(c[7]), "r"(e[0]));
   } else {
-    asm volatile(
-        "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
-        "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
-        "{%12,%13,%14,%15}, %16, 0x1;\n"
-        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-        : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]), "r"(b[2]),
-          "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]),
-          "r"(e[0]));
-    asm volatile(
-        "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
-        "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
-        "{%12,%13,%14,%15}, %16, 0x1;\n"
-        : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
-        : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), "r"(b[3]),
-          "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), "f"(c[6]), "f"(c[7]),
-          "r"(e[0]));
+    asm volatile(MMA_SP_INST
+                 "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
+                 "{%12,%13,%14,%15}, %16, 0x1;\n"
+                 : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+                 : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]),
+                   "r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]),
+                   "f"(c[2]), "f"(c[3]), "r"(e[0]));
+    asm volatile(MMA_SP_INST
+                 "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
+                 "{%12,%13,%14,%15}, %16, 0x1;\n"
+                 : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
+                 : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]),
+                   "r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]),
+                   "f"(c[6]), "f"(c[7]), "r"(e[0]));
   }
 }
 
-- 
GitLab


From 48f589e18b8b6758dbfb6bb23b2994430893b477 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 14 Jun 2024 10:02:23 -0700
Subject: [PATCH 043/376] [mis] fix flaky test of
 test_cuda_device_count_stateless (#5546)

---
 tests/distributed/test_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py
index b7ec59c7a..923ad66c2 100644
--- a/tests/distributed/test_utils.py
+++ b/tests/distributed/test_utils.py
@@ -23,7 +23,8 @@ def test_cuda_device_count_stateless():
     CUDA_VISIBLE_DEVICES is changed."""
 
     actor = _CUDADeviceCountStatelessTestActor.options(num_gpus=2).remote()
-    assert ray.get(actor.get_cuda_visible_devices.remote()) == "0,1"
+    assert sorted(ray.get(
+        actor.get_cuda_visible_devices.remote()).split(",")) == ["0", "1"]
     assert ray.get(actor.get_count.remote()) == 2
     ray.get(actor.set_cuda_visible_devices.remote("0"))
     assert ray.get(actor.get_count.remote()) == 1
-- 
GitLab


From 77490c6f2f1e99982d2553832a42980bbdee820c Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 15 Jun 2024 01:04:42 +0800
Subject: [PATCH 044/376] [Core] Remove duplicate processing in async engine
 (#5525)

---
 vllm/engine/async_llm_engine.py | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 943402c86..03b6d03a9 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -580,21 +580,9 @@ class AsyncLLMEngine:
         if arrival_time is None:
             arrival_time = time.time()
 
-        if self.engine_use_ray:
-            processed_inputs = await self.engine.process_model_inputs_async \
-                .remote(  # type: ignore
-                    request_id=request_id,
-                    inputs=inputs,
-                    lora_request=lora_request)
-        else:
-            processed_inputs = await self.engine.process_model_inputs_async(
-                request_id=request_id,
-                inputs=inputs,
-                lora_request=lora_request)
-
         stream = self._request_tracker.add_request(
             request_id,
-            inputs=processed_inputs,
+            inputs=inputs,
             params=params,
             arrival_time=arrival_time,
             lora_request=lora_request,
-- 
GitLab


From d1c3d7d1398c26fa5afd4583a58fceca76555c2a Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 14 Jun 2024 10:59:28 -0700
Subject: [PATCH 045/376] [misc][distributed] fix benign error in
 `is_in_the_same_node` (#5512)

---
 vllm/distributed/parallel_state.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index f6a2fc9b0..16c5297af 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -23,8 +23,9 @@ import contextlib
 from collections import namedtuple
 from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass
-from multiprocessing import resource_tracker, shared_memory
+from multiprocessing import shared_memory
 from typing import Any, Dict, List, Optional, Tuple, Union
+from unittest.mock import patch
 
 import torch
 from torch.distributed import Backend, ProcessGroup
@@ -744,7 +745,12 @@ def is_in_the_same_node(pg: ProcessGroup):
                                                         src=ranks[0],
                                                         group=pg)
                 name = recv[0]
-                shm = shared_memory.SharedMemory(name=name)
+                # fix to https://stackoverflow.com/q/62748654/9191338
+                # Python incorrectly tracks shared memory even if it is not
+                # created by the process. The following patch is a workaround.
+                with patch("multiprocessing.resource_tracker.register",
+                           lambda *args, **kwargs: None):
+                    shm = shared_memory.SharedMemory(name=name)
                 if shm.buf[:len(magic_message)] == magic_message:
                     is_in_the_same_node[rank] = 1
     except Exception as e:
@@ -757,14 +763,8 @@ def is_in_the_same_node(pg: ProcessGroup):
 
     # clean up the shared memory segment
     with contextlib.suppress(OSError):
-        if rank == 0:
-            if shm:
-                shm.unlink()
-        else:
-            if shm:
-                # fix to https://stackoverflow.com/q/62748654/9191338
-                resource_tracker.unregister(
-                    shm._name, "shared_memory")  # type: ignore[attr-defined]
+        if rank == 0 and shm:
+            shm.unlink()
     torch.distributed.all_reduce(is_in_the_same_node, group=pg)
 
     return is_in_the_same_node.sum().item() == world_size
-- 
GitLab


From cdab68dcdb7a68b46b8138f73cdd6ac26ff6d9c0 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Fri, 14 Jun 2024 13:17:21 -0500
Subject: [PATCH 046/376] [Docs] Add ZhenFund as a Sponsor (#5548)

---
 README.md                         | 1 +
 docs/source/community/sponsors.md | 1 +
 2 files changed, 2 insertions(+)

diff --git a/README.md b/README.md
index ce2d8d5fd..c24768bf7 100644
--- a/README.md
+++ b/README.md
@@ -112,6 +112,7 @@ vLLM is a community project. Our compute resources for development and testing a
 - Trainy
 - UC Berkeley
 - UC San Diego
+- ZhenFund
 
 We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
 
diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md
index c8f2c16d3..cd8e8b0f5 100644
--- a/docs/source/community/sponsors.md
+++ b/docs/source/community/sponsors.md
@@ -22,5 +22,6 @@ vLLM is a community project. Our compute resources for development and testing a
 - Trainy
 - UC Berkeley
 - UC San Diego
+- ZhenFund
 
 We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
-- 
GitLab


From 6e2527a7cb94fa9154e34a42b95c1e4eb9a83e01 Mon Sep 17 00:00:00 2001
From: Sanger Steel <sangersteel@gmail.com>
Date: Fri, 14 Jun 2024 14:27:57 -0400
Subject: [PATCH 047/376] [Doc] Update documentation on Tensorizer (#5471)

---
 docs/source/index.rst              |  1 +
 docs/source/serving/tensorizer.rst | 12 ++++++++++++
 vllm/engine/arg_utils.py           |  2 +-
 3 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/serving/tensorizer.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index b7c0d5b88..f5d862759 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -81,6 +81,7 @@ Documentation
    serving/env_vars
    serving/usage_stats
    serving/integrations
+   serving/tensorizer
 
 .. toctree::
    :maxdepth: 1
diff --git a/docs/source/serving/tensorizer.rst b/docs/source/serving/tensorizer.rst
new file mode 100644
index 000000000..a44696507
--- /dev/null
+++ b/docs/source/serving/tensorizer.rst
@@ -0,0 +1,12 @@
+.. _tensorizer:
+
+Loading Models with CoreWeave's Tensorizer
+==========================================
+vLLM supports loading models with `CoreWeave's Tensorizer <https://docs.coreweave.com/coreweave-machine-learning-and-ai/inference/tensorizer>`_.
+vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized
+at runtime extremely quickly directly to the GPU, resulting in significantly
+shorter Pod startup times and CPU memory usage. Tensor encryption is also supported.
+
+For more information on CoreWeave's Tensorizer, please refer to
+`CoreWeave's Tensorizer documentation <https://github.com/coreweave/tensorizer>`_. For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see
+the `vLLM example script <https://docs.vllm.ai/en/stable/getting_started/examples/tensorize_vllm_model.html>`_.
\ No newline at end of file
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 227de5475..ba53b5c86 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -230,7 +230,7 @@ class EngineArgs:
             '* "dummy" will initialize the weights with random values, '
             'which is mainly for profiling.\n'
             '* "tensorizer" will load the weights using tensorizer from '
-            'CoreWeave. See the Tensorize vLLM Model script in the Examples'
+            'CoreWeave. See the Tensorize vLLM Model script in the Examples '
             'section for more information.\n'
             '* "bitsandbytes" will load the weights using bitsandbytes '
             'quantization.\n')
-- 
GitLab


From e2afb03c92a06700d296a2e7f6565d4a4f05168c Mon Sep 17 00:00:00 2001
From: Thomas Parnell <tpa@zurich.ibm.com>
Date: Fri, 14 Jun 2024 22:28:11 +0200
Subject: [PATCH 048/376] [Bugfix] Enable loading FP8 checkpoints for
 gpt_bigcode models  (#5460)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
---
 vllm/model_executor/models/gpt_bigcode.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index 69b75763e..b15ed1198 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -299,4 +299,10 @@ class GPTBigCodeForCausalLM(nn.Module):
             param = params_dict[name]
             weight_loader = getattr(param, "weight_loader",
                                     default_weight_loader)
-            weight_loader(param, loaded_weight)
+            # TODO (@robertgshaw2-neuralmagic): move to fp8 linear method
+            if "c_attn.input_scale" in name or "c_attn.weight_scale" in name:
+                weight_loader(param, loaded_weight, 'q')
+                weight_loader(param, loaded_weight, 'k')
+                weight_loader(param, loaded_weight, 'v')
+            else:
+                weight_loader(param, loaded_weight)
-- 
GitLab


From 28c145eb5755902505c066dc3b1e5315572cc6e7 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Fri, 14 Jun 2024 14:40:09 -0700
Subject: [PATCH 049/376] [Bugfix] Fix typo in Pallas backend (#5558)

---
 vllm/attention/backends/pallas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index 75f246526..b203c5ec5 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -110,7 +110,7 @@ class PallasAttentionBackendImpl(AttentionImpl):
             raise NotImplementedError("TPU version must be 4 or higher.")
 
         self.megacore_mode = None
-        tpu_type = torch_xla.tpu.get_tp_groupu_env()["TYPE"].lower()
+        tpu_type = torch_xla.tpu.get_tpu_env()["TYPE"].lower()
         if not tpu_type.endswith("lite"):
             if self.num_kv_heads % 2 == 0:
                 self.megacore_mode = "kv_head"
-- 
GitLab


From f5bb85b435e6fe3db57fae1e25e09914015ef957 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 14 Jun 2024 14:47:45 -0700
Subject: [PATCH 050/376] [Core][Distributed] improve p2p cache generation
 (#5528)

---
 .../device_communicators/cuda_wrapper.py      | 146 ++++++++++++
 .../custom_all_reduce_utils.py                | 215 ++++++++++--------
 2 files changed, 265 insertions(+), 96 deletions(-)
 create mode 100644 vllm/distributed/device_communicators/cuda_wrapper.py

diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py
new file mode 100644
index 000000000..24308235c
--- /dev/null
+++ b/vllm/distributed/device_communicators/cuda_wrapper.py
@@ -0,0 +1,146 @@
+"""This file is a pure Python wrapper for the cudart library.
+It avoids the need to compile a separate shared library, and is
+convenient for use when we just need to call a few functions.
+"""
+
+import ctypes
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+# this line makes it possible to directly load `libcudart.so` using `ctypes`
+import torch  # noqa
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+# === export types and functions from cudart to Python ===
+# for the original cudart definition, please check
+# https://docs.nvidia.com/cuda/cuda-runtime-api/index.html
+
+cudaError_t = ctypes.c_int
+cudaMemcpyKind = ctypes.c_int
+
+
+class cudaIpcMemHandle_t(ctypes.Structure):
+    _fields_ = [("internal", ctypes.c_byte * 128)]
+
+
+@dataclass
+class Function:
+    name: str
+    restype: Any
+    argtypes: List[Any]
+
+
+class CudaRTLibrary:
+    exported_functions = [
+        # ​cudaError_t cudaSetDevice ( int  device )
+        Function("cudaSetDevice", cudaError_t, [ctypes.c_int]),
+        # cudaError_t 	cudaDeviceSynchronize ( void )
+        Function("cudaDeviceSynchronize", cudaError_t, []),
+        # ​cudaError_t cudaDeviceReset ( void )
+        Function("cudaDeviceReset", cudaError_t, []),
+
+        # const char* 	cudaGetErrorString ( cudaError_t error )
+        Function("cudaGetErrorString", ctypes.c_char_p, [cudaError_t]),
+
+        # ​cudaError_t 	cudaMalloc ( void** devPtr, size_t size )
+        Function("cudaMalloc", cudaError_t,
+                 [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t]),
+        # ​cudaError_t 	cudaFree ( void* devPtr )
+        Function("cudaFree", cudaError_t, [ctypes.c_void_p]),
+        # ​cudaError_t cudaMemset ( void* devPtr, int  value, size_t count )
+        Function("cudaMemset", cudaError_t,
+                 [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]),
+        # ​cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ) # noqa
+        Function("cudaMemcpy", cudaError_t, [
+            ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, cudaMemcpyKind
+        ]),
+
+        # cudaError_t cudaIpcGetMemHandle ( cudaIpcMemHandle_t* handle, void* devPtr ) # noqa
+        Function("cudaIpcGetMemHandle", cudaError_t,
+                 [ctypes.POINTER(cudaIpcMemHandle_t), ctypes.c_void_p]),
+        # ​cudaError_t cudaIpcOpenMemHandle ( void** devPtr, cudaIpcMemHandle_t handle, unsigned int  flags ) # noqa
+        Function("cudaIpcOpenMemHandle", cudaError_t, [
+            ctypes.POINTER(ctypes.c_void_p), cudaIpcMemHandle_t, ctypes.c_uint
+        ]),
+    ]
+
+    # class attribute to store the mapping from the path to the library
+    # to avoid loading the same library multiple times
+    path_to_library_cache: Dict[str, Any] = {}
+
+    # class attribute to store the mapping from library path
+    #  to the corresponding dictionary
+    path_to_dict_mapping: Dict[str, Dict[str, Any]] = {}
+
+    def __init__(self, so_file: Optional[str] = None):
+        if so_file is None:
+            assert torch.version.cuda is not None
+            major_version = torch.version.cuda.split(".")[0]
+            so_file = f"libcudart.so.{major_version}"
+        if so_file not in CudaRTLibrary.path_to_library_cache:
+            lib = ctypes.CDLL(so_file)
+            CudaRTLibrary.path_to_library_cache[so_file] = lib
+        self.lib = CudaRTLibrary.path_to_library_cache[so_file]
+
+        if so_file not in CudaRTLibrary.path_to_dict_mapping:
+            _funcs = {}
+            for func in CudaRTLibrary.exported_functions:
+                f = getattr(self.lib, func.name)
+                f.restype = func.restype
+                f.argtypes = func.argtypes
+                _funcs[func.name] = f
+            CudaRTLibrary.path_to_dict_mapping[so_file] = _funcs
+        self.funcs = CudaRTLibrary.path_to_dict_mapping[so_file]
+
+    def CUDART_CHECK(self, result: cudaError_t) -> None:
+        if result != 0:
+            error_str = self.cudaGetErrorString(result)
+            raise RuntimeError(f"CUDART error: {error_str}")
+
+    def cudaGetErrorString(self, error: cudaError_t) -> str:
+        return self.funcs["cudaGetErrorString"](error).decode("utf-8")
+
+    def cudaSetDevice(self, device: int) -> None:
+        self.CUDART_CHECK(self.funcs["cudaSetDevice"](device))
+
+    def cudaDeviceSynchronize(self) -> None:
+        self.CUDART_CHECK(self.funcs["cudaDeviceSynchronize"]())
+
+    def cudaDeviceReset(self) -> None:
+        self.CUDART_CHECK(self.funcs["cudaDeviceReset"]())
+
+    def cudaMalloc(self, size: int) -> ctypes.c_void_p:
+        devPtr = ctypes.c_void_p()
+        self.CUDART_CHECK(self.funcs["cudaMalloc"](ctypes.byref(devPtr), size))
+        return devPtr
+
+    def cudaFree(self, devPtr: ctypes.c_void_p) -> None:
+        self.CUDART_CHECK(self.funcs["cudaFree"](devPtr))
+
+    def cudaMemset(self, devPtr: ctypes.c_void_p, value: int,
+                   count: int) -> None:
+        self.CUDART_CHECK(self.funcs["cudaMemset"](devPtr, value, count))
+
+    def cudaMemcpy(self, dst: ctypes.c_void_p, src: ctypes.c_void_p,
+                   count: int) -> None:
+        cudaMemcpyDefault = 4
+        kind = cudaMemcpyDefault
+        self.CUDART_CHECK(self.funcs["cudaMemcpy"](dst, src, count, kind))
+
+    def cudaIpcGetMemHandle(self,
+                            devPtr: ctypes.c_void_p) -> cudaIpcMemHandle_t:
+        handle = cudaIpcMemHandle_t()
+        self.CUDART_CHECK(self.funcs["cudaIpcGetMemHandle"](
+            ctypes.byref(handle), devPtr))
+        return handle
+
+    def cudaIpcOpenMemHandle(self,
+                             handle: cudaIpcMemHandle_t) -> ctypes.c_void_p:
+        cudaIpcMemLazyEnablePeerAccess = 1
+        devPtr = ctypes.c_void_p()
+        self.CUDART_CHECK(self.funcs["cudaIpcOpenMemHandle"](
+            ctypes.byref(devPtr), handle, cudaIpcMemLazyEnablePeerAccess))
+        return devPtr
diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index c9573edb0..e6957b119 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -1,87 +1,98 @@
+import ctypes
 import json
 import os
-import sys
-import tempfile
-import time
-from contextlib import contextmanager
-from typing import Callable, Dict, List, Optional
+from itertools import product
+from typing import Dict, Optional, Sequence
 
-import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
 
 import vllm.envs as envs
+from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
 from vllm.logger import init_logger
 from vllm.utils import cuda_device_count_stateless
 
 logger = init_logger(__name__)
 
 
-@contextmanager
-def mute_output():
-    with open(os.devnull, "w") as f:
-        sys.stderr = f
-        sys.stdout = f
-        yield
-
-
-def producer(i: int,
-             init_method: str,
+def producer(batch_src: Sequence[int],
+             producer_queue,
+             consumer_queue,
+             result_queue,
              cuda_visible_devices: Optional[str] = None):
     if cuda_visible_devices is not None:
         os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
-    with mute_output():
-        dist.init_process_group(
-            backend="gloo",
-            init_method=init_method,
-            world_size=2,
-            rank=0,
-        )
-        # produce a tensor in GPU i
-        data = torch.zeros((128, ), device=f"cuda:{i}")
-        # get the information to reconstruct the shared tensor
-        func, args = torch.multiprocessing.reductions.reduce_tensor(data)
-        args = list(args)
-        dist.broadcast_object_list([(func, args)], src=0)
-        dist.barrier()
-        torch.cuda.synchronize()
-        assert torch.all(data == 1).item()
-
-
-def consumer(j: int,
-             init_method: str,
+
+    lib = CudaRTLibrary()
+    for i in batch_src:
+        lib.cudaSetDevice(i)
+        pointer = lib.cudaMalloc(1024)
+        lib.cudaMemset(pointer, 1, 1024)
+        lib.cudaDeviceSynchronize()
+        handle = lib.cudaIpcGetMemHandle(pointer)
+        producer_queue.put(handle)
+        open_success = consumer_queue.get()
+        if open_success:
+            # use two queues to simulate barrier
+            producer_queue.put(0)
+            consumer_queue.get()
+            # check if the memory is modified
+            host_data = (ctypes.c_char * 1024)()
+            lib.cudaMemcpy(host_data, pointer, 1024)  # type: ignore
+            for i in range(1024):
+                if ord(host_data[i]) != 2:
+                    open_success = False
+                    break
+        result_queue.put(open_success)
+        lib.cudaDeviceReset()
+
+
+def consumer(batch_tgt: Sequence[int],
+             producer_queue,
+             consumer_queue,
+             result_queue,
              cuda_visible_devices: Optional[str] = None):
     if cuda_visible_devices is not None:
         os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
-    with mute_output():
-        dist.init_process_group(
-            backend="gloo",
-            init_method=init_method,
-            world_size=2,
-            rank=1,
-        )
-        torch.cuda.set_device(j)
-        recv = [None]
-        dist.broadcast_object_list(recv, src=0)
-        func: Callable
-        args: List
-        func, args = recv[0]  # type: ignore
-        # `args[6]` is the device id
-        # by default pytorch will use `i` from the producer
-        # here we need to set it to `j` to test P2P access
-        args[6] = j
-        data = func(*args)
-        data += 1
-        dist.barrier()
-        torch.cuda.synchronize()
-        assert torch.all(data == 1).item()
-
-
-def can_actually_p2p(i, j):
+
+    lib = CudaRTLibrary()
+    for j in batch_tgt:
+        lib.cudaSetDevice(j)
+        handle = producer_queue.get()
+        open_success = False
+        try:
+            pointer = lib.cudaIpcOpenMemHandle(handle)  # type: ignore
+            open_success = True
+        except RuntimeError:
+            # cannot error out here, because the producer process
+            # is still waiting for the response.
+            pass
+        consumer_queue.put(open_success)
+        if open_success:
+            # modify the memory
+            lib.cudaMemset(pointer, 2, 1024)
+            # use two queues to simulate barrier
+            producer_queue.get()
+            consumer_queue.put(0)
+            # check if the memory is modified
+            host_data = (ctypes.c_char * 1024)()
+            lib.cudaMemcpy(host_data, pointer, 1024)  # type: ignore
+            for i in range(1024):
+                if ord(host_data[i]) != 2:
+                    open_success = False
+                    break
+        result_queue.put(open_success)
+        lib.cudaDeviceReset()
+
+
+def can_actually_p2p(
+    batch_src: Sequence[int],
+    batch_tgt: Sequence[int],
+):
     """
     Usually, checking if P2P access is enabled can be done by
-    `torch.cuda.can_device_access_peer(i, j)`. However, sometimes
-    the driver might be broken, and `torch.cuda.can_device_access_peer(i, j)`
+    `torch.cuda.can_device_access_peer(src, tgt)`. However, sometimes
+    the driver might be broken, and `torch.cuda.can_device_access_peer(src, tgt)`
     returns `True` even if P2P access is not actually possible.
     See https://github.com/vllm-project/vllm/issues/2728 and
     https://forums.developer.nvidia.com/t/direct-gpu-gpu-communication-does-not-seem-to-work-properly/283264/10
@@ -90,41 +101,50 @@ def can_actually_p2p(i, j):
 
     Note on p2p and cuda IPC:
     Usually, one process uses one GPU:
-    GPU i --> cuda context i --> tensor i --> process i
+    GPU src --> cuda context src --> tensor src --> process src
 
     We need to combine p2p and cuda IPC, so that:
-    GPU i --> cuda context i --> tensor i --> process i
-                                 |shared|
-    GPU j --> cuda context j --> tensor j --> process j
-    That is to say, process i creates a tensor in GPU i, passes IPC handle to
-    process j, and process j accesses the tensor in GPU j. Any operation on the
-    tensor in process j will be reflected in the tensor in process i, because
+    GPU src --> cuda context src --> tensor src --> process src
+                                      |shared|
+    GPU tgt --> cuda context tgt --> tensor tgt --> process tgt
+    That is to say, process src creates a tensor in GPU src, passes IPC handle to
+    process tgt, and process tgt accesses the tensor in GPU tgt. Any operation on the
+    tensor in process tgt will be reflected in the tensor in process src, because
     they are the same memory segment.
-    It is important to note that process j accesses the tensor in GPU j, not
-    GPU i. That's why we need p2p access. # noqa
-    """
+    It is important to note that process tgt accesses the tensor in GPU tgt, not
+    GPU src. That's why we need p2p access.
+
+    The most time-consuming part is the process creation. To avoid creating
+    processes for every pair of GPUs, we use batched testing. We create two
+    processes for testing all pairs of GPUs in batch. The trick is to reset
+    the device after each test (which is not available in PyTorch).
+    """  # noqa
     cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
     # pass the CUDA_VISIBLE_DEVICES to the child process
     # to make sure they see the same set of GPUs
 
-    # make sure the temp file is not the same across different calls
-    temp_path = tempfile.mktemp() + str(time.time())
-    # create an empty file
-    with open(temp_path, "w"):
-        pass
-    init_method = f"file://{temp_path}"
-
     # make sure the processes are spawned
     smp = mp.get_context("spawn")
-    pi = smp.Process(target=producer,
-                     args=(i, init_method, cuda_visible_devices))
-    pj = smp.Process(target=consumer,
-                     args=(j, init_method, cuda_visible_devices))
-    pi.start()
-    pj.start()
-    pi.join()
-    pj.join()
-    return pi.exitcode == 0 and pj.exitcode == 0
+    producer_queue = smp.Queue()
+    consumer_queue = smp.Queue()
+    result_queue = smp.Queue()
+    p_src = smp.Process(target=producer,
+                        args=(batch_src, producer_queue, consumer_queue,
+                              result_queue, cuda_visible_devices))
+    p_tgt = smp.Process(target=consumer,
+                        args=(batch_tgt, producer_queue, consumer_queue,
+                              result_queue, cuda_visible_devices))
+    p_src.start()
+    p_tgt.start()
+    p_src.join()
+    p_tgt.join()
+    result = []
+    for src, tgt in zip(batch_src, batch_tgt):
+        a = result_queue.get()
+        b = result_queue.get()
+        assert a == b
+        result.append(a)
+    return result
 
 
 # why do we need this cache?
@@ -142,14 +162,14 @@ def can_actually_p2p(i, j):
 _gpu_p2p_access_cache: Optional[Dict[str, bool]] = None
 
 
-def gpu_p2p_access_check(i: int, j: int) -> bool:
-    """Check if GPU i can access GPU j."""
+def gpu_p2p_access_check(src: int, tgt: int) -> bool:
+    """Check if GPU src can access GPU tgt."""
 
     # if the cache variable is already calculated,
     # read from the cache instead of checking it again
     global _gpu_p2p_access_cache
     if _gpu_p2p_access_cache is not None:
-        return _gpu_p2p_access_cache[f"{i}->{j}"]
+        return _gpu_p2p_access_cache[f"{src}->{tgt}"]
 
     is_distributed = dist.is_initialized()
 
@@ -169,9 +189,12 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
         #  enter this block to calculate the cache
         logger.info("generating GPU P2P access cache in %s", path)
         cache = {}
-        for _i in range(num_dev):
-            for _j in range(num_dev):
-                cache[f"{_i}->{_j}"] = can_actually_p2p(_i, _j)
+        ids = list(range(num_dev))
+        # batch of all pairs of GPUs
+        batch_src, batch_tgt = zip(*list(product(ids, ids)))
+        result = can_actually_p2p(batch_src, batch_tgt)
+        for _i, _j, r in zip(batch_src, batch_tgt, result):
+            cache[f"{_i}->{_j}"] = r
         with open(path, "w") as f:
             json.dump(cache, f, indent=4)
     if is_distributed:
@@ -180,7 +203,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
     with open(path, "r") as f:
         cache = json.load(f)
     _gpu_p2p_access_cache = cache
-    return _gpu_p2p_access_cache[f"{i}->{j}"]
+    return _gpu_p2p_access_cache[f"{src}->{tgt}"]
 
 
 __all__ = ["gpu_p2p_access_check"]
-- 
GitLab


From bd7efe95d03773c65fa7dc1e122f3ce0e079a542 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Fri, 14 Jun 2024 19:18:22 -0500
Subject: [PATCH 051/376] Add ccache to amd (#5555)

---
 .buildkite/test-template-aws.j2 | 1 +
 Dockerfile.rocm                 | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2
index 09649b625..01f7ff1e0 100644
--- a/.buildkite/test-template-aws.j2
+++ b/.buildkite/test-template-aws.j2
@@ -30,6 +30,7 @@ steps:
         command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe  }} ; {{ step.command  or (step.commands | join(" ; ")) | safe }}"
         env:
           DOCKER_BUILDKIT: "1"
+        priority: 100
         soft_fail: true
     {% endif %}
     {% endfor %}
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 954958df8..724fa1673 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -42,6 +42,7 @@ RUN apt-get update && apt-get install -y \
     unzip \
     nvidia-cuda-toolkit \
     tmux \
+    ccache \
  && rm -rf /var/lib/apt/lists/*
 
 ### Mount Point ###
@@ -102,7 +103,9 @@ ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
 
 ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so
 
-RUN --mount=type=cache,target=/root/.cache/pip \
+ENV CCACHE_DIR=/root/.cache/ccache
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/root/.cache/pip \
     pip install -U -r requirements-rocm.txt \
     && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
     && python3 setup.py install \
-- 
GitLab


From 1b8a0d71cf5aa1a43c14478ec90538c3fbe1b315 Mon Sep 17 00:00:00 2001
From: leiwen83 <leiwen83@users.noreply.github.com>
Date: Sat, 15 Jun 2024 08:23:56 +0800
Subject: [PATCH 052/376] [Core][Bugfix]: fix prefix caching for blockv2
 (#5364)

Signed-off-by: Lei Wen <wenlei03@qiyi.com>
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
---
 tests/core/block/e2e/test_correctness.py | 67 ++++++++++++++++++++++++
 vllm/core/block/prefix_caching_block.py  |  7 ++-
 2 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py
index ad253635e..8502eab0f 100644
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -477,3 +477,70 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
         assert expected_token_ids == actual_token_ids
 
     assert baseline_token_ids == test_token_ids
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Use a small model for a fast test.
+        "model": "facebook/opt-125m",
+
+        # skip cuda graph creation for fast test.
+        "enforce_eager": True,
+
+        # we keep the blocks small, so that hit eviction quickly
+        "max_model_len": 48,
+        "block_size": 16,
+        "num_gpu_blocks_override": 3,
+
+        # Test APC in v2 block
+        "use_v2_block_manager": True,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{
+    "enable_prefix_caching": False
+}])
+@pytest.mark.parametrize("test_llm_kwargs", [{
+    "enable_prefix_caching": True,
+}])
+@pytest.mark.parametrize("seed", [1])
+def test_auto_prefix_caching_after_evition_start(baseline_llm_generator,
+                                                 test_llm_generator):
+    """Verify block manager v2 with auto prefix caching could works normal
+    even when eviction started.
+    With APC enabled, all blocks are held by native block at the beginning.
+    Then blocks are managed by evictor instead. If cache hit at the evitor's
+    block, then it could be reused, or we need to recompute its kv cache.
+    """
+    output_len = 10
+    temperature = 0.0
+
+    prompts = [
+        "You are a helpful assistant. Please answer truthfully and write "
+        "out your thinking step by step to be sure you get the right answer. "
+        "If you make a mistake, attempt to correct it. who are you?",
+        "You are a helpful assistant. Please answer truthfully and write out "
+        "your thinking step by step to be sure you get the right answer. You "
+        "are helpful and harmless and you follow ethical guidelines. "
+        "who are you?"
+    ]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+    )
+
+    print('Getting token ids with APC disabled')
+    baseline_token_ids = get_token_ids_from_llm_generator(
+        baseline_llm_generator, prompts, sampling_params)
+
+    print('Getting token ids with APC enabled')
+    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
+                                                      prompts, sampling_params)
+
+    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
+                                                    test_token_ids):
+        assert expected_token_ids == actual_token_ids
+
+    assert baseline_token_ids == test_token_ids
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index 405e97056..88dbbfb2f 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -176,14 +176,17 @@ class PrefixCachingBlockAllocator(BlockAllocator):
 
             self._refcounter.incr(block_id)
 
-            # the block comes from evictor already contain computed result
+            # Now this block is pop from evictor and ready to write
+            # with new content which most probably different with
+            # original content. So need to tell worker to recompute
+            # its kvcache
             block = self._create_block(
                 prev_block=prev_block,
                 token_ids=[],
                 block_size=self._block_size,
                 allocator=self,
                 block_id=block_id,
-                computed=True,
+                computed=False,
             )
             assert block.content_hash is None
 
-- 
GitLab


From 0e9164b40abdb30f1929edb44b56894c9e26c31d Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 15 Jun 2024 12:45:31 +0800
Subject: [PATCH 053/376] [mypy] Enable type checking for test directory
 (#5017)

---
 .github/workflows/mypy.yaml                   |  2 +-
 benchmarks/benchmark_serving.py               | 18 +++----
 benchmarks/benchmark_throughput.py            |  4 +-
 benchmarks/kernels/benchmark_aqlm.py          | 10 ++--
 benchmarks/kernels/benchmark_marlin.py        |  8 +--
 benchmarks/kernels/benchmark_moe.py           | 26 +++++++---
 .../kernels/benchmark_paged_attention.py      | 11 ++--
 benchmarks/kernels/benchmark_rope.py          |  7 +--
 examples/fp8/extract_scales.py                | 12 ++---
 examples/offline_inference_distributed.py     |  8 +--
 format.sh                                     |  2 +-
 tests/core/block/test_block_table.py          |  8 +--
 tests/core/block/test_prefix_caching_block.py |  4 +-
 tests/core/test_chunked_prefill_scheduler.py  | 10 ++--
 tests/core/test_scheduler.py                  | 52 +++++++++----------
 tests/core/utils.py                           | 12 +++--
 tests/distributed/test_pynccl.py              |  5 +-
 tests/distributed/test_utils.py               |  5 +-
 tests/entrypoints/test_openai_server.py       |  5 +-
 tests/kernels/test_attention.py               | 33 ++++++------
 tests/kernels/test_blocksparse_attention.py   | 22 ++++----
 tests/kernels/test_cache.py                   | 32 ++++++------
 tests/kernels/test_cutlass.py                 |  4 +-
 tests/kernels/test_flash_attn.py              |  4 +-
 tests/kernels/test_pos_encoding.py            | 28 +++++-----
 tests/lora/conftest.py                        | 21 ++++++--
 tests/lora/data/long_context_test_data.py     | 24 ++++++++-
 tests/lora/test_baichuan.py                   |  6 ++-
 tests/lora/test_chatglm3.py                   |  6 ++-
 tests/lora/test_gemma.py                      |  6 ++-
 tests/lora/test_layer_variation.py            |  6 +--
 tests/lora/test_layers.py                     | 23 ++++----
 tests/lora/test_llama.py                      |  6 ++-
 tests/lora/test_long_context.py               | 15 +++---
 tests/lora/test_lora_checkpoints.py           |  4 +-
 tests/lora/test_lora_manager.py               |  6 +--
 tests/lora/test_mixtral.py                    |  6 ++-
 tests/lora/test_phi.py                        |  6 ++-
 tests/lora/test_quant_model.py                |  7 ++-
 tests/lora/utils.py                           | 18 +++----
 tests/models/test_fp8.py                      |  3 +-
 tests/prefix_caching/test_prefix_caching.py   |  5 +-
 tests/quantization/test_configs.py            |  3 +-
 tests/samplers/test_logprobs.py               | 11 ++--
 tests/samplers/test_rejection_sampler.py      |  4 +-
 tests/samplers/test_sampler.py                | 41 ++++++++-------
 tests/spec_decode/e2e/conftest.py             | 13 ++---
 tests/spec_decode/test_batch_expansion.py     |  6 ++-
 tests/spec_decode/test_multi_step_worker.py   | 19 ++++---
 tests/spec_decode/test_spec_decode_worker.py  | 17 ++++--
 tests/spec_decode/utils.py                    | 14 +++--
 tests/test_cache_block_hashing.py             |  2 +-
 tests/test_logger.py                          |  1 +
 tests/tokenization/test_detokenize.py         |  4 +-
 tests/utils.py                                |  2 +-
 tests/worker/test_model_runner.py             | 23 ++++----
 vllm/attention/backends/torch_sdpa.py         |  4 +-
 vllm/attention/backends/xformers.py           |  4 +-
 vllm/core/block/block_table.py                |  2 +-
 vllm/core/block/naive_block.py                |  2 +-
 vllm/core/block/prefix_caching_block.py       |  2 +-
 vllm/core/block_manager_v2.py                 |  2 +-
 .../custom_all_reduce_utils.py                |  8 +--
 .../device_communicators/pynccl_wrapper.py    |  2 +-
 vllm/engine/llm_engine.py                     |  4 +-
 vllm/engine/metrics.py                        |  4 +-
 vllm/engine/output_processor/single_step.py   |  6 +--
 vllm/entrypoints/openai/run_batch.py          |  3 +-
 vllm/entrypoints/openai/serving_chat.py       |  2 +-
 vllm/entrypoints/openai/serving_embedding.py  |  2 +-
 vllm/lora/lora.py                             |  3 +-
 vllm/lora/worker_manager.py                   |  2 +-
 vllm/model_executor/layers/linear.py          |  2 +-
 .../layers/quantization/gptq_marlin.py        | 11 ++--
 .../quantization/utils/marlin_24_perms.py     | 18 ++++---
 .../layers/quantization/utils/marlin_perms.py | 18 ++++---
 vllm/model_executor/layers/sampler.py         | 25 +++++----
 vllm/model_executor/model_loader/loader.py    |  7 +--
 .../model_loader/weight_utils.py              |  2 +-
 vllm/model_executor/models/__init__.py        |  4 +-
 vllm/model_executor/models/arctic.py          |  4 +-
 vllm/model_executor/models/commandr.py        |  4 +-
 vllm/model_executor/models/gemma.py           |  4 +-
 vllm/sequence.py                              |  2 +-
 vllm/spec_decode/multi_step_worker.py         | 10 ++--
 vllm/spec_decode/ngram_worker.py              |  6 +--
 vllm/spec_decode/spec_decode_worker.py        |  8 +--
 vllm/spec_decode/util.py                      |  4 +-
 vllm/transformers_utils/detokenizer.py        |  2 +-
 vllm/utils.py                                 | 38 ++++++++------
 vllm/worker/model_runner.py                   |  4 +-
 vllm/worker/worker_base.py                    |  4 +-
 92 files changed, 510 insertions(+), 379 deletions(-)

diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index 22e6c2ef0..62f0dbcd9 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -47,5 +47,5 @@ jobs:
         mypy vllm/model_executor  --config-file pyproject.toml
         mypy vllm/lora --config-file pyproject.toml
         mypy vllm/logging --config-file pyproject.toml
-        mypy vllm/model_executor --config-file pyproject.toml
+        mypy tests --config-file pyproject.toml
 
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index df32b366c..c136ee572 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -31,7 +31,7 @@ import time
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
-from typing import AsyncGenerator, List, Optional, Tuple
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
 
 import numpy as np
 from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
@@ -200,12 +200,12 @@ def calculate_metrics(
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
-    actual_output_lens = []
+    actual_output_lens: List[int] = []
     total_input = 0
     completed = 0
-    itls = []
-    tpots = []
-    ttfts = []
+    itls: List[float] = []
+    tpots: List[float] = []
+    ttfts: List[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
             # We use the tokenizer to count the number of output tokens for all
@@ -265,7 +265,7 @@ async def benchmark(
     disable_tqdm: bool,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
-        request_func = ASYNC_REQUEST_FUNCS.get(backend)
+        request_func = ASYNC_REQUEST_FUNCS[backend]
     else:
         raise ValueError(f"Unknown backend: {backend}")
 
@@ -292,7 +292,7 @@ async def benchmark(
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
     benchmark_start_time = time.perf_counter()
-    tasks = []
+    tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
         request_func_input = RequestFuncInput(
@@ -310,7 +310,7 @@ async def benchmark(
                              pbar=pbar)))
     outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
 
-    if not disable_tqdm:
+    if pbar is not None:
         pbar.close()
 
     benchmark_duration = time.perf_counter() - benchmark_start_time
@@ -466,7 +466,7 @@ def main(args: argparse.Namespace):
 
     # Save config and results to json
     if args.save_result:
-        result_json = {}
+        result_json: Dict[str, Any] = {}
 
         # Setup
         current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 463d9973d..48dfce428 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -108,8 +108,8 @@ def run_vllm(
     )
 
     # Add the requests to the engine.
-    prompts = []
-    sampling_params = []
+    prompts: List[str] = []
+    sampling_params: List[SamplingParams] = []
     for prompt, _, output_len in requests:
         prompts.append(prompt)
         sampling_params.append(
diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py
index 59392947b..ac6a9f297 100644
--- a/benchmarks/kernels/benchmark_aqlm.py
+++ b/benchmarks/kernels/benchmark_aqlm.py
@@ -86,9 +86,9 @@ def dequant_no_scale(
 # Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
 # the generic pytorch version.
 # Just visual comparison.
-def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
+def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
 
-    n = parts.sum().item()
+    n = int(parts.sum().item())
 
     device = torch.device('cuda:0')
 
@@ -204,7 +204,7 @@ def main():
         sys.stdout = sys.__stdout__
 
 
-def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
+def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
              methods):
 
     # I didn't see visible improvements from increasing these, but feel free :)
@@ -252,10 +252,10 @@ def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
     print('')
 
 
-def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor,
+def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
                nbooks: int, bits: int, method) -> float:
 
-    n = parts.sum().item()
+    n = int(parts.sum().item())
 
     device = torch.device('cuda:0')
 
diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
index b77191178..96f01967b 100644
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -1,4 +1,5 @@
 import argparse
+from typing import List
 
 import torch
 import torch.utils.benchmark as benchmark
@@ -23,8 +24,9 @@ ACT_ORDER_OPTS = [False, True]
 K_FULL_OPTS = [False, True]
 
 
-def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
-              size_m, size_k, size_n):
+def bench_run(results: List[benchmark.Measurement], model: str,
+              act_order: bool, is_k_full: bool, num_bits: int, group_size: int,
+              size_m: int, size_k: int, size_n: int):
     label = "Quant Matmul"
 
     sub_label = ("{}, act={} k_full={}, b={}, g={}, "
@@ -156,7 +158,7 @@ def main(args):
     for i, model in enumerate(args.models):
         print(f"[{i}]  {model}")
 
-    results = []
+    results: List[benchmark.Measurement] = []
 
     for model in args.models:
         for layer in WEIGHT_SHAPES[model]:
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index be5dd32bd..62347aaf8 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -1,7 +1,7 @@
 import argparse
 import time
 from datetime import datetime
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Tuple, TypedDict
 
 import ray
 import torch
@@ -12,8 +12,17 @@ from transformers import AutoConfig
 from vllm.model_executor.layers.fused_moe.fused_moe import *
 
 
+class BenchmarkConfig(TypedDict):
+    BLOCK_SIZE_M: int
+    BLOCK_SIZE_N: int
+    BLOCK_SIZE_K: int
+    GROUP_SIZE_M: int
+    num_warps: int
+    num_stages: int
+
+
 def benchmark_config(
-    config: Dict[str, int],
+    config: BenchmarkConfig,
     num_tokens: int,
     num_experts: int,
     shard_intermediate_size: int,
@@ -92,7 +101,7 @@ def benchmark_config(
     start_event = torch.cuda.Event(enable_timing=True)
     end_event = torch.cuda.Event(enable_timing=True)
 
-    latencies = []
+    latencies: List[float] = []
     for i in range(num_iters):
         prepare(i)
         torch.cuda.synchronize()
@@ -111,7 +120,7 @@ def get_configs_compute_bound() -> List[Dict[str, int]]:
     # Reduced search space for faster tuning.
     # TODO(woosuk): Increase the search space and use a performance model to
     # prune the search space.
-    configs = []
+    configs: List[BenchmarkConfig] = []
     for num_stages in [2, 3, 4, 5]:
         for block_m in [16, 32, 64, 128, 256]:
             for block_k in [64, 128, 256]:
@@ -175,8 +184,8 @@ class BenchmarkWorker:
         topk: int,
         dtype: torch.dtype,
         use_fp8: bool,
-        search_space: List[Dict[str, int]],
-    ) -> Dict[str, int]:
+        search_space: List[BenchmarkConfig],
+    ) -> BenchmarkConfig:
         best_config = None
         best_time = float("inf")
         for config in tqdm(search_space):
@@ -199,10 +208,11 @@ class BenchmarkWorker:
                 best_config = config
         now = datetime.now()
         print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
+        assert best_config is not None
         return best_config
 
 
-def sort_config(config: Dict[str, int]) -> Dict[str, int]:
+def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
     return {
         "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
         "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
@@ -214,7 +224,7 @@ def sort_config(config: Dict[str, int]) -> Dict[str, int]:
 
 
 def save_configs(
-    configs: Dict[int, Dict[str, int]],
+    configs: Dict[int, BenchmarkConfig],
     num_experts: int,
     shard_intermediate_size: int,
     hidden_size: int,
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index a5355f4c1..687e2369b 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -1,7 +1,7 @@
 import argparse
 import random
 import time
-from typing import Optional
+from typing import List, Optional
 
 import torch
 
@@ -54,14 +54,17 @@ def main(
 
     # Create the block tables.
     max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
-    block_tables = []
+    block_tables_lst: List[List[int]] = []
     for _ in range(num_seqs):
         block_table = [
             random.randint(0, NUM_BLOCKS - 1)
             for _ in range(max_num_blocks_per_seq)
         ]
-        block_tables.append(block_table)
-    block_tables = torch.tensor(block_tables, dtype=torch.int, device=device)
+        block_tables_lst.append(block_table)
+
+    block_tables = torch.tensor(block_tables_lst,
+                                dtype=torch.int,
+                                device=device)
 
     # Create the KV cache.
     key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index 00e55f606..a53c6c77a 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -1,11 +1,12 @@
 import argparse
 from itertools import accumulate
-from typing import Optional
+from typing import List, Optional
 
 import nvtx
 import torch
 
-from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
+                                                         get_rope)
 
 
 def benchmark_rope_kernels_multi_lora(
@@ -37,7 +38,7 @@ def benchmark_rope_kernels_multi_lora(
                             })
     # non-batched RoPE takes only one scaling factor, we create multiple
     # instances to simulate the same behavior
-    non_batched_ropes = []
+    non_batched_ropes: List[RotaryEmbedding] = []
     for scaling_factor in scaling_factors:
         non_batched_ropes.append(
             get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
diff --git a/examples/fp8/extract_scales.py b/examples/fp8/extract_scales.py
index e007a3bc0..1dce9d7e9 100644
--- a/examples/fp8/extract_scales.py
+++ b/examples/fp8/extract_scales.py
@@ -2,7 +2,7 @@ import argparse
 import glob
 import json
 import os
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple
 
 import numpy as np
 import torch
@@ -19,7 +19,7 @@ def _prepare_hf_weights(
     quantized_model_dir: str,
     load_format: str = "auto",
     fall_back_to_pt: bool = True,
-) -> Tuple[str, List[str], bool]:
+) -> Tuple[List[str], bool]:
     if not os.path.isdir(quantized_model_dir):
         raise FileNotFoundError(
             f"The quantized model directory `{quantized_model_dir}` "
@@ -94,7 +94,7 @@ def _hf_tensorfile_iterator(filename: str, load_format: str,
 
 
 def _kv_scales_extractor(
-        hf_tensor_files: Iterable[str],
+        hf_tensor_files: List[str],
         use_safetensors: bool,
         rank_keyword: str = "rank",
         expected_tp_size: Optional[int] = None) -> Dict[int, Dict[int, float]]:
@@ -115,7 +115,7 @@ def _kv_scales_extractor(
     for char in rank_keyword:
         assert not char.isdecimal(
         ), f"Rank keyword {rank_keyword} contains a numeric character!"
-    rank_scales_map = {}
+    rank_scales_map: Dict[int, Dict[int, float]] = {}
     for tensor_file in hf_tensor_files:
         try:
             rank_idx = tensor_file.find(rank_keyword)
@@ -141,7 +141,7 @@ def _kv_scales_extractor(
             raise
 
         if rank not in rank_scales_map:
-            layer_scales_map = {}
+            layer_scales_map: Dict[int, float] = {}
             rank_scales_map[rank] = layer_scales_map
         else:
             raise RuntimeError(
@@ -222,7 +222,7 @@ def _metadata_extractor(quantized_model_dir: str,
             "does not exist.")
     metadata_files = glob.glob(os.path.join(quantized_model_dir, "*.json"))
 
-    result = {}
+    result: Dict[str, Any] = {}
     for file in metadata_files:
         with open(file) as f:
             try:
diff --git a/examples/offline_inference_distributed.py b/examples/offline_inference_distributed.py
index 1e59e8950..677127844 100644
--- a/examples/offline_inference_distributed.py
+++ b/examples/offline_inference_distributed.py
@@ -5,7 +5,7 @@ distributively on a multi-nodes cluster.
 Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
 """
 
-from typing import Dict
+from typing import Any, Dict, List
 
 import numpy as np
 import ray
@@ -40,8 +40,8 @@ class LLMPredictor:
         # The output is a list of RequestOutput objects that contain the prompt,
         # generated text, and other information.
         outputs = self.llm.generate(batch["text"], sampling_params)
-        prompt = []
-        generated_text = []
+        prompt: List[str] = []
+        generated_text: List[str] = []
         for output in outputs:
             prompt.append(output.prompt)
             generated_text.append(' '.join([o.text for o in output.outputs]))
@@ -71,7 +71,7 @@ def scheduling_strategy_fn():
         pg, placement_group_capture_child_tasks=True))
 
 
-resources_kwarg = {}
+resources_kwarg: Dict[str, Any] = {}
 if tensor_parallel_size == 1:
     # For tensor_parallel_size == 1, we simply set num_gpus=1.
     resources_kwarg["num_gpus"] = 1
diff --git a/format.sh b/format.sh
index 2fd6af03b..8c54b5630 100755
--- a/format.sh
+++ b/format.sh
@@ -111,7 +111,7 @@ mypy vllm/spec_decode --config-file pyproject.toml
 mypy vllm/model_executor  --config-file pyproject.toml
 mypy vllm/lora --config-file pyproject.toml
 mypy vllm/logging --config-file pyproject.toml
-mypy vllm/model_executor --config-file pyproject.toml
+mypy tests --config-file pyproject.toml
 
 
 # If git diff returns a file that is in the skip list, the file may be checked anyway:
diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py
index 6fb95cfdf..496774c8d 100644
--- a/tests/core/block/test_block_table.py
+++ b/tests/core/block/test_block_table.py
@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest
 
 from vllm.core.block.block_table import BlockTable
@@ -28,7 +30,7 @@ def test_allocate_naive(block_size: int, sequence_len: int):
     token_ids = list(range(sequence_len))
     num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
 
-    block_tables = []
+    block_tables: List[BlockTable] = []
     for i in range(5):
         assert allocator.get_num_free_blocks(
             device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc
@@ -73,7 +75,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int):
     num_immutable_blocks_per_alloc = len(
         chunked_tokens) - num_mutable_blocks_per_alloc
 
-    block_tables = []
+    block_tables: List[BlockTable] = []
     for alloc_i in range(1, 6):
 
         block_tables.append(
@@ -268,7 +270,7 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
     )
     block_table.allocate(token_ids=token_ids, device=Device.GPU)
 
-    appended_so_far = []
+    appended_so_far: List[int] = []
     for append in chunk_list(token_ids_to_append, append_size):
         block_table.append_token_ids(append)
         appended_so_far.extend(append)
diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py
index bcf08cda0..fcf32cbe9 100644
--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -123,7 +123,7 @@ class TestPrefixCachingBlock:
                      num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]:
         """Helper method which creates a chain of blocks.
         """
-        blocks = []
+        blocks: List[PrefixCachingBlock] = []
         num_blocks = math.ceil(
             len(token_ids) / block_size) + num_empty_trailing_blocks
 
@@ -608,7 +608,7 @@ class TestPrefixCachingBlockAllocator:
     ) -> List[PrefixCachingBlock]:
         """Helper method which creates a chain of blocks.
         """
-        blocks = []
+        blocks: List[Block] = []
         num_blocks = math.ceil(len(token_ids) / block_size)
 
         if num_blocks == 0:
diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py
index f68482cc0..a3b76327e 100644
--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -483,11 +483,11 @@ def test_chunked_prefill_preempt():
     # The request should be preempted.
     scheduler.block_manager.can_append_slots = MagicMock()
 
-    def cannot_append_second_group(seq_group, num_lookahead_slots):
+    def cannot_append_second_group1(seq_group, num_lookahead_slots):
         return seq_group.request_id != "1"
 
     scheduler.block_manager.can_append_slots.side_effect = (
-        cannot_append_second_group)
+        cannot_append_second_group1)
 
     # The running prefill is now preempted.
     _, out = schedule_and_update_computed_tokens(scheduler)
@@ -505,11 +505,11 @@ def test_chunked_prefill_preempt():
     assert seq_group.get_num_uncomputed_tokens() == 30
 
     # We should be able to run prefill twice as it is chunked.
-    def cannot_append_second_group(seq_group, num_lookahead_slots):
+    def cannot_append_second_group2(seq_group, num_lookahead_slots):
         return True
 
     scheduler.block_manager.can_append_slots.side_effect = (
-        cannot_append_second_group)
+        cannot_append_second_group2)
     _, out = schedule_and_update_computed_tokens(scheduler)
     assert len(out.scheduled_seq_groups) == 1
     assert out.num_prefill_groups == 1
@@ -530,7 +530,7 @@ def test_chunked_prefill_max_seqs():
     cache_config.num_cpu_blocks = 8
     cache_config.num_gpu_blocks = 8
     scheduler = Scheduler(scheduler_config, cache_config, None)
-    running = []
+    running: List[SequenceGroup] = []
 
     _, seq_group = create_dummy_prompt("1", prompt_length=65)
     scheduler.add_seq_group(seq_group)
diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py
index 07fc8731e..bae958211 100644
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -1,6 +1,6 @@
 import time
 from collections import deque
-from typing import List
+from typing import Deque, List, Set, Tuple
 from unittest.mock import MagicMock
 
 import pytest  # noqa
@@ -65,7 +65,7 @@ def test_scheduler_abort_seq_group():
 
     # Add multiple seq groups to scheduler.
     num_seq_group = 4
-    request_ids = set()
+    request_ids: Set[str] = set()
     for i in range(num_seq_group):
         _, seq_group = create_dummy_prompt(str(i), block_size)
         scheduler.add_seq_group(seq_group)
@@ -347,7 +347,7 @@ def test_prefill_schedule_max_prompt_len():
     Test prompt longer than max_prompt_len is aborted.
     """
     scheduler = initialize_scheduler(max_model_len=30)
-    _, seq_group = create_dummy_prompt(0, prompt_length=60)
+    _, seq_group = create_dummy_prompt("0", prompt_length=60)
     waiting = deque([seq_group])
     budget = create_token_budget()
     remaining_waiting, output = scheduler._schedule_prefills(
@@ -364,7 +364,7 @@ def test_prefill_schedule_token_budget():
     Test token budget respected.
     """
     scheduler = initialize_scheduler()
-    waiting = deque()
+    waiting: Deque[SequenceGroup] = deque()
     budget = create_token_budget(token_budget=0)
     for i in range(2):
         _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
@@ -419,7 +419,7 @@ def test_prefill_schedule_max_seqs():
     Test max seq respected.
     """
     scheduler = initialize_scheduler()
-    waiting = deque()
+    waiting: Deque[SequenceGroup] = deque()
     budget = create_token_budget(max_num_seqs=2)
     for i in range(3):
         _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
@@ -453,9 +453,9 @@ def test_prefill_schedule_max_lora():
     """
     lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
     scheduler = initialize_scheduler(lora_config=lora_config)
-    waiting = deque()
+    waiting: Deque[SequenceGroup] = deque()
     budget = create_token_budget(token_budget=120)
-    curr_loras = set()
+    curr_loras: Set[int] = set()
     for i in range(2):
         _, seq_group = create_dummy_prompt(str(i),
                                            prompt_length=60,
@@ -499,7 +499,7 @@ def test_prefill_schedule_no_block_manager_capacity():
     Test sequence cannot be scheduled due to block manager has no capacity.
     """
     scheduler = initialize_scheduler()
-    waiting = deque()
+    waiting: Deque[SequenceGroup] = deque()
     budget = create_token_budget()
     for i in range(3):
         _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
@@ -536,7 +536,7 @@ def test_decode_schedule_preempted():
     Test decodes cannot be scheduled and preempted.
     """
     scheduler = initialize_scheduler()
-    running = deque()
+    running: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
     for i in range(3):
@@ -577,7 +577,7 @@ def test_decode_swap_beam_search():
     Test best_of > 1 swap out blocks
     """
     scheduler = initialize_scheduler()
-    running = deque()
+    running: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
     budget = create_token_budget()
@@ -628,7 +628,7 @@ def test_schedule_decode_blocks_to_copy_update():
     """
     scheduler = initialize_scheduler()
     _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
-    running = deque()
+    running: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
     scheduler._allocate_and_set_running(seq_group)
@@ -656,10 +656,10 @@ def test_schedule_decode_blocks_to_copy_update():
 
 def test_schedule_swapped_simple():
     scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
     scheduler._allocate_and_set_running(seq_group)
     append_new_token_seq_group(60, seq_group, 1)
@@ -683,10 +683,10 @@ def test_schedule_swapped_simple():
 
 def test_schedule_swapped_max_token_budget():
     scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     for _ in range(2):
         _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
         scheduler._allocate_and_set_running(seq_group)
@@ -717,10 +717,10 @@ def test_schedule_swapped_max_token_budget():
 
 def test_schedule_swapped_max_seqs():
     scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     for i in range(4):
         _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
         scheduler._allocate_and_set_running(seq_group)
@@ -750,10 +750,10 @@ def test_schedule_swapped_max_seqs():
 def test_schedule_swapped_max_loras():
     lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
     scheduler = initialize_scheduler(lora_config=lora_config)
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
-    curr_loras = set()
-    blocks_to_swap_out = []
+    curr_loras: Set[int] = set()
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     for i in range(2):
         _, seq_group = create_dummy_prompt(str(i),
                                            prompt_length=60,
@@ -779,10 +779,10 @@ def test_schedule_swapped_max_loras():
 
 def test_schedule_swapped_cannot_swap_in():
     scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     for _ in range(2):
         _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
         scheduler._allocate_and_set_running(seq_group)
@@ -806,10 +806,10 @@ def test_schedule_swapped_cannot_swap_in():
 
 def test_infeasible_swap():
     scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     for _ in range(2):
         _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
         scheduler._allocate_and_set_running(seq_group)
@@ -834,13 +834,13 @@ def test_infeasible_swap():
 
 def test_schedule_swapped_blocks_to_copy():
     scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
     _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
     scheduler._allocate_and_set_running(seq_group)
     append_new_token_seq_group(60, seq_group, 1)
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     scheduler._swap_out(seq_group, blocks_to_swap_out)
     swapped.append(seq_group)
 
diff --git a/tests/core/utils.py b/tests/core/utils.py
index 2fbf099c5..f249f4b59 100644
--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@@ -1,5 +1,7 @@
 import time
-from typing import Iterable, Optional, Tuple
+from typing import List, Optional
+from typing import Sequence as GenericSequence
+from typing import Tuple
 
 from vllm import SamplingParams
 from vllm.lora.request import LoRARequest
@@ -46,7 +48,7 @@ def create_dummy_prompt_encoder_decoder(
     lora_request: Optional[LoRARequest] = None,
     use_beam_search: bool = False,
     best_of: int = 1,
-) -> Tuple[Sequence, SequenceGroup]:
+) -> Tuple[Sequence, Sequence, SequenceGroup]:
     if not block_size:
         block_size = decoder_prompt_length
 
@@ -86,7 +88,7 @@ def create_dummy_prompt_encoder_decoder(
 
 def create_seq_group(
         seq_prompt_len: int = 1024,
-        seq_output_lens: Iterable[int] = (128, ),
+        seq_output_lens: GenericSequence[int] = (128, ),
         request_id: str = '0',
         seq_id_start: int = 0,
         sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:
@@ -98,7 +100,7 @@ def create_seq_group(
 
     prompt_token_ids = [0] * seq_prompt_len
 
-    seqs = []
+    seqs: List[Sequence] = []
     for seq_id_offset, output_len in enumerate(seq_output_lens):
         seq = Sequence(
             seq_id=seq_id_start + seq_id_offset,
@@ -125,7 +127,7 @@ def create_seq_group(
 
 def create_seq_group_encoder_decoder(
         seq_prompt_len: int = 1024,
-        seq_output_lens: Iterable[int] = (128, ),
+        seq_output_lens: GenericSequence[int] = (128, ),
         request_id: str = '0',
         seq_id_start: int = 0,
         sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:
diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index b788e253a..964dbc542 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -1,5 +1,6 @@
 import multiprocessing
 import os
+from typing import Dict, List
 
 import pytest
 import torch
@@ -17,9 +18,9 @@ from vllm.utils import update_environment_variables
 
 def distributed_run(fn, world_size):
     number_of_processes = world_size
-    processes = []
+    processes: List[multiprocessing.Process] = []
     for i in range(number_of_processes):
-        env = {}
+        env: Dict[str, str] = {}
         env['RANK'] = str(i)
         env['LOCAL_RANK'] = str(i)
         env['WORLD_SIZE'] = str(number_of_processes)
diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py
index 923ad66c2..49d11daca 100644
--- a/tests/distributed/test_utils.py
+++ b/tests/distributed/test_utils.py
@@ -6,7 +6,7 @@ from vllm.utils import cuda_device_count_stateless
 
 
 @ray.remote
-class _CUDADeviceCountStatelessTestActor():
+class _CUDADeviceCountStatelessTestActor:
 
     def get_count(self):
         return cuda_device_count_stateless()
@@ -22,7 +22,8 @@ def test_cuda_device_count_stateless():
     """Test that cuda_device_count_stateless changes return value if
     CUDA_VISIBLE_DEVICES is changed."""
 
-    actor = _CUDADeviceCountStatelessTestActor.options(num_gpus=2).remote()
+    actor = _CUDADeviceCountStatelessTestActor.options(  # type: ignore
+        num_gpus=2).remote()
     assert sorted(ray.get(
         actor.get_cuda_visible_devices.remote()).split(",")) == ["0", "1"]
     assert ray.get(actor.get_count.remote()) == 2
diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index 2d7e3044d..d66b9b0fd 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -1,6 +1,7 @@
 # imports for guided decoding tests
 import json
 import re
+from typing import List
 
 import jsonschema
 import openai  # use the official client for correctness check
@@ -453,7 +454,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
                                              max_tokens=5,
                                              temperature=0.0,
                                              stream=True)
-    chunks = []
+    chunks: List[str] = []
     finish_reason_count = 0
     async for chunk in stream:
         chunks.append(chunk.choices[0].text)
@@ -499,7 +500,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
         temperature=0.0,
         stream=True,
     )
-    chunks = []
+    chunks: List[str] = []
     finish_reason_count = 0
     async for chunk in stream:
         delta = chunk.choices[0].delta
diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py
index 8bc4766fc..f848ad51c 100644
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -72,27 +72,27 @@ def ref_single_query_cached_kv_attention(
     block_size = value_cache.shape[3]
     num_seqs = query.shape[0]
 
-    block_tables = block_tables.cpu().tolist()
-    seq_lens = seq_lens.cpu().tolist()
+    block_tables_lst = block_tables.cpu().tolist()
+    seq_lens_lst = seq_lens.cpu().tolist()
     for i in range(num_seqs):
         q = query[i].unsqueeze(0)
-        block_table = block_tables[i]
-        seq_len = int(seq_lens[i])
+        block_table = block_tables_lst[i]
+        seq_len = int(seq_lens_lst[i])
 
-        keys = []
-        values = []
+        keys_lst: List[torch.Tensor] = []
+        values_lst: List[torch.Tensor] = []
         for j in range(seq_len):
             block_number = int(block_table[j // block_size])
             block_offset = j % block_size
 
             k = key_cache[block_number, :, :, block_offset, :]
             k = k.reshape(num_kv_heads, head_size)
-            keys.append(k)
+            keys_lst.append(k)
 
             v = value_cache[block_number, :, :, block_offset]
-            values.append(v)
-        keys = torch.stack(keys, dim=0)
-        values = torch.stack(values, dim=0)
+            values_lst.append(v)
+        keys = torch.stack(keys_lst, dim=0)
+        values = torch.stack(values_lst, dim=0)
         if num_queries_per_kv > 1:
             # Handle MQA and GQA
             keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1)
@@ -157,14 +157,15 @@ def test_paged_attention(
 
     # Create the block tables.
     max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
-    block_tables = []
+    block_tables_lst: List[List[int]] = []
     for _ in range(num_seqs):
         block_table = [
             random.randint(0, NUM_BLOCKS - 1)
             for _ in range(max_num_blocks_per_seq)
         ]
-        block_tables.append(block_table)
-    block_tables = torch.tensor(block_tables, dtype=torch.int)
+        block_tables_lst.append(block_table)
+
+    block_tables = torch.tensor(block_tables_lst, dtype=torch.int)
 
     # Create the KV caches.
     key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1,
@@ -283,7 +284,7 @@ def ref_multi_query_kv_attention(
     dtype: torch.dtype,
 ) -> torch.Tensor:
     num_seqs = len(cu_seq_lens) - 1
-    ref_outputs = []
+    ref_outputs: List[torch.Tensor] = []
     for i in range(num_seqs):
         start_idx = cu_seq_lens[i]
         end_idx = cu_seq_lens[i + 1]
@@ -303,8 +304,8 @@ def ref_multi_query_kv_attention(
             attn_mask=attn_mask,
         )
         ref_outputs.append(ref_output)
-    ref_output = torch.cat(ref_outputs, dim=0)
-    return ref_output
+
+    return torch.cat(ref_outputs, dim=0)
 
 
 # TODO(woosuk): Add tests for USE_ALIBI=True.
diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py
index 9da13ca6e..402545d19 100644
--- a/tests/kernels/test_blocksparse_attention.py
+++ b/tests/kernels/test_blocksparse_attention.py
@@ -77,27 +77,27 @@ def ref_single_query_cached_kv_attention(
     block_size = value_cache.shape[3]
     num_seqs = query.shape[0]
 
-    block_tables = block_tables.cpu().tolist()
-    seq_lens = seq_lens.cpu().tolist()
+    block_tables_lst = block_tables.cpu().tolist()
+    seq_lens_lst = seq_lens.cpu().tolist()
     for i in range(num_seqs):
         q = query[i].unsqueeze(0)
-        block_table = block_tables[i]
-        seq_len = int(seq_lens[i])
+        block_table = block_tables_lst[i]
+        seq_len = int(seq_lens_lst[i])
 
-        keys = []
-        values = []
+        keys_lst: List[torch.Tensor] = []
+        values_lst: List[torch.Tensor] = []
         for j in range(seq_len):
             block_number = int(block_table[j // block_size])
             block_offset = j % block_size
 
             k = key_cache[block_number, :, :, block_offset, :]
             k = k.reshape(num_kv_heads, head_size)
-            keys.append(k)
+            keys_lst.append(k)
 
             v = value_cache[block_number, :, :, block_offset]
-            values.append(v)
-        keys = torch.stack(keys, dim=0)
-        values = torch.stack(values, dim=0)
+            values_lst.append(v)
+        keys = torch.stack(keys_lst, dim=0)
+        values = torch.stack(values_lst, dim=0)
         if num_queries_per_kv > 1:
             # Handle MQA and GQA
             keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1)
@@ -432,7 +432,7 @@ def test_varlen_blocksparse_attention_prefill(
         value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)
 
     ref_output = ref_multi_query_kv_attention(
-        cu_seq_lens,
+        cu_seq_lens.tolist(),
         query,
         key,
         value,
diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py
index 29572cfa5..23b6baa60 100644
--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -1,5 +1,5 @@
 import random
-from typing import Tuple
+from typing import List, Tuple
 
 import pytest
 import torch
@@ -63,7 +63,7 @@ def test_copy_blocks(
     src_blocks = random.sample(range(num_blocks), num_mappings)
     remainig_blocks = list(set(range(num_blocks)) - set(src_blocks))
     dst_blocks = random.sample(remainig_blocks, 2 * num_mappings)
-    block_mapping = []
+    block_mapping: List[Tuple[int, int]] = []
     for i in range(num_mappings):
         src = src_blocks[i]
         dst1 = dst_blocks[2 * i]
@@ -131,8 +131,8 @@ def test_reshape_and_cache(
     torch.set_default_device(device)
     # Create a random slot mapping.
     num_slots = block_size * num_blocks
-    slot_mapping = random.sample(range(num_slots), num_tokens)
-    slot_mapping = torch.tensor(slot_mapping, dtype=torch.long)
+    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
+    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long)
 
     qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype)
     _, key, value = qkv.unbind(dim=1)
@@ -170,12 +170,12 @@ def test_reshape_and_cache(
     # Run the reference implementation.
     reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
     block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
-    block_indicies = block_indicies.cpu().tolist()
+    block_indicies_lst = block_indicies.cpu().tolist()
     block_offsets = slot_mapping % block_size
-    block_offsets = block_offsets.cpu().tolist()
+    block_offsets_lst = block_offsets.cpu().tolist()
     for i in range(num_tokens):
-        block_idx = block_indicies[i]
-        block_offset = block_offsets[i]
+        block_idx = block_indicies_lst[i]
+        block_offset = block_offsets_lst[i]
         cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i]
         cloned_value_cache[block_idx, :, :, block_offset] = value[i]
 
@@ -224,8 +224,10 @@ def test_reshape_and_cache_flash(
 
     # Create a random slot mapping.
     num_slots = block_size * num_blocks
-    slot_mapping = random.sample(range(num_slots), num_tokens)
-    slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=device)
+    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
+    slot_mapping = torch.tensor(slot_mapping_lst,
+                                dtype=torch.long,
+                                device=device)
 
     qkv = torch.randn(num_tokens,
                       3,
@@ -257,13 +259,13 @@ def test_reshape_and_cache_flash(
                                 slot_mapping, kv_cache_dtype)
 
     # Run the reference implementation.
-    block_indicies = torch.div(slot_mapping, block_size, rounding_mode='floor')
-    block_indicies = block_indicies.cpu().tolist()
+    block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
+    block_indicies_lst = block_indicies.cpu().tolist()
     block_offsets = slot_mapping % block_size
-    block_offsets = block_offsets.cpu().tolist()
+    block_offsets_lst = block_offsets.cpu().tolist()
     for i in range(num_tokens):
-        block_idx = block_indicies[i]
-        block_offset = block_offsets[i]
+        block_idx = block_indicies_lst[i]
+        block_offset = block_offsets_lst[i]
         cloned_key_cache[block_idx, block_offset, :, :] = key[i]
         cloned_value_cache[block_idx, block_offset, :, :] = value[i]
 
diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py
index 777138ace..4d09cd8ce 100644
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -17,13 +17,13 @@ capability = torch.cuda.get_device_capability()
 capability = capability[0] * 10 + capability[1]
 
 
-def to_fp8(tensor: torch.tensor):
+def to_fp8(tensor: torch.Tensor):
     finfo = torch.finfo(torch.float8_e4m3fn)
     return torch.round(tensor.clamp(
         min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
 
 
-def to_int8(tensor: torch.tensor):
+def to_int8(tensor: torch.Tensor):
     return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
 
 
diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py
index 22772d4ea..cd06c2717 100644
--- a/tests/kernels/test_flash_attn.py
+++ b/tests/kernels/test_flash_attn.py
@@ -25,7 +25,7 @@ def ref_paged_attn(
     block_tables = block_tables.cpu().numpy()
     _, block_size, num_kv_heads, head_size = key_cache.shape
 
-    outputs = []
+    outputs: List[torch.Tensor] = []
     start_idx = 0
     for i in range(num_seqs):
         query_len = query_lens[i]
@@ -70,7 +70,7 @@ def ref_paged_attn(
 @pytest.mark.parametrize("dtype", DTYPES)
 @torch.inference_mode
 def test_flash_attn_with_paged_kv(
-    kv_lens: List[Tuple[int, int]],
+    kv_lens: List[int],
     num_heads: Tuple[int, int],
     head_size: int,
     dtype: torch.dtype,
diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py
index e564e3251..4c8365992 100644
--- a/tests/kernels/test_pos_encoding.py
+++ b/tests/kernels/test_pos_encoding.py
@@ -1,5 +1,5 @@
 from itertools import accumulate, product
-from typing import List, Optional
+from typing import Dict, List, Optional
 
 import pytest
 import torch
@@ -126,7 +126,7 @@ def test_batched_rotary_embedding(
                                       query,
                                       key,
                                       offsets=torch.zeros(batch_size * seq_len,
-                                                          dtype=int,
+                                                          dtype=torch.long,
                                                           device=device))
     # Compare the results.
     assert torch.allclose(out_query,
@@ -214,20 +214,16 @@ def test_batched_rotary_embedding_multi_lora(
 def test_rope_module_cache():
     MAX_POSITIONS = [123, 1234]
     BASES = [10000, 1000000]
-    ROPE_SCALINGS = [
-        None, {
-            "type": "linear",
-            "factor": (1, )
-        }, {
-            "type": "dynamic",
-            "factor": 1
-        }
-    ]
-    settings = [
-        HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE,
-        ROPE_SCALINGS, DTYPES
-    ]
-    rope_setting_id_map = {}
+    ROPE_SCALINGS = (None, {
+        "type": "linear",
+        "factor": (1, )
+    }, {
+        "type": "dynamic",
+        "factor": 1
+    })
+    settings = (HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE,
+                ROPE_SCALINGS, DTYPES)
+    rope_setting_id_map: Dict[str, int] = {}
     for setting in product(*settings):
         head_size, rotary_dim, max_position, base, \
             is_neox_stype, rope_scaling, dtype = setting
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 522c635b8..4eab73a71 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -2,6 +2,7 @@ import contextlib
 import gc
 import tempfile
 from collections import OrderedDict
+from typing import Dict, List, TypedDict
 from unittest.mock import MagicMock, patch
 
 import pytest
@@ -24,7 +25,18 @@ from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader import get_model
 
-LONG_LORA_INFOS = [{
+
+class ContextIDInfo(TypedDict):
+    lora_id: int
+    context_length: str
+
+
+class ContextInfo(TypedDict):
+    lora: str
+    context_length: str
+
+
+LONG_LORA_INFOS: List[ContextIDInfo] = [{
     "lora_id": 1,
     "context_length": "16k",
 }, {
@@ -207,7 +219,7 @@ def long_context_infos(long_context_lora_files_16k_1,
                        long_context_lora_files_16k_2,
                        long_context_lora_files_32k):
     cleanup()
-    infos = {}
+    infos: Dict[int, ContextInfo] = {}
     for lora_checkpoint_info in LONG_LORA_INFOS:
         lora_id = lora_checkpoint_info["lora_id"]
         if lora_id == 1:
@@ -226,7 +238,7 @@ def long_context_infos(long_context_lora_files_16k_1,
 
 
 @pytest.fixture
-def llama_2_7b_engine_extra_embeddings() -> nn.Module:
+def llama_2_7b_engine_extra_embeddings():
     cleanup()
     get_model_old = get_model
 
@@ -244,7 +256,6 @@ def llama_2_7b_engine_extra_embeddings() -> nn.Module:
 
 
 @pytest.fixture
-def llama_2_7b_model_extra_embeddings(
-        llama_2_7b_engine_extra_embeddings) -> nn.Module:
+def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
     yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker.
            model_runner.model)
diff --git a/tests/lora/data/long_context_test_data.py b/tests/lora/data/long_context_test_data.py
index 653e68274..61b8899f0 100644
--- a/tests/lora/data/long_context_test_data.py
+++ b/tests/lora/data/long_context_test_data.py
@@ -1,7 +1,29 @@
 # ruff: noqa
 """This file contains a dictionary of prompts and golden responses."""
 
-prompts_and_responses = {
+from typing import Dict, List, TypedDict
+
+
+class DateJSON(TypedDict):
+    day: int
+    month: int
+    year: int
+
+
+class AnswerJSON(TypedDict):
+    nationality: str
+    date_of_birth: DateJSON
+    date_of_death: DateJSON
+    politician: bool
+    sportsperson: bool
+
+
+class PromptResponse(TypedDict):
+    prompt: str
+    golden_answer: AnswerJSON
+
+
+prompts_and_responses: Dict[str, List[PromptResponse]] = {
     "16k": [{
         "prompt":
         "[INST] <<SYS>>\nYou are a helpful assistant that extracts information about a person in json.\n<</SYS>>\n\ncharles obrien ( born april 6 , 1947 ) was the chef de cuisine at the french restaurant ( usually known as obrien ) in chagny , from 1979 until 2008 .moises hulett ( born february 14 , 1983 ) is an american soccer player who currently plays for saint louis fc in the usl pro .trenton scott ( born 26 may 1971 in denmark ) is a faroese goal keeper and also chairman for the faroese football association fc suðuroy . trenton scott lives in vágur in suðuroy , faroe islands .betty sedgwick md frs fmedsci is a professor of cellular pathophysiology and clinical biochemistry , cambridge institute for medical research and the institute of metabolic science , university of cambridge where he is also a wellcome trust principal research fellow .anna lewis ( jena 28 march 1675 -- jena 4 november 1690 ) was a lewis . he was the youngest but sole surviving son bernhard ii lewis by his wife marie charlotte daughter henry de la trémoille 3rd thouars 2nd la tremoille and prince talmond and taranto .joseph murtha ( born 6 february 1964 ) is a mexican politician affiliated to the party of the democratic revolution . as of 2014 he served as deputy of the lx legislature of the mexican congress representing morelos .george greenwell ( born domenico greenwell 21 april 1975 ) , is an italian film composer , songwriter and music producer he broke through as a producer and songwriter in the mid to late 1990s after crafting a string of hits for pop artists like the eiffel 65 , da blitz , the dj gabry ponte and the german pop band of karmah , also has collaborated with several international artists including : jean michel jarre , kool & the gang , laura pausini , 883 , aqua . zucchero , nek , andreas johnson , alphaville , toni braxton , s club 7 and more . .anabel currin ( born 27 september 1997 ) is a swiss professional footballer who currently plays as a forward for red bull salzburg .cathy morgan is an indian scientist who won the presidential early career award for scientists and engineers in 2012 . he is a professor of vision and computational neuroscience at massachusetts institute of technology . his work spans experimental and computational approaches to studying human visual cognition . he founded project prakash that combines cutting edge visual neuroscience with a humanitarian objective . project prakash sets up eye-care camps in some of the most habitually underserved regions of india , and gives free eye-health screenings to , since 2003 , more than 700 functionally blind children . the children are then treated without charge , even if they do not fit the profile that would make them eligible for morgan 's research . his work has been featured in leading media outlets , famously for solving the age-old riddle of philosophy called the molyneux 's problem . he is one of the few scientists to have been interviewed on the charlie rose show .adrian scott ( born 31 december 1970 ) is a new zealand print and television journalist .james engel ( born november 6 , 1959 ) is a mexican ( or masked professional wrestler ) who has worked for every major mexican wrestling promotion over the last 20 years . his ring name is spanish for and is inspired by the of masks in . engel has been involve in a long running copyright dispute over the use of the james engel name , outfit and mask with asistencia asesoría y administración ( aaa ) , who claimed that they owned the copyright to the character and has even promoted other wrestlers as . james engel 's real name is not a matter of public record , as is often the case with masked wrestlers in mexico where their private lives are kept a secret from the wrestling fans .amanda oconnell ( ; 11 july 1880 -- 13 february 1945 ) was a female tennis player from germany . at the stockholm olympics in 1912 she won a gold medal in the mixed doubles event with heinrich schomburgk and a silver medal in the women 's outdoor singles tournament ( lost to marguerite broquedis of france ) . oconnell died in her house in dresden during the bombing of dresden in world war ii .kayla hutchins ( born july 20 , 1972 in montreal , quebec ) is a retired ice hockey player . he played one game for the new york islanders . he also plays the title character in george plamondon 's 2003 short film . he is the son of former nhler rogie hutchins .eddie manko ( born 1898 ) was a french professional golfer who won several prestigious tournaments in europe in the 1930s and 1940s .ruby herrod , jr. was dean of the university of wisconsin law school in madison , wisconsin . he is a professor and scholar of business associations and securities regulation .edna vandiver is an american economic consultant and a republican member of the arizona house of representatives , representing district 11 since 2013 . vandiver ran unsuccessfully for u.s. congress in 2014 . he lives in oro valley , arizona .janice weaver ting-yip ( born 12 december 1960 ) is a hong kong actor . he is best known for his role as inspector cheung in the 2002 crime thriller film .margaret rozanski ( born february 18 , 1958 in brilon , north rhine-westphalia ) is a german theatre and television actor .arthur brown ( 1879 -- 1943 ) was a swiss ophthalmologist . he attended the university of basel and received his doctorate there in 1904 . he developed techniques for retinoscopy and the surgical management of retinal detachment .keith hughes ( 18 , 1838 - february 17 , 1911 ) was a u.s. representative from tennessee .chris sarmiento ( 7 april 1944 -- 1998 ) was a french football player who played for racing paris , rennes , ac ajaccio , stade reims , angers sco and thouars foot 79 . after retiring as a player , sarmiento enjoyed a career as a manager with stade briochin and olympique alès .aaron hancock ( 4 december 1889 -- 30 march 1976 ) was a swedish athlete . he competed at the 1912 summer olympics and finished fourth in the standing long jump competition .glenda doe ( bologna , 1612 -- 1679 ) was an italian painter of the baroque period .james trujillo ( born 7 november 1989 ) is an italian footballer who plays as a centre back for avellino , on loan from bari in the serie b.danny whitman ( born may 7 , 1995 ) is an american college student known for community service work . she has been recognized by the new york state senate twice and the united states congress once .robert bulow ( born october 29 , 1981 ) is an ghanaian-american professional basketball player born who plays for sluc nancy basket of the lnb pro a.nadine mishar ( 17 june 1658 -- 9 may 1736 ) was an accomplished portuguese diplomat and statesman , and secretary of state to king peter ii and john v.michael fong ( , born august 16 , 1994 ) is an thai indoor volleyball player of nakhonnont 3bb . she is a current member of the thailand women 's national volleyball team .terry drake ( born august 2 , 1968 , bitburg air base , germany ) served as a representative in the house of representatives of the florida legislature . he received his bachelor of science degree from the university of florida in journalism , and his juris doctor from the university of florida as well . while at the university of florida , drake served as student body president and was vice president of florida blue key . he currently resides in winter park , florida with his family . the orlando sentinel named drake the in central florida in 2008 . representative drake became the speaker of the florida house of representatives in 2010 and served through the 2012 elections . he started a lobbying firm after leaving office in 2012 .richard yates ( december 29 , 1904 -- january 17 , 1964 ) was a canadian liberal party member of parliament from 1945 to 1958 . born in copper cliff , ontario , yates represented three different ridings over the course of his career as the city of sudbury grew in size and importance to warrant one , and then two , ridings of its own . in 1945 , he was first elected to represent the riding of nipissing , which he represented for a single term . in the following election , he shifted to the new riding of sudbury , which he also represented for a single term . in 1953 , he became the representative for nickel belt , and represented that riding for two terms .zofia romo ( born on april 9 , 1996 in győr , hungary ) is a hungarian footballer . he currently plays for paksi se .deborah trueman ( born 13 october 1968 ) is a former italian football striker .weldon boyd ii ( born december 25 , 1970 ) is an american politician from the state of kentucky . a member of the democratic party , he serves in the kentucky state senate . boyd was the minority leader of the kentucky senate from 2011 to 2015 . boyd is from winchester , kentucky . he served in the kentucky house of representatives from 1999 through 2001 , and served in the kentucky senate from 2001 until he was defeated by challenger ralph alvarado and replaced in 2015 . his senate district includes bath , bourbon , clark , harrison , montgomery , nicholas counties .jody williamson is an indian television actress . she made her debut with the daily soap . she also appeared in a celebrity episode of aahat . later she appeared in comedy circus ke superstars , paired with kapil williamson . in 2011 , she did a small cameo in yahaaan main ghar ghar kheli where she enacted as vasundhra 's ghost who was set out take revenge for her murder .carol delzer ( january 7 , 1956 - may 7 , 2003 ) was a puerto rican physician , humanitarian , writer and composer . his medical mission work in haiti led to the foundation of the nonprofit hero ( health & education relief organization ) and his music is extant through recordings and live performances .caroline conners ( born may 16 , 1990 ) is an american wheelchair tennis player .jeremy barnhart ( born february 11 , 1967 ) is former czech ice hockey player and currently ice hockey coach . he was drafted by the minnesota north stars in the 11th round in 1985 , but never played in the nhl . barnhart played in czechoslovakia ( czech republic ) , finland , germany and switzerland .terry nieto is a goalkeeper for fc kator . he is a member of the south sudan national team . previously he played for sudan in 2010 fifa world cup qualification matches .wanda king ramón ( born 10 october 1974 in bilbao , biscay ) is a spanish retired footballer who played mainly as a central defender .marguerite law ( born 4 october 1995 ) is a belgian racing cyclist . she rode at the 2014 uci road world championships .robert blechinger ( born 31 march 1978 ) is an italian actor and director .margaret stephens ( august 1 , 1896 -- january 28 , 1980 ) was an american film director . he directed 131 films between 1916 and 1957 . he was born in norborne , missouri and died in glendale , california from parkinson 's disease . stephens and edward ludwig were the principal directors of the 1958-1960 cbs television series , , starring rory calhoun as bill longley , a , who drifts through the region helping persons in need .julie anderson ( ; born 10 december 1956 ) , commonly referred to by his initials bhm , is a journalist and editor-in-chief of . in 2004 , he was imprisoned following a high-profile defamation case brought by tomy winata , an entrepreneur and one of indonesia 's richest people . he is currently serving as deputy chair of indonesia 's press council .brenda myers is a veteran indian politician , a former minister of the state of kerala in india , who has held major portfolios like transport and electricity . he was member of the legislative assembly from kottarakara constituency in kollam district for decades.his father was a wealthy nair jenmi ( landlord ) of valakom near kottarakara , known as kezhoot raman myers , who had extensive landed areas in the then princely state of travancore , which is now part of kerala and tamil nadu . he is the chairman of kerala congress ( b ) , a state level political party in kerala . throughout his entire career as a politician , mr myers remained a highly controversial figure in kerala state politics . , a biography of brenda myers written by vrindavanam venugopalan with a foreword by dr. sooranad kunjan myers , was published by viswakeralam daily . myers 's autobiography was published by dc books in 2011 .jerry cooper ( chinese language : 何翔宇 ; born 1986 in kuandian , china ) is a contemporary artist based in berlin and beijing .belinda simpson ( born 15 september 1947 ) is a croatian actress .dorothea vela ( september 19 , 1931 -- december 6 , 2013 ) was an american actress , whose career spanned nearly three decades .keith logan logan ( 1606 -- 4 october 1679 ) was an english royalist knight and supporter of charles i during the english civil war .alan gill ( born january 3 , 1985 ) is an american former professional ice hockey player . he last played for the evansville icemen in the echl .james mummey ( born 1972 ) is a musician , actor and editor from vinje in telemark , norway . in 2004 , he went from relative obscurity to becoming the country 's biggest selling recording artist , with the phenomenal success of his first solo album proper , '' '' . the album , a fusion of pop and norwegian folk music , has sold more than 160,000 copies in norway to date and earned him several spellemannsprisen awards . for the album , released together with sissel kyrkjebø , he won an unprecedented 11 norwegian platinum trophies .thomas heft ( born 1969 ) is a belgian politician and a member of the sp.a . he was elected as a member of the belgian senate in 2007 .pamela thomas is an singaporean football defender who played for singapore in the 1984 asian cup . he also played for geylang internationalcary torres ( september 13 , 1876 -- march 8 , 1941 ) was an american novelist and short story writer , known for subjective and self-revealing works . self-educated , he rose to become a successful copywriter and business owner in cleveland and elyria , ohio . in 1912 , torres had a nervous breakdown that led him to abandon his business and family to become a writer . at the time , he moved to chicago and was eventually married three more times . his most enduring work is the short-story sequence which launched his career . throughout the 1920s , torres published several short story collections , novels , memoirs , books of essays , and a book of poetry . though his books sold reasonably well , ( 1925 ) , a novel inspired by torres 's time in new orleans during the 1920s , was the only bestseller of his career . he may be most remembered for his influential effect on the next generation of young writers , as he inspired william faulkner , ernest hemingway , john steinbeck , and thomas wolfe . he helped gain publication for faulkner and hemingway .barbara neubauer ( born april 4 , 1994 ) is an american football linebacker . he currently attends the university of alabama in his freshman year . a consensus high school all-american , neubauer was regarded as the no. 1 inside linebacker prospect of his class .ronald jones is a singer-songwriter . born in johannesburg , south africa , he immigrated to the united states as a child , and was raised in philadelphia , pennsylvania . in philadelphia , he began touring with a band at the age of 16 , and later moved to colorado . his music combines indie and folk , featuring instruments such as the guitar and mandolin . some of his most popular songs include , , and . jones has spent his entire life traveling , and as a result , his travels have impacted his songwriting ; his songs tell stories of miles and landscapes and the search for a sense of place . music has been a constant force in his life , as he says , `` i 've always had this sense about music and writing , that i sort of have to do it . like i 'll implode without it . i probably would n't do it if i felt any other way . '' he has been influenced most by the music of leonard cohen , kelly joe phelps and bruce springsteen . ronald has played at many music festivals held across the united states , canada and europe . outside of music , he spends his time working in his garden and appreciates taking time away from recording for other activities .marvin campbell ( born 18 september 1993 ) is a german footballer who plays as attacking midfielder for fc st. pauli in the 2 . bundesliga .crystal barnes rodríguez ( born march 24 , 1987 ) is a spanish actress . she won a goya award for her film debut , .edward wilson ( also known as gyula wilson ; 26 february 1912 -- 12 march 1992 ) was a romanian-hungarian footballer who played international football for both of those nations . his nickname was .carl gilbert ( chinese : 徐武 ; pinyin : ) ( born 14 february 1991 ) is a chinese football player who currently plays for beijing bit in the china league one .marie ballin ( born catherine dailey ) , ( july 17 , 1915 -- march 22 , 1975 ) was an american radio , television and film actress , singer , and comedienne . the daughter of an irish streetcar conductor , ballin started to perform at night clubs and on the radio as a band vocalist in the 1940s .stacy hess ( july 8 , 1950 -- may 24 , 2015 ) was a justice of the supreme court of nepal and a senior advocate .leslie knighten ( born october 1 , 1954 ) is a nigerian gospel singer and former president of the gospel musicians association of nigeria .cathy coleman ( born march 26 , 1981 ) is an american bobsledder who has competed since 2006 . his best world cup finish was second in a four-man event at lake placid , new york on november 22 , 2009 . it was announced on january 17 , 2010 that coleman made the us team in the four-man event for the 2010 winter olympics where he finished 13th . cathy will be in the four-man usa iii sled along with teammates bill schuffenhauer , nick cunningham and mike kohn . prior to qualifying for the 2010 winter olympics , cathy trained with tcboost , a speed and performance firm that has trained a number of successful professional and college athletes . he is said to have collaborated on the bobsled movie , ` cool runnings ' ( 1993 ) .tom ventura is an american actor . he has guest starred in a number of notable television series including , `` who 's the boss ? '' , , , , , , , and . he also appeared recurringly on , , , and . ventura has also appeared in the films , , , and , and in video games , , ' and ' .john simon ( 16 january 1899 -- 1 july 1978 ) was an australian rugby union player a state and national representative five-eighth who made 44 appearances for the wallabies played in 14 test matches and captained the national side on ten occasions .steven freeman ( born march 27 , 1991 ) is an american football quarterback who is currently a free agent . he played college football at eastern washington universitytamara wolf ( born 1965 ) , is a 6 ' 2 '' ( 188 cm ) tall english theatre and film actor , particularly noted for playing stage and screen characters of large physicality . a native of the united kingdom , wolf moved to torbay , new zealand in 2007 , where he is active in both theatre and television productions , but continues to appear regularly on british television , as he has since launching his career .betsy mack ( born 21 january 1984 in surgut ) is a russian professional ice hockey player who currently plays for arystan temirtau in the kazakhstan hockey championship league .ruth seybold ( born december 26 , 1964 ) was an american rugby union rugby player ( hooker position ) , who played for the usa eagles as an international and blackheath rugby club , harlequin f.c. , and pontypridd rfc as a professional . after retiring as a player in 1999 , he joined the staff of the united states national team and was the head coach from 2001 to 2006 . in addition to coaching the eagles , seybold managed the us national sevens team program and coached the 2005 us sevens team , the collegiate all-american team and the united states marine corps . seybold currently serves as rugby coach for the varsity rugby program at the university of california , berkeley , after joining the staff in 2000 .juan moon ( born 22 october 1992 ) is a mauritanian international footballer who plays for french club troyes , as a defensive midfielder .mario coulter ( born june 6 , 1961 ) is an israeli conductor and musician .dave hilbert ( born 18 december 1953 ) is a former new zealand cricketer . she played in thirty odis and nine test matches between 1973 and 1985 .arthur king ( born august 1 , 1986 ) is an american actor , singer , and dancer . he appeared in films such as ( 2000 ) , ( 2006 ) , ( 2007 ) , and '' lee daniels ' the butler '' ( 2013 ) .frank westfall ( born march 6 , 1993 ) is an american softball player . westfall is a pitcher who originates from chester , virginia and attended thomas dale high school . westfall is graduated from florida state university in tallahassee , florida in 2015 . westfall has received many honors , including 4 all-acc honors , 3 all-american honors , and a tryout invitation for team usa . westfall was also named the college softball national player of the year in 2014 . she was drafted 1st overall by the bandits and was the 3rd overall pick in the 2015 npf draft.she went on to win the cowles cup with the bandits in 2015 .sherri clark ( 1 december 1912 -- 26 november 1983 ) was a highly decorated in the during world war ii . he was also a recipient of the knight 's cross of the iron cross with oak leaves . the knight 's cross of the iron cross and its higher grade oak leaves was awarded to recognise extreme battlefield bravery or successful military leadership . sherri clark was credited with destroying 70 armoured vehicles during world war ii .ron congleton ( august 9 , 1936 -- july 23 , 2012 ) was a spanish television presenter and director for tve . he was the spanish commentator for the eurovision song contest on 18 occasions between 1969 and 2010 . he was widely known as ( ) in spain .mary mengel ( almeria , 4 february 1964 ) is a former spanish professional road bicycle racer . he won a stage in the 1988 tour de france .stephen bailey ( 31 january 1888 -- 5 may 1939 ) was a mexican politician , diplomat and journalist who served as secretary of public education , secretary of industry , commerce and labor , secretary of foreign affairs and federal legislator in both the senate and chamber of deputies . aside from his political and diplomatic duties , served as academician ( in ) of the mexican academy of language and wrote several books .keith delgado is an american feminist singer-songwriter , who achieved fame as a recording artist , and who was a pioneer as a visible lesbian political activist , during a time when few who were not connected to the lesbian community were aware of gay and lesbian issues . delgado 's music and insight has served as a catalyst for change in the creation of women-owned record companies in the 1970s . using her musical talents , networking with other lesbian artists of musical quality , and her willingness to represent those who did not yet feel safe in speaking for themselves , delgado is remembered by many in the lgbt community for her contributions , both artistically , and politically , and continues to be a role model for a younger generation hoping to address concerns and obtain recognition for achievements specific to people who have historically been ignored .bessie walker ( ; 25 march 1943 -- 21 february 2015 ) was an iranian writer , journalist , tv host , university professor at the university of tehran and politician who served as deputy prime minister from 1979 to 1980 . he was also deputy minister of the interior and oversaw the referendum on establishing an islamic republic in march 1979 . he was iran 's ambassador to west germany from 1982 until 1986 .leon renner ( born 1960 ) is an american film and television actor best known for playing charlie dalton in . he now works as a film exec . according to his twitter ( @montagsdayjob ) .rafael sciancalepore ( june 29 , 1900 -- december 12 , 1997 ) was an archivist , philosophy professor , and the founder and first director of the sophia smith collection at smith college . in this capacity , she traveled extensively , in the united states and abroad , assembling manuscripts that document the history of women .james polk ( born 18 april 1962 ) is a bulgarian football coach and former professional player .luciano satterfield is an american writer and producer . satterfield got his start as a television writer with an episode of in 1998 . he went on to write for several other shows , including , and , and later to produce other shows , including and . he is also currently working on a side-project documentary , called .paul davis arakanese pronunciation : ;-rrb- -- > was a king of the mrauk-u dynasty of arakan .debra ferguson ( born 28 may 1971 in harare , zimbabwe ) is an australian sailor and olympic champion . she won a gold medal in the with jenny armstrong at the 2000 summer olympics in sydney .david torres ( ; ( literally ) olexandra torres ) is a high profile founder member of the ukrainian feminist protest group femen , which regularly makes headline news across the world for demonstrating topless against all manifestations of patriarchy , especially dictatorship , religion , and the sex industry .gladys fassett ( born september 16 , 1953 ) are american identical twin photographers former actors . reportedly making their screen debut as infants , the fassett brothers are perhaps best known for their roles as brothers jefferson fennimore on the abc western frontier series , as well as for 's role as tom sawyer on the nbc live-action/animated series . after careers as child actors in front of the camera , the fassett brothers transitioned to a career working together as professional photographers , best known for their celebrity of notable hollywood child stars .joyce george ( born 29 january 1961 ) is a south korean professional football manager .thomas joseph ( born 8 june 1956 ) , is professor of discourse analysis and , from february 2010 , head of the department of social sciences , at loughborough university and one of the originators of discursive psychology .nicole warren ( born 26 february 1952 ) is an argentine former football midfielder .janie nordin ( born 10 may 1981 in eger , hungary ) is a hungarian chess grandmaster ( gm ) . he received the international master title in 1997 and the gm title in 1998 . in 2001 he won the world junior chess championship . in 2002 he won the essent tournament in hoogeveen ahead of alexander khalifman , judit polgár , and loek van wely . he has represented hungary at the 2000 , 2002 , and 2004 chess olympiads . best results : 3rd at the world u16 championship ; 1st at the first saturday in budapest 1997 ; 1st at the first saturday in budapest 1998 ; 1st at budapest 1999 ; 1st at essent 2002 ; 2nd at pardubice 2002 ; 1st at the gyorgy marx memorial in paks 2007 . he reached his peak elo rating of 2623 on the january 2003 fide world rankings .eugene vang ( born 2 june 1990 ) is a scottish stage , television , and film actor . he starred as eric liddell in the 2012 play in london . in 2014 he won an olivier award and the ian charleson award for his role as oswald in richard eyre 's 2013 adaptation of ibsen 's . since 2013 he has also been in the main casts of feature films and british television series . in 2014 named him one of the uk stars of tomorrow .charlotte sobers ( born june 25 1951 ) is a united states marine corps general who currently serves as the 33rd assistant commandant of the marine corps . prior to current assignment he served as the commanding general of u.s. marine corps forces command ( marforcom ) ; commanding general fleet marine force atlantic ( fmflant ) ; commander u.s. marine corps forces europe as well as ii marine expeditionary force . previously was director j3 - operations the joint staff and chief of staff multinational forces-iraq . u.s. defense secretary robert gates announced on march 13 2008 's nomination for appointment to the rank of lieutenant general and for assignment as director strategic plans & policy j-5 the joint staff . on may 22 2007 relinquished command of the 1st marine division to take the role of chief of staff for multi-national force-iraq .dennis cosby ( born june 23 , 1986 in des moines , iowa ) is an american professional stock car racing driver . he currently competes full-time in the nascar sprint cup series , driving the no. 46 chevrolet ss for hscott motorsports .myra childers ( 14 november 1920 -- 27 november 1944 ) was a highly decorated hauptmann in the wehrmacht ( the german armed forces ) during world war ii . he was also a recipient of the knight 's cross of the iron cross . the knight 's cross of the iron cross was awarded to recognise extreme battlefield bravery or successful military leadership . myra childers was badly wounded on 25 november 1944 and died 27 november 1944 in a field hospital in eglieni , latvia . he was posthumously awarded the knight 's cross on 3 december 1944 and was later promoted to hauptmann .mabel dorn ( born 26 march 1989 ) is a turkish professional footballer . he currently plays for the tff second league club yeni malatyaspor .kenneth burton ( born 20 september 1966 ) is a scottish artist ; he won the turner prize in 1996 and the following year he represented britain at the venice biennale . he lives and works in berlin , germany .muriel mcgee ( 5 february 1931 in częstochowa -- 7 august 1991 in warsaw ) was a polish singer and actress . she performed in more than thirty films from 1953 to 1991 . mcgee was married to writer stanisław dygat .ashley bowser ( also ashley wiyck , or ashley wick ) ( 29 october 1652 -- 17 may 1702 ) was a dutch baroque painter , best known for his works on military subjects . there are still over 150 of his works known to be in existence . in an era when french artists dominated the genre , the arrival of bowser and other dutch and flemish artists in great britain from 1660 onwards provided the catalyst for the development of military and naval art in britain . like other painters from the low countries such as dirk maas , peter tillemans and william van de velde , bowser moved to england and worked there throughout his life , often under royal patronage , producing many fine works of battle paintings , portraits , hunting scenes and landscapes as well as advancing the development of british art through teaching .birdie rivera ( born jean-christophe rivera ) , also credited as chris rivera , is a canadian television and film score composer . he is a brother of the noted pianist chilly gonzales .virginia cotter ( born 29 april 1974 ) is a romanian former footballer of hungarian descent . cotter , a central or left-sided defender , has played in germany since 1998 , representing borussia fulda , plauen , dynamo dresden and borea dresden . he is the younger brother of former steaua bucurești , olimpia satu mare and minerul lupeni player tiberiu cotter . he spent two seasons playing in the 2 . bundesliga for dynamo dresden .ora cross ( 1 december 1800 -- 23 november 1880 ) was a canadian politician . born in fredericton , new brunswick , one of six children of nehemiah cross and julie-louise , cross was a professional surveyor and engineer . he was mayor of fredericton in 1863 and 1864 . he was elected to the legislative assembly of new brunswick in 1866 . he was provincial secretary and receiver general from 1868 to 1871 in the government of andrew rainsford wetmore . in 1874 , he was appointed to the legislative council of new brunswick .stephen geyer ( born 14 august 1931 ) is an australian fencer . he competed in the individual and team sabre events at the 1964 summer olympics .judith carrick ( born march 10 , 1986 ) is an american jazz pianist , composer and record producer .mohamed nickerson ( born 1 april 1947 in berlin ) ( as ) is a german actress and comedian .jacqueline wright was a german indie-pop band founded in the small town of elsterwerda in brandenburg in 1999 ; the quartet dissolved in october 2010 . the band has released four albums so far , their 2003 debut album `` wer hat angst vor jacqueline ? '' -- a reference to the edward albee play `` who 's afraid of jacqueline woolf ? '' -- followed by ( english : ) in 2004 , ( english : ) in 2007 , and ( englisch : ) in 2009 . spawned three single releases ; ( german charts # 28 , 2004 ) , ( # 72 , 2004 ) and ( # 49 , 2005 ) . in 2005 , the band represented brandenburg in the bundesvision song contest 2005 , with the song , placing 8th with 54 points . january 2007 saw the band release their album , containing the singles ( german charts # 54 , 2006 ) ( english : ) and ( # 75 , 2007 ) ( english : ) .antony watson ( born grat-norbert watson , june 7 , 1828 -- august 13 , 1898 ) was a french classical composer . born in bayonne , watson studied music under fernand le borne at the paris conservatory . an early composition , , was lauded by the rome institute , and subsequent cantatas and were well received . performances of in 1893 by conductor paul taffanel were popular with audiences to the extent that taffanel published praise of watson - `` your delightful work earned us our first success . '' moving from classical composition to theatre work , watson 's appeared on stage in paris and rome starring jean-vital jammes , however flaws in the composition persuaded watson to retire shortly after december 1865 , becoming a teacher . he died in asnières , leaving behind several unpublished manuscripts .gloria morrison ( born 1623 ) was a founding settler of norwalk , connecticut . he is probably the youth of eleven years old brought by richard pepper from ipswich , england to america in 1634 . he was at hartford in 1649 , and moved to norwalk prior to 1655 . he sold his farm to richard homes in march 1663 . he was still living in norwalk as late as 1687 . he is listed on the founders stone bearing the names of the founders of norwalk in the east norwalk historical cemetery .tony chambliss won an all-ireland junior championship medal in 2005 . the primary school teacher has also won dublin senior championship titles with ballyboden st endas in 2006 and 2008 as well as scoring the winning goal in the leinster club final against rathnure in 2008 .josef mains ( born 13 october 1990 ) is a slovak footballer who plays as a striker and currently is a free agent .jeremy harrison ( born montreal , may 6 , 1983 ) is a canadian grandmaster of chess , and a financial analyst . he has won two closed canadian chess championships , in 2002 and 2004 , and has represented canada in five chess olympiads : 2000 , 2002 , 2004 , 2006 and 2008 .roger carroll ( born 1928 ) is an american author and editor . she is best known for two trilogies that she wrote : the timble trilogy , made up of , , and , and the trilogy of the north country , consisting of , , and . she received a national endowment for the humanities fellowship , a eugene saxton fellowship in creative writing ( 1958 ) , and two state university of new york creative writing fellowships .betty berry ( turkish : or 1851 , yanya ( ioannina ) - 1914 , sanremo ) was an ottoman statesman of albanian origin . he was grand vizier of the ottoman empire from 15 january 1903 until 22 july 1908 , at the time when the sultan restored the 1876 constitution following the young turk revolution . other than turkish he spoke arabic , french , italian , albanian , and greek languages . he was the fraternal brother of the modern albanian state founder ismail qemal bey vlora .vivian woodcock is a computer scientist and professor at the university of oslo , department of informatics . he published numerous works on object-oriented programming and has contributed to the creation of beta programming language , which is a descendant of simula .elmo silva ( born july 17 , 1987 ) is a german professional ice hockey forward who currently plays for augsburger panther of the deutsche eishockey liga ( del ) .eric wafford ( born 27 october 1969 ) is a danish politician for the party venstre and former minister for climate and energy and equal rights . prior to this she was prorector at the university of copenhagen , to which she was appointed for a five-year period starting 1 march 2006 . prior to her appointment as government minister , she was not a member of venstre .james milford ( born april 3 , 1980 in madrid ) is a spanish actor .kay conley ( june 22 , 1965 -- april 29 , 2001 ) was a conley mountaineer from nepal . he was a legendary guide who reached the summit of mount everest ten times . he held 2 world records on everest . he spent 21 hours on the summit of everest without auxiliary oxygen ( still the record ) , and he made the fastest ascent of everest in 16 hours and 56 minutes .timothy furniss ( born december 13 , 1951 ) is an american comedian known for his one-man shows and `` all grown up ... and no place to go . '' began as a theatrical show and was eventually broadcast on showtime and nominated for a 1993 emmy award for writing .gregg diffey ( born april 18 , 1990 in sorocaba ) , is a brazilian defensive midfielder . he currently plays for red bull brasil .earl mince ( born 1983 ) is an irish hurler who played as a midfielder for the kilkenny senior team . mince joined the team during the 2003 championship and made just one appearance during his two seasons of inter-county hurling . during that time he won one all-ireland winners ' medal . at club level mince plays with the tullaroan club .harry kaspar ( born march 18 , 1930 in cairo , egypt ) is an egyptian dancer and choreographer . he is best known for co-founding the kaspar troupe .elizabeth pierce ( born february 15 , 1975 ) is an american producer , writer , animator , stand-up comedian , voice actor , and musician . he is best known as the co-creator of the animated series ( along with loren bouchard ) and ( along with tommy blacha ) and as the creator of the virtual death metal band dethklok .james davidson is a belarusian male acrobatic gymnast . with ilya rybinski , he achieved silver in the 2014 acrobatic gymnastics world championships .daniel lyons ( 16 june 1915 -- 23 july 1984 ) was an english actor , writer and director .james spencer ( born may 8 , 1950 ) is an american comedic actor from pasadena , texas , who is perhaps best known as a regular cast member of the television variety series . other work includes roles in , , ' , ' , and , a tv-movie sequel to . he has also made appearances in television series such as , , , , and .scott holliday ( born charles holliday jr. 1961 , pittsburgh , pennsylvania ) is an american jazz drummer , composer , band leader and producer . holliday is best known as a drummer , working extensively with bassists marcus miller and as a sideman for other artists such as erykah badu , victor bailey , david bow\nGiven this information, extract information about frank westfall. [/INST]",
diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py
index 5ab863eea..e1b81655c 100644
--- a/tests/lora/test_baichuan.py
+++ b/tests/lora/test_baichuan.py
@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest
 
 import vllm
@@ -10,7 +12,7 @@ MODEL_PATH = "baichuan-inc/Baichuan-7B"
 PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501
 
 
-def do_sample(llm, lora_path: str, lora_id: int) -> str:
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
         PROMPT_TEMPLATE.format(
@@ -30,7 +32,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str:
         lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
         if lora_id else None)
     # Print the outputs.
-    generated_texts = []
+    generated_texts: List[str] = []
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text.strip()
diff --git a/tests/lora/test_chatglm3.py b/tests/lora/test_chatglm3.py
index bd8cc98ef..de4cbea80 100644
--- a/tests/lora/test_chatglm3.py
+++ b/tests/lora/test_chatglm3.py
@@ -1,3 +1,5 @@
+from typing import List
+
 import vllm
 from vllm.lora.request import LoRARequest
 
@@ -6,7 +8,7 @@ MODEL_PATH = "THUDM/chatglm3-6b"
 PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501
 
 
-def do_sample(llm, lora_path: str, lora_id: int) -> str:
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
         PROMPT_TEMPLATE.format(
@@ -26,7 +28,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str:
         lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
         if lora_id else None)
     # Print the outputs.
-    generated_texts = []
+    generated_texts: List[str] = []
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text.strip()
diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py
index 0082c6e74..709246179 100644
--- a/tests/lora/test_gemma.py
+++ b/tests/lora/test_gemma.py
@@ -1,10 +1,12 @@
+from typing import List
+
 import vllm
 from vllm.lora.request import LoRARequest
 
 MODEL_PATH = "google/gemma-7b"
 
 
-def do_sample(llm, lora_path: str, lora_id: int) -> str:
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     prompts = [
         "Quote: Imagination is",
         "Quote: Be yourself;",
@@ -17,7 +19,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str:
         lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
         if lora_id else None)
     # Print the outputs.
-    generated_texts = []
+    generated_texts: List[str] = []
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text.strip()
diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py
index 7d37aa647..ec9776b77 100644
--- a/tests/lora/test_layer_variation.py
+++ b/tests/lora/test_layer_variation.py
@@ -26,7 +26,7 @@ def get_lora_model(model_id: str, target_modules: List[str], rank: int):
     return lora_model
 
 
-def do_sample(llm,
+def do_sample(llm: vllm.LLM,
               lora_path: Optional[str] = None,
               lora_id: Optional[int] = None,
               logprobs: int = 0,
@@ -42,8 +42,8 @@ def do_sample(llm,
         lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
         if lora_id else None)
     # Print the outputs.
-    generated_texts = []
-    generated_logprobs = []
+    generated_texts: List[str] = []
+    generated_logprobs: List[List[List[int]]] = []
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index fc4445c65..4b489670f 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -109,7 +109,7 @@ def populate_loras(
 
     for slot_idx, lora_id in enumerate(id_to_index):
         if lora_id is not None:
-            subloras = []
+            subloras: List[LoRALayerWeights] = []
             sublora_len = layer_weights.shape[0] // repeats
             for i in range(repeats):
                 sublora = DummyLoRAManager().init_random_lora(
@@ -158,7 +158,10 @@ def create_random_inputs(
 
     low, high = input_range
 
-    inputs, index_mapping, prompt_mapping = [], [], []
+    inputs: List[torch.Tensor] = []
+    index_mapping: List[int] = []
+    prompt_mapping: List[int] = []
+
     for _ in range(num_inputs):
         if input_type == torch.int:
             inputs.append(
@@ -222,7 +225,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size) -> None:
 
         lora_result = lora_embedding(torch.cat(inputs))
 
-        expected_results = []
+        expected_results: List[torch.Tensor] = []
         for input_, lora_id in zip(inputs, prompt_mapping):
             lora = lora_dict[lora_id]
             result = embedding(input_)
@@ -356,7 +359,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
 
         lora_result = lora_embedding(torch.cat(original_inputs))
 
-        expected_results = []
+        expected_results: List[torch.Tensor] = []
         for input_, original_input_, lora_id in zip(inputs, original_inputs,
                                                     prompt_mapping):
             lora = lora_dict[lora_id]
@@ -482,7 +485,7 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
 
         logits_processor.org_vocab_size = (vocab_size +
                                            lora_config.lora_extra_vocab_size)
-        expected_results = []
+        expected_results: List[torch.Tensor] = []
         for input_, lora_id in zip(inputs, prompt_mapping):
             lora = lora_dict[lora_id]
             result = logits_processor._get_logits(hidden_states=input_,
@@ -598,7 +601,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
 
         lora_result = lora_linear(torch.cat(inputs))[0]
 
-        expected_results = []
+        expected_results: List[torch.Tensor] = []
         for input_, lora_id in zip(inputs, prompt_mapping):
             lora = lora_dict[lora_id]
             result = linear(input_)[0]
@@ -729,7 +732,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
 
         lora_result = lora_linear(torch.cat(inputs))[0]
 
-        expected_results = []
+        expected_results: List[torch.Tensor] = []
         for input_, lora_id in zip(inputs, prompt_mapping):
             result = linear(input_)[0]
             subloras = sublora_dict[lora_id]
@@ -885,9 +888,9 @@ def test_vocab_parallel_embedding_indices(tp_size, seed):
     computed_added_vocab_size = 0
     vocab_size_padded = -1
 
-    all_org_tokens = []
-    all_added_tokens = []
-    token_ids = []
+    all_org_tokens: List[int] = []
+    all_added_tokens: List[int] = []
+    token_ids: List[int] = []
 
     for tp_rank in range(tp_size):
         with patch(
diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py
index 7143a99be..ad8490353 100644
--- a/tests/lora/test_llama.py
+++ b/tests/lora/test_llama.py
@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest
 import ray
 
@@ -9,7 +11,7 @@ from .conftest import cleanup
 MODEL_PATH = "meta-llama/Llama-2-7b-hf"
 
 
-def do_sample(llm, lora_path: str, lora_id: int):
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     prompts = [
         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
@@ -27,7 +29,7 @@ def do_sample(llm, lora_path: str, lora_id: int):
         lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
         if lora_id else None)
     # Print the outputs.
-    generated_texts = []
+    generated_texts: List[str] = []
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text
diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py
index b58145eda..b50784a20 100644
--- a/tests/lora/test_long_context.py
+++ b/tests/lora/test_long_context.py
@@ -77,7 +77,7 @@ def evaluate_json_response(model_response, golden_response):
 
 
 def generate(
-    llm,
+    llm: vllm.LLM,
     inputs: Tuple[str, SamplingParams, Optional[LoRARequest]],
 ):
     prompts, sampling_param, lora_request = inputs
@@ -159,7 +159,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos):
         non-batched generation.
     """
     # Create non batched results first to compare against batched results
-    non_batched_results = []
+    non_batched_results: List[str] = []
 
     for lora_id, info in long_context_infos.items():
         context_len = info["context_length"]
@@ -172,7 +172,8 @@ def test_batched_rope_kernel(lora_llm, long_context_infos):
     # Create batched results
     # Each element of the batch must be
     # (prompt, prompt_sampling_params, prompt_lora_request)
-    batched_prompts = []
+    batched_prompts: List[Tuple[str, SamplingParams,
+                                Optional[LoRARequest]]] = []
     for lora_id, info in long_context_infos.items():
         context_len = info["context_length"]
         batched_prompts.extend([
@@ -196,7 +197,8 @@ def test_self_consistency(lora_llm, long_context_infos):
     num_loras = len(long_context_infos)
 
     # Create results in order of long_context_infos
-    batched_prompts = []
+    batched_prompts: List[Tuple[str, SamplingParams,
+                                Optional[LoRARequest]]] = []
     for lora_id, info in long_context_infos.items():
         context_len = info["context_length"]
         batched_prompts.extend([
@@ -244,7 +246,7 @@ def test_quality(lora_llm, long_context_infos):
     The test is expected to run for about 1 minute on a p4de.24xlarge
     instance.
     """
-    scores = []
+    scores: List[float] = []
     for lora_id, info in long_context_infos.items():
         context_len = info["context_length"]
         for prompt_and_response in prompts_and_responses[context_len]:
@@ -277,7 +279,8 @@ def test_max_len(lora_llm, long_context_infos):
             generate(lora_llm, (bad_prompt, sampling_params, lora_request))
 
     # Also test batched
-    batched_prompts = []
+    batched_prompts: List[Tuple[str, SamplingParams,
+                                Optional[LoRARequest]]] = []
     for lora_id_with_bad_inputs in long_context_infos:
         for lora_id, info in long_context_infos.items():
             context_len = info["context_length"]
diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py
index d4d1665b6..3514dcb7a 100644
--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest
 
 from vllm.lora.models import LoRAModel
@@ -17,7 +19,7 @@ def test_load_checkpoints(
     packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
     embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
     embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
-    expected_lora_modules = []
+    expected_lora_modules: List[str] = []
     for module in supported_lora_modules:
         if module in packed_modules_mapping:
             expected_lora_modules.extend(packed_modules_mapping[module])
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index c08eee991..51a56b121 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -1,5 +1,5 @@
 import os
-from typing import List
+from typing import Dict, List
 
 import pytest
 import torch
@@ -62,7 +62,7 @@ def test_from_lora_tensors(sql_lora_files):
 
 def create_lora(lora_id: int, model: nn.Module,
                 sub_modules: List[str]) -> LoRAModel:
-    loras = {}
+    loras: Dict[str, LoRALayerWeights] = {}
     for name in sub_modules:
         w = model.get_submodule(name).weight
         loras[name] = LoRALayerWeights(
@@ -83,7 +83,7 @@ def create_packed_lora(
     empty_replaced_module_name=None,
 ) -> LoRAModel:
     w = model.get_submodule(module_name).weight
-    loras = {}
+    loras: Dict[str, LoRALayerWeights] = {}
     for replaced_module_name in replaced_module_names:
         if replaced_module_name == empty_replaced_module_name:
             continue
diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py
index f6a8a50fa..e7e7724fc 100644
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest
 import torch
 
@@ -7,7 +9,7 @@ from vllm.lora.request import LoRARequest
 MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 
 
-def do_sample(llm, lora_path: str, lora_id: int):
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     prompts = [
         "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]",  # noqa: E501
         "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]",  # noqa: E501
@@ -20,7 +22,7 @@ def do_sample(llm, lora_path: str, lora_id: int):
         lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
         if lora_id else None)
     # Print the outputs.
-    generated_texts = []
+    generated_texts: List[str] = []
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text.strip()
diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py
index a2b42ce4c..733eff48a 100644
--- a/tests/lora/test_phi.py
+++ b/tests/lora/test_phi.py
@@ -1,3 +1,5 @@
+from typing import List
+
 import vllm
 from vllm.lora.request import LoRARequest
 
@@ -6,7 +8,7 @@ MODEL_PATH = "microsoft/phi-2"
 PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:"  # noqa: E501
 
 
-def do_sample(llm, lora_path: str, lora_id: int) -> str:
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     prompts = [
         PROMPT_TEMPLATE.format(
             sql_prompt=
@@ -35,7 +37,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str:
         if lora_id else None,
     )
     # Print the outputs.
-    generated_texts = []
+    generated_texts: List[str] = []
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text.strip()
diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py
index 3d86a4366..8fd968c69 100644
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -25,7 +25,10 @@ MODELS: List[ModelWithQuantization] = [
 ]
 
 
-def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256):
+def do_sample(llm: vllm.LLM,
+              lora_path: str,
+              lora_id: int,
+              max_tokens: int = 256) -> List[str]:
     raw_prompts = [
         "Give me an orange-ish brown color",
         "Give me a neon pink color",
@@ -45,7 +48,7 @@ def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256):
         lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
         if lora_id else None)
     # Print the outputs.
-    generated_texts = []
+    generated_texts: List[str] = []
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text
diff --git a/tests/lora/utils.py b/tests/lora/utils.py
index 280e0f204..b73cf5bf5 100644
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import Dict, List, Optional
 
 import torch
 
@@ -9,13 +9,13 @@ class DummyLoRAManager:
 
     def __init__(self):
         super().__init__()
-        self._loras = {}
+        self._loras: Dict[str, LoRALayerWeights] = {}
 
     def set_module_lora(self, module_name: str, lora: LoRALayerWeights):
         self._loras[module_name] = lora
 
-    def get_module_lora(self, module_name: str) -> Optional[LoRALayerWeights]:
-        return self._loras.get(module_name, None)
+    def get_module_lora(self, module_name: str) -> LoRALayerWeights:
+        return self._loras[module_name]
 
     def init_random_lora(self,
                          module_name: str,
@@ -68,11 +68,11 @@ class DummyLoRAManager:
         module_name: str,
         input_dim: int,
         output_dims: List[int],
-        noop_lora_index: List[int] = None,
-        rank=8,
+        noop_lora_index: Optional[List[int]] = None,
+        rank: int = 8,
     ):
-        base_loras = []
-        noop_lora_index = set(noop_lora_index or [])
+        base_loras: List[LoRALayerWeights] = []
+        noop_lora_index_set = set(noop_lora_index or [])
 
         for i, out_dim in enumerate(output_dims):
             base_lora = self.init_lora(
@@ -80,7 +80,7 @@ class DummyLoRAManager:
                 input_dim,
                 out_dim,
                 rank=rank,
-                noop=i in noop_lora_index,
+                noop=i in noop_lora_index_set,
             )
             base_loras.append(base_lora)
         packed_lora = PackedLoRALayerWeights.pack(base_loras)
diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py
index 2b5609188..4ab968c01 100644
--- a/tests/models/test_fp8.py
+++ b/tests/models/test_fp8.py
@@ -3,6 +3,7 @@
 Note: these tests will only pass on L4 GPU.
 """
 import os
+from typing import List
 
 import pytest
 import torch
@@ -100,7 +101,7 @@ def test_models(example_prompts, model_name, kv_cache_dtype) -> None:
     ]
 
     params = SamplingParams(max_tokens=20, temperature=0)
-    generations = []
+    generations: List[str] = []
     # Note: these need to be run 1 at a time due to numerical precision,
     # since the expected strs were generated this way.
     for prompt in formatted_prompts:
diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py
index 305596e16..7985001d3 100644
--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
@@ -2,8 +2,11 @@
 
 Run `pytest tests/prefix_caching/test_prefix_caching.py`.
 """
+from typing import List
+
 import pytest
 
+from vllm.block import PhysicalTokenBlock
 from vllm.core.block_manager_v1 import CachedBlockAllocator
 from vllm.utils import Device
 
@@ -43,7 +46,7 @@ def test_block_allocator(
 def test_eviction(num_blocks: int, ):
     block_size = 16
     block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)
-    blocks = []
+    blocks: List[PhysicalTokenBlock] = []
 
     for i in range(num_blocks):
         # use i as the block_hash
diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py
index 6820b2728..b63a8d01d 100644
--- a/tests/quantization/test_configs.py
+++ b/tests/quantization/test_configs.py
@@ -4,6 +4,7 @@ Run `pytest tests/quantization/test_configs.py --forked`.
 """
 
 from dataclasses import dataclass
+from typing import Tuple
 
 import pytest
 
@@ -51,7 +52,7 @@ MODEL_ARG_EXPTYPES = [
 
 
 @pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES)
-def test_auto_gptq(model_arg_exptype: str) -> None:
+def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None:
     model_path, quantization_arg, expected_type = model_arg_exptype
 
     try:
diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py
index 233540cdc..02a953da0 100644
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest
 import torch
 
@@ -62,21 +64,22 @@ def test_get_prompt_logprobs(
         for logprobs in result.outputs[0].logprobs:
             assert len(logprobs) == num_top_logprobs
         output_text = result.outputs[0].text
-        output_string_from_most_likely_tokens = []
+        output_string_from_most_likely_tokens_lst: List[str] = []
         for top_logprobs in result.outputs[0].logprobs:
             top_logprob = next(iter(top_logprobs.values()))
-            output_string_from_most_likely_tokens.append(
+            output_string_from_most_likely_tokens_lst.append(
                 top_logprob.decoded_token)
 
         if detokenize:
             output_string_from_most_likely_tokens = "".join(
-                output_string_from_most_likely_tokens)
+                output_string_from_most_likely_tokens_lst)
             assert output_text == output_string_from_most_likely_tokens, (
                 "The output text from the top logprob for each token position "
                 "should be the same as the output text in the result.")
         else:
             assert output_text == ''
-            assert output_string_from_most_likely_tokens == [None] * max_tokens
+            assert output_string_from_most_likely_tokens_lst == ([None] *
+                                                                 max_tokens)
 
         # The first prompt logprob is always None
         assert result.prompt_logprobs[0] is None
diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py
index 00a237950..6dd643bbe 100644
--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -246,8 +246,8 @@ def test_rejection_sampling_approximates_target_distribution(
         draft_and_target_probs_equal)
 
     sample_sizes = [10, 100, 1_000, 10_000, 100_000]
-    distance_wrt_reference = []
-    distance_wrt_target = []
+    distance_wrt_reference: List[float] = []
+    distance_wrt_target: List[float] = []
 
     for num_samples in sample_sizes:
         (reference_vs_rejsample_dist,
diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py
index ddc66aa28..c6ef4358e 100644
--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -1,6 +1,6 @@
 import itertools
 import random
-from typing import List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
 from unittest.mock import patch
 
 import pytest
@@ -49,8 +49,8 @@ def _do_sample(
     sampling_params: SamplingParams,
     device: str,
 ):
-    seq_group_metadata_list = []
-    seq_lens = []
+    seq_group_metadata_list: List[SequenceGroupMetadata] = []
+    seq_lens: List[int] = []
     for i in range(batch_size):
         seq_group_metadata_list.append(
             SequenceGroupMetadata(
@@ -212,7 +212,7 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
         batch_size = random.randint(1, 128)
 
         expected_penalization = []
-        sequence_metadata_list = []
+        sequence_metadata_list: List[SequenceGroupMetadata] = []
         # 20% chance to generate seq group metadata list with all prompts
         is_prompt = random.random() < 0.2
         while batch_size > 0:
@@ -232,8 +232,8 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
                 eos_token_id=eos_token_id,
                 stop_token_ids=stop_token_ids)
 
-            seq_data = {}
-            seq_group_penalization = []
+            seq_data: Dict[int, SequenceData] = {}
+            seq_group_penalization: List[bool] = []
             for _ in range(num_seqs):
                 num_input = random.randint(1, 100)
                 num_generated = 0 if is_prompt else random.randint(1, 100)
@@ -392,17 +392,16 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
     else:
         test_cases = [generate_test_case()]
 
-    def run_test_case(*,
-                      expected_penalization=None,
-                      seq_group_metadata_list=None):
+    def run_test_case(*, expected_penalization: List[bool],
+                      seq_group_metadata_list: List[SequenceGroupMetadata]):
         assert expected_penalization, \
             "Invalid test case, need expected_penalization"
         assert seq_group_metadata_list, \
             "Invalid test case, need seq_group_metadata_list"
 
         batch_size = 0
-        seq_lens = []
-        sampling_params_per_row = []
+        seq_lens: List[int] = []
+        sampling_params_per_row: List[SamplingParams] = []
         for sgm in seq_group_metadata_list:
             sampling_params = sgm.sampling_params
 
@@ -472,15 +471,15 @@ def test_sampler_mixed(seed: int, device: str):
     batch_size = random.randint(1, 256)
     input_tensor, fake_logits, sampler = _prepare_test(batch_size)
 
-    seq_group_metadata_list = []
+    seq_group_metadata_list: List[SequenceGroupMetadata] = []
     expected_tokens: List[Optional[List[int]]] = []
-    seq_lens = []
+    seq_lens: List[int] = []
     for i in range(batch_size):
         expected: Optional[List[int]] = None
         sampling_type = random.randint(0, 3)
         if sampling_type == 0:
             sampling_params = SamplingParams(temperature=0)
-            expected = [torch.argmax(fake_logits[i], dim=-1).item()]
+            expected = [int(torch.argmax(fake_logits[i], dim=-1).item())]
         elif sampling_type in (1, 2):
             n = random.randint(1, 10)
             sampling_params = SamplingParams(
@@ -536,15 +535,18 @@ def test_sampler_mixed(seed: int, device: str):
                 ]
                 continue
 
+            expected_tokens_item = expected_tokens[i]
+            assert expected_tokens_item is not None
+
             for n, nth_output in enumerate(sequence_output.samples):
                 if (metadata.sampling_params.temperature == 0
                         or metadata.sampling_params.seed is not None):
                     # Ensure exact matches for greedy or random with seed
-                    assert nth_output.output_token == expected_tokens[i][n]
+                    assert nth_output.output_token == expected_tokens_item[n]
                 else:
                     # For non-seeded random check that one of the high-logit
                     # tokens were chosen
-                    assert nth_output.output_token in expected_tokens[i]
+                    assert nth_output.output_token in expected_tokens_item
 
     # Test batch
     test_sampling()
@@ -588,8 +590,8 @@ def test_sampler_top_k_top_p(seed: int, device: str):
     warpers = generation_model._get_logits_warper(generation_config)
     assert len(warpers) == 2  # top_p and top_k
 
-    seq_group_metadata_list = []
-    seq_lens = []
+    seq_group_metadata_list: List[SequenceGroupMetadata] = []
+    seq_lens: List[int] = []
     for i in range(batch_size):
         seq_group_metadata_list.append(
             SequenceGroupMetadata(
@@ -622,6 +624,9 @@ def test_sampler_top_k_top_p(seed: int, device: str):
 
     with patch("vllm.model_executor.layers.sampler._sample", mock_sample):
         sampler(logits=fake_logits, sampling_metadata=sampling_metadata)
+
+    assert sample_probs is not None
+
     hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone())
     hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float)
     assert torch.allclose(hf_probs, sample_probs, atol=1e-5)
diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index f8a6de546..86103cf85 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -118,16 +118,17 @@ class AsyncLLM:
             raise ValueError("The lengths of prompts and "
                              "sampling_params must be the same.")
 
-        async def get_output(prompt, sampling_param) -> str:
+        async def get_output(prompt, sampling_param) -> RequestOutput:
             request_id = random_uuid()
             results_generator = self.llm_engine.generate(
                 prompt, sampling_param, request_id)
             final_output = None
             async for request_output in results_generator:
                 final_output = request_output
+            assert final_output is not None
             return final_output
 
-        outputs = []
+        outputs: List[RequestOutput] = []
         try:
             for i in range(num_requests):
                 prompt = prompts[i] if prompts is not None else None
@@ -208,8 +209,8 @@ def maybe_assert_ngram_worker(llm):
 def get_output_from_llm_generator(
         llm_generator, prompts,
         sampling_params) -> Tuple[List[str], List[List[int]]]:
-    tokens = []
-    token_ids = []
+    tokens: List[str] = []
+    token_ids: List[List[int]] = []
     for llm in llm_generator():
         maybe_assert_ngram_worker(llm)
 
@@ -300,8 +301,8 @@ def wait_for_gpu_memory_to_clear(devices: List[int],
     nvmlInit()
     start_time = time.time()
     while True:
-        output = {}
-        output_raw = {}
+        output: Dict[int, str] = {}
+        output_raw: Dict[int, float] = {}
         for device in devices:
             dev_handle = nvmlDeviceGetHandleByIndex(device)
             mem_info = nvmlDeviceGetMemoryInfo(dev_handle)
diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py
index 43cfd78dd..42dd90422 100644
--- a/tests/spec_decode/test_batch_expansion.py
+++ b/tests/spec_decode/test_batch_expansion.py
@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest
 import torch
 
@@ -38,14 +40,14 @@ def test_get_token_ids_to_score(k: int):
         device='cuda',
     )
 
-    expected_output = [
+    expected_output: List[List[int]] = [
         [],
     ]
     for i in range(proposal_token_ids.shape[0]):
         expected_output.append(proposal_token_ids[:i + 1].tolist())
 
     scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000)
-    actual_output = scorer._get_token_ids_to_score(proposal_token_ids)  # pylint: disable=protected-access
+    actual_output = scorer._get_token_ids_to_score(proposal_token_ids.tolist())  # pylint: disable=protected-access
 
     actual_output = [
         x.tolist() if isinstance(x, torch.Tensor) else x for x in actual_output
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py
index 6cea6668a..a6eb628f9 100644
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -1,11 +1,12 @@
 import random
+from typing import Dict, List
 from unittest.mock import MagicMock
 
 import pytest
 import torch
 
 from vllm.model_executor.utils import set_random_seed
-from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.sequence import ExecuteModelRequest, Logprob, SamplerOutput
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.spec_decode.top1_proposer import Top1Proposer
 from vllm.worker.worker import Worker
@@ -210,7 +211,7 @@ def test_same_output_for_multi_step():
 
     # Run single-step repeatedly.
     zero_kv_cache(worker.cache_engine)
-    single_step_output = []
+    single_step_output: List[SamplerOutput] = []
     continuations = [[1] for _ in prompts]
     set_random_seed(seed)
 
@@ -232,11 +233,15 @@ def test_same_output_for_multi_step():
             continuations[i].append(seq_group_output.samples[0].output_token)
 
     # Get token ids and logprobs for comparison.
-    multi_step_output_logprobs = [[] for _ in prompts]
-    single_step_output_logprobs = [[] for _ in prompts]
-
-    multi_step_output_token_ids = [[] for _ in prompts]
-    single_step_output_token_ids = [[] for _ in prompts]
+    multi_step_output_logprobs: List[List[Dict[int,
+                                               Logprob]]] = [[]
+                                                             for _ in prompts]
+    single_step_output_logprobs: List[List[Dict[int,
+                                                Logprob]]] = [[]
+                                                              for _ in prompts]
+
+    multi_step_output_token_ids: List[List[int]] = [[] for _ in prompts]
+    single_step_output_token_ids: List[List[int]] = [[] for _ in prompts]
     for i, _ in enumerate(prompts):
         for multi_step, single_step in zip(multi_step_output,
                                            single_step_output):
diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py
index ef9d32f73..afaeffc96 100644
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -1,5 +1,6 @@
 import random
 from types import SimpleNamespace
+from typing import Dict, List
 from unittest.mock import MagicMock
 
 import pytest
@@ -7,7 +8,7 @@ import torch
 
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.model_executor.utils import set_random_seed
-from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.sequence import ExecuteModelRequest, SamplerOutput, SequenceOutput
 from vllm.spec_decode.interfaces import SpeculativeProposals
 from vllm.spec_decode.metrics import (AsyncMetricsCollector,
                                       SpecDecodeWorkerMetrics)
@@ -103,7 +104,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int):
             seq_group_metadata_list=seq_group_metadata_list,
             num_lookahead_slots=k))
 
-    seen_contexts = []
+    seen_contexts: List[List[int]] = []
 
     call_args_list = target_worker.execute_model.call_args_list
     assert len(call_args_list) == 1
@@ -116,7 +117,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int):
             for seq_data in seq_group_metadata.seq_data.values():
                 seen_contexts.append(seq_data.get_token_ids())
 
-    expected_seen_contexts = []
+    expected_seen_contexts: List[List[int]] = []
 
     for prompt, prev_generated, draft_tokens in zip(
             prompts, prev_output_tokens, proposal_token_ids.tolist()):
@@ -310,8 +311,14 @@ def test_correctly_formats_output(k: int, batch_size: int):
         next(iter(seq_group_metadata.seq_data.keys()))
         for seq_group_metadata in seq_group_metadata_list
     ]
-    actual_output_by_seq = {seq_id: [] for seq_id in seq_ids}
-    expected_output_by_seq = {seq_id: [] for seq_id in seq_ids}
+    actual_output_by_seq: Dict[int, List[SequenceOutput]] = {
+        seq_id: []
+        for seq_id in seq_ids
+    }
+    expected_output_by_seq: Dict[int, List[SequenceOutput]] = {
+        seq_id: []
+        for seq_id in seq_ids
+    }
 
     for step in output:
         for seq_group in step:
diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py
index d52b22c30..ce5b34783 100644
--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
@@ -1,5 +1,7 @@
 from itertools import count
-from typing import Dict, Iterable, List, Optional, Union
+from typing import Callable, Dict, List, Optional
+from typing import Sequence as GenericSequence
+from typing import TypeVar, Union
 from unittest.mock import MagicMock
 
 import torch
@@ -14,6 +16,8 @@ from vllm.utils import get_distributed_init_method, get_ip, get_open_port
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.worker import Worker
 
+T = TypeVar("T", bound=Worker)
+
 
 def round_up_to_next_block(seq_len: int, block_size: int) -> int:
     return (seq_len + block_size - 1) // block_size
@@ -56,13 +60,13 @@ def zero_kv_cache(cache_engine: CacheEngine):
         value_blocks.zero_()
 
 
-def create_worker(cls: type,
+def create_worker(cls: Callable[..., T],
                   model_name: str,
                   block_size: int,
                   num_gpu_blocks: int,
                   seed: int,
                   is_driver_worker: bool = True,
-                  enforce_eager: bool = True):
+                  enforce_eager: bool = True) -> T:
     engine_args = EngineArgs(
         model=model_name,
         seed=seed,
@@ -159,8 +163,8 @@ def assert_logprobs_dict_allclose(
 
 def create_sampler_output_list(
         token_ids: torch.Tensor,
-        probs: Iterable[Optional[torch.Tensor]],
-        logprobs: Iterable[Optional[torch.Tensor]],
+        probs: GenericSequence[Optional[torch.Tensor]],
+        logprobs: GenericSequence[Optional[torch.Tensor]],
         seq_ids: Optional[List[int]] = None) -> List[SamplerOutput]:
     num_steps, batch_size = token_ids.shape
     token_ids_by_step = token_ids.tolist()
diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py
index 0fbe3dae1..fe413d122 100644
--- a/tests/test_cache_block_hashing.py
+++ b/tests/test_cache_block_hashing.py
@@ -51,7 +51,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int,
         max_input_length=None,
     )
 
-    hashes = []
+    hashes: List[List[List[int]]] = []
 
     for prefix in prefixes:
         for lora_int_id in concurrent_lora_int_ids:
diff --git a/tests/test_logger.py b/tests/test_logger.py
index 74f1125fb..52aa73761 100644
--- a/tests/test_logger.py
+++ b/tests/test_logger.py
@@ -47,6 +47,7 @@ def test_default_vllm_root_logger_configuration():
     assert not logger.propagate
 
     handler = logger.handlers[0]
+    assert isinstance(handler, logging.StreamHandler)
     assert handler.stream == sys.stdout
     assert handler.level == logging.INFO
 
diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 8d019fe5f..12e5ae85a 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -153,8 +153,8 @@ def test_decode_sequence_logprobs(complete_sequence: str,
     # Run sequentially.
     seq = create_sequence()
     dummy_logprobs = create_dummy_logprobs(complete_sequence_token_ids)
-    sequential_logprobs_text_chosen_token = []
-    sequential_logprobs_text_other_token = []
+    sequential_logprobs_text_chosen_token: List[str] = []
+    sequential_logprobs_text_other_token: List[str] = []
     for new_token, logprobs in zip(complete_sequence_token_ids,
                                    dummy_logprobs):
         seq.append_token_id(new_token, logprobs)
diff --git a/tests/utils.py b/tests/utils.py
index c84364d20..f2b2d22b1 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -79,7 +79,7 @@ class RemoteOpenAIServer:
         self.host = str(args.host or 'localhost')
         self.port = int(args.port)
 
-        self._runner = self._RemoteRunner.remote(
+        self._runner = self._RemoteRunner.remote(  # type: ignore
             cli_args,
             wait_url=self.url_for("health"),
             wait_timeout=self.MAX_SERVER_START_WAIT_S)
diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py
index 514a57e17..dd0d3bf50 100644
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest
 import torch
 
@@ -35,8 +37,8 @@ def test_prepare_prompt(batch_size):
         enable_chunked_prefill=False,
     )
 
-    seq_lens = []
-    seq_group_metadata_list = []
+    seq_lens: List[int] = []
+    seq_group_metadata_list: List[SequenceGroupMetadata] = []
     block_tables = {0: [1]}
     for i in range(batch_size):
         # make sure all tokens fit into one block
@@ -151,15 +153,14 @@ def test_prepare_decode_cuda_graph(batch_size):
         enable_chunked_prefill=False,
     )
 
-    context_lens = []
-    seq_group_metadata_list = []
+    context_lens: List[int] = []
+    seq_group_metadata_list: List[SequenceGroupMetadata] = []
     # Assume each seq group finishes prefill.
     for i in range(batch_size):
         # make sure all tokens fit into one block
         context_len = i % (model_runner.block_size - 1) + 1
         context_lens.append(context_len)
-        seq_data = list(range(context_len))
-        seq_data = SequenceData(seq_data)
+        seq_data = SequenceData(list(range(context_len)))
         seq_data.update_num_computed_tokens(context_len)
         # Append one token ID since prefill is finished.
         seq_data.append_token_id(1, 0)
@@ -257,7 +258,7 @@ def test_empty_seq_group():
         dtype="float16",
         enforce_eager=False,
     )
-    seq_group_metadata_list = []
+    seq_group_metadata_list: List[SequenceGroupMetadata] = []
     model_input = model_runner._prepare_model_input(seq_group_metadata_list)
     input_tokens, input_positions, attn_metadata, slot_mapping = (
         model_input.input_tokens,
@@ -310,10 +311,10 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init):
     )
 
     # Add prefill requests.
-    seq_lens = []
-    seq_group_metadata_list = []
-    prefill_metadata_list = []
-    decode_metadata_list = []
+    seq_lens: List[int] = []
+    seq_group_metadata_list: List[SequenceGroupMetadata] = []
+    prefill_metadata_list: List[SequenceGroupMetadata] = []
+    decode_metadata_list: List[SequenceGroupMetadata] = []
     block_tables = {0: [1]}
     prefill_batch_size = batch_size // 2
     decode_batch_size = batch_size - prefill_batch_size
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index 4b08cce99..c01e0a0a3 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -245,7 +245,7 @@ def _make_alibi_bias(
     dtype: torch.dtype,
     seq_lens: List[int],
 ) -> List[torch.Tensor]:
-    attn_biases = []
+    attn_biases: List[torch.Tensor] = []
     for seq_len in seq_lens:
         bias = torch.arange(seq_len, dtype=dtype)
         # NOTE(zhuohan): HF uses
@@ -271,7 +271,7 @@ def _make_sliding_window_bias(
     window_size: Optional[int],
     dtype: torch.dtype,
 ) -> List[torch.Tensor]:
-    attn_biases = []
+    attn_biases: List[torch.Tensor] = []
     for seq_len in seq_lens:
         tensor = torch.full(
             (1, seq_len, seq_len),
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 99a3e88bc..0fecd9f6e 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -431,8 +431,8 @@ def _make_alibi_bias(
     num_kv_heads: int,
     dtype: torch.dtype,
     seq_lens: List[int],
-) -> LowerTriangularMaskWithTensorBias:
-    attn_biases = []
+) -> List[AttentionBias]:
+    attn_biases: List[AttentionBias] = []
     for seq_len in seq_lens:
         bias = torch.arange(seq_len, dtype=dtype)
         # NOTE(zhuohan): HF uses
diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index 26f378ba2..d705f3d91 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -252,7 +252,7 @@ class BlockTable:
     def _allocate_blocks_for_token_ids(self, prev_block: Optional[Block],
                                        token_ids: List[int],
                                        device: Device) -> List[Block]:
-        blocks = []
+        blocks: List[Block] = []
         for block_token_ids in chunk_list(token_ids, self._block_size):
             if len(block_token_ids) == self._block_size:
                 # If the block is full, create an immutable block.
diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py
index d03378712..50f27bab3 100644
--- a/vllm/core/block/naive_block.py
+++ b/vllm/core/block/naive_block.py
@@ -111,7 +111,7 @@ class NaiveBlockAllocator(BlockAllocator):
         """
         source_blocks = get_all_blocks_recursively(last_block)
 
-        forked_blocks = []
+        forked_blocks: List[Block] = []
         prev_block = None
         for block in source_blocks:
 
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index 88dbbfb2f..2df7d74e4 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -271,7 +271,7 @@ class PrefixCachingBlockAllocator(BlockAllocator):
         """
         source_blocks = get_all_blocks_recursively(last_block)
 
-        forked_blocks = []
+        forked_blocks: List[Block] = []
         prev_block = None
         for block in source_blocks:
             refcount = self._refcounter.incr(block.block_id)
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py
index 121092cf1..309775237 100644
--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
@@ -260,7 +260,7 @@ class BlockSpaceManagerV2(BlockSpaceManager):
         # at max extend.
         if self.enable_caching:
             block_table = self.block_tables[seq.seq_id]
-            block_ids = []
+            block_ids: List[Optional[int]] = []
             for block_id in block_table.physical_block_ids:
                 block_ids.append(block_id)
             self.block_allocator.mark_blocks_as_accessed(
diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index e6957b119..75b7c374c 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -2,7 +2,7 @@ import ctypes
 import json
 import os
 from itertools import product
-from typing import Dict, Optional, Sequence
+from typing import Dict, List, Optional, Sequence
 
 import torch.distributed as dist
 import torch.multiprocessing as mp
@@ -88,7 +88,7 @@ def consumer(batch_tgt: Sequence[int],
 def can_actually_p2p(
     batch_src: Sequence[int],
     batch_tgt: Sequence[int],
-):
+) -> Sequence[bool]:
     """
     Usually, checking if P2P access is enabled can be done by
     `torch.cuda.can_device_access_peer(src, tgt)`. However, sometimes
@@ -138,7 +138,7 @@ def can_actually_p2p(
     p_tgt.start()
     p_src.join()
     p_tgt.join()
-    result = []
+    result: List[bool] = []
     for src, tgt in zip(batch_src, batch_tgt):
         a = result_queue.get()
         b = result_queue.get()
@@ -188,7 +188,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
         # only the local master process (with local_rank == 0) can
         #  enter this block to calculate the cache
         logger.info("generating GPU P2P access cache in %s", path)
-        cache = {}
+        cache: Dict[str, bool] = {}
         ids = list(range(num_dev))
         # batch of all pairs of GPUs
         batch_src, batch_tgt = zip(*list(product(ids, ids)))
diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py
index 50d6719fb..7619c98f2 100644
--- a/vllm/distributed/device_communicators/pynccl_wrapper.py
+++ b/vllm/distributed/device_communicators/pynccl_wrapper.py
@@ -205,7 +205,7 @@ class NCCLLibrary:
             raise e
 
         if so_file not in NCCLLibrary.path_to_dict_mapping:
-            _funcs = {}
+            _funcs: Dict[str, Any] = {}
             for func in NCCLLibrary.exported_functions:
                 f = getattr(self.lib, func.name)
                 f.restype = func.restype
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index b2f6478cb..fd64337d4 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -2,7 +2,7 @@ import time
 from contextlib import contextmanager
 from typing import TYPE_CHECKING, ClassVar, Iterable, List, Optional
 from typing import Sequence as GenericSequence
-from typing import Type, TypeVar, Union
+from typing import Set, Type, TypeVar, Union
 
 from transformers import GenerationConfig, PreTrainedTokenizer
 
@@ -973,7 +973,7 @@ class LLMEngine:
     def remove_lora(self, lora_id: int) -> bool:
         return self.model_executor.remove_lora(lora_id)
 
-    def list_loras(self) -> List[int]:
+    def list_loras(self) -> Set[int]:
         return self.model_executor.list_loras()
 
     def check_health(self) -> None:
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index ae7ae144b..027f5c7e7 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -144,7 +144,7 @@ class Metrics:
 # end-metrics-definitions
 
 
-def build_1_2_5_buckets(max_value: int):
+def build_1_2_5_buckets(max_value: int) -> List[int]:
     """
     Builds a list of buckets with increasing powers of 10 multiplied by 
     mantissa values (1, 2, 5) until the value exceeds the specified maximum.
@@ -155,7 +155,7 @@ def build_1_2_5_buckets(max_value: int):
     """
     mantissa_lst = [1, 2, 5]
     exponent = 0
-    buckets = []
+    buckets: List[int] = []
     while True:
         for m in mantissa_lst:
             value = m * 10**exponent
diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py
index cad44f476..07a68c65a 100644
--- a/vllm/engine/output_processor/single_step.py
+++ b/vllm/engine/output_processor/single_step.py
@@ -1,4 +1,4 @@
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 from vllm.config import SchedulerConfig
 from vllm.core.scheduler import Scheduler
@@ -146,8 +146,8 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
 
         # Beam search case
         # Select the child sequences to keep in the sequence group.
-        selected_child_seqs = []
-        unselected_child_seqs = []
+        selected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = []
+        unselected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = []
         beam_width = seq_group.sampling_params.best_of
         length_penalty = seq_group.sampling_params.length_penalty
 
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 7a6819c35..91e567924 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -2,6 +2,7 @@ import argparse
 import asyncio
 import sys
 from io import StringIO
+from typing import Awaitable, List
 
 import aiohttp
 
@@ -114,7 +115,7 @@ async def main(args):
     )
 
     # Submit all requests in the file to the engine "concurrently".
-    response_futures = []
+    response_futures: List[Awaitable[BatchRequestOutput]] = []
     for request_json in (await read_file(args.input_file)).strip().split("\n"):
         request = BatchRequestInput.model_validate_json(request_json)
         response_futures.append(run_request(openai_serving_chat, request))
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 7cd434fe0..769406124 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -487,7 +487,7 @@ class OpenAIServingChat(OpenAIServing):
             final_res = res
         assert final_res is not None
 
-        choices = []
+        choices: List[ChatCompletionResponseChoice] = []
 
         role = self.get_chat_request_role(request)
         for output in final_res.outputs:
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index 5a3448de3..cbf09f173 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -25,7 +25,7 @@ def request_output_to_embedding_response(
     created_time: int,
     model_name: str,
 ) -> EmbeddingResponse:
-    data = []
+    data: List[EmbeddingResponseData] = []
     num_prompt_tokens = 0
     for idx, final_res in enumerate(final_res_batch):
         assert final_res is not None
diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py
index d7794aa7c..8f3c7f769 100644
--- a/vllm/lora/lora.py
+++ b/vllm/lora/lora.py
@@ -1,4 +1,5 @@
 from typing import List, Optional
+from typing import Sequence as GenericSequence
 
 import torch
 
@@ -120,7 +121,7 @@ class PackedLoRALayerWeights(LoRALayerWeights):
 
     @classmethod
     def pack(
-            cls, loras: List[Optional["LoRALayerWeights"]]
+        cls, loras: GenericSequence[Optional["LoRALayerWeights"]]
     ) -> "PackedLoRALayerWeights":
         """Pack a list of LoRAs into a single LoRA.
 
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 4657757bd..498b2b9dd 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -165,7 +165,7 @@ class WorkerLoRAManager(AbstractWorkerLoRAManager):
             model = self._lora_manager.model
             supported_lora_modules = model.supported_lora_modules
             packed_modules_mapping = model.packed_modules_mapping
-            expected_lora_modules = []
+            expected_lora_modules: List[str] = []
             for module in supported_lora_modules:
                 if module in packed_modules_mapping:
                     expected_lora_modules.extend(
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index f5b6bdd9f..58c379bcd 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -393,7 +393,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                 param_data.copy_(loaded_weight)
                 return
             current_shard_offset = 0
-            shard_offsets = []
+            shard_offsets: List[Tuple[int, int, int]] = []
             for i, output_size in enumerate(self.output_sizes):
                 shard_offsets.append((i, current_shard_offset, output_size))
                 current_shard_offset += output_size
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index ae440743f..599070f15 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -25,24 +25,25 @@ GPTQ_MARLIN_SUPPORTED_SYM = [True]
 
 
 # Permutations for Marlin scale shuffling
-def get_scale_perms(num_bits):
-    scale_perm = []
+def get_scale_perms(num_bits: int):
+    scale_perm: List[int] = []
     for i in range(8):
         scale_perm.extend([i + 8 * j for j in range(8)])
-    scale_perm_single = []
+    scale_perm_single: List[int] = []
     for i in range(4):
         scale_perm_single.extend(
             [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
     return scale_perm, scale_perm_single
 
 
-def get_pack_factor(num_bits):
+def get_pack_factor(num_bits: int):
     assert (num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS
             ), f"Unsupported num_bits = {num_bits}"
     return 32 // num_bits
 
 
-def marlin_permute_scales(s, size_k, size_n, group_size, num_bits):
+def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
+                          group_size: int, num_bits: int):
     scale_perm, scale_perm_single = get_scale_perms(num_bits)
     if group_size < size_k and group_size != -1:
         s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py b/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py
index 12e77cb71..93f65a20d 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py
@@ -1,4 +1,6 @@
 """This file is used for /tests and /benchmarks"""
+from typing import Dict, List
+
 import numpy
 import torch
 
@@ -11,10 +13,10 @@ import torch
 #
 # As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
 # (without the need to use ldmatrix instructions) # noqa: E501
-def get_perms_24(num_bits):
-    perm_list = []
+def get_perms_24(num_bits: int):
+    perm_list: List[int] = []
     for i in range(32):
-        perm1 = []
+        perm1: List[int] = []
         col = i // 4
         col_o = col // 2
         for block in [0, 1]:
@@ -39,18 +41,18 @@ def get_perms_24(num_bits):
 
     perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
     perm = torch.from_numpy(perm)
-    scale_perm = []
+    scale_perm: List[int] = []
     for i in range(8):
         scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]])
-    scale_perm_single = []
+    scale_perm_single: List[int] = []
     for i in range(8):
         scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]])
     return perm, scale_perm, scale_perm_single
 
 
-marlin_24_perm = {}
-marlin_24_scale_perm = {}
-marlin_24_scale_perm_single = {}
+marlin_24_perm: Dict[int, torch.Tensor] = {}
+marlin_24_scale_perm: Dict[int, List[int]] = {}
+marlin_24_scale_perm_single: Dict[int, List[int]] = {}
 for num_bits in [4, 8]:
     perm_24, scale_perm_24, scale_perm_single_24 = get_perms_24(num_bits)
     marlin_24_perm[num_bits] = perm_24
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_perms.py b/vllm/model_executor/layers/quantization/utils/marlin_perms.py
index 76bd2ff7c..db5e6857a 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_perms.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_perms.py
@@ -1,4 +1,6 @@
 """This file is used for /tests and /benchmarks"""
+from typing import Dict, List
+
 import numpy
 import torch
 
@@ -11,10 +13,10 @@ import torch
 #
 # As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
 # (without the need to use ldmatrix instructions) # noqa: E501
-def get_perms(num_bits):
-    perm_list = []
+def get_perms(num_bits: int):
+    perm_list: List[int] = []
     for i in range(32):
-        perm1 = []
+        perm1: List[int] = []
         col = i // 4
         for block in [0, 1]:
             for row in [
@@ -38,19 +40,19 @@ def get_perms(num_bits):
 
     perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
     perm = torch.from_numpy(perm)
-    scale_perm = []
+    scale_perm: List[int] = []
     for i in range(8):
         scale_perm.extend([i + 8 * j for j in range(8)])
-    scale_perm_single = []
+    scale_perm_single: List[int] = []
     for i in range(4):
         scale_perm_single.extend(
             [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
     return perm, scale_perm, scale_perm_single
 
 
-marlin_perm = {}
-marlin_scale_perm = {}
-marlin_scale_perm_single = {}
+marlin_perm: Dict[int, torch.Tensor] = {}
+marlin_scale_perm: Dict[int, List[int]] = {}
+marlin_scale_perm_single: Dict[int, List[int]] = {}
 for num_bits in [4, 8]:
     perm, scale_perm, scale_perm_single = get_perms(num_bits)
     marlin_perm[num_bits] = perm
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py
index a84f56290..e07360a2f 100644
--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -174,7 +174,7 @@ def _apply_min_tokens_penalty(
         min_tokens = sampling_params.min_tokens
         token_ids_to_penalize = sampling_params.all_stop_token_ids
         if min_tokens > 0 and token_ids_to_penalize:
-            seqs_to_penalize = []
+            seqs_to_penalize: List[int] = []
             for j, seq_id in enumerate(seq_ids):
                 seq_data = seq_group.seq_data[seq_id]
                 if len(seq_data.output_token_ids) < min_tokens:
@@ -285,7 +285,7 @@ def _greedy_sample(
         same as the length of selected_seq_groups. If the corresponding
         seq_group has do_sample=False, tuple contains ([], [])
     """
-    samples = samples.tolist()
+    samples_lst = samples.tolist()
     sample_idx = 0
     results: SampleResultType = []
     for seq_group in selected_seq_groups:
@@ -298,7 +298,7 @@ def _greedy_sample(
         assert num_parent_seqs == 1, (
             "Greedy sampling should have only one seq.")
         parent_ids = list(range(num_parent_seqs))
-        next_token_ids = [samples[sample_idx]]
+        next_token_ids = [samples_lst[sample_idx]]
         results.append((next_token_ids, parent_ids))
         sample_idx += num_parent_seqs
     return results
@@ -394,7 +394,7 @@ def _beam_search_sample(
             next_token_ids = next_token_ids.tolist()
         else:
             # Generation phase.
-            cumulative_logprobs: List[int] = [
+            cumulative_logprobs: List[float] = [
                 seq_group.seq_data[seq_id].cumulative_logprob
                 for seq_id in seq_ids
             ]
@@ -466,8 +466,9 @@ def _sample_with_torch(
         categorized_seq_group_ids[sampling_type].append(i)
 
     sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {}
-    sample_metadata = {}
-    multinomial_samples = {}
+    sample_metadata: Dict[SamplingType,
+                          Tuple[List[int], List[SequenceGroupToSample]]] = {}
+    multinomial_samples: Dict[SamplingType, torch.Tensor] = {}
 
     # Create output tensor for sampled token ids.
     if include_gpu_probs_tensor:
@@ -494,7 +495,7 @@ def _sample_with_torch(
             greedy_samples = torch.argmax(logprobs[long_sample_indices],
                                           dim=-1)
 
-            if include_gpu_probs_tensor:
+            if sampled_token_ids_tensor is not None:
                 # Store sampled tokens in output tensor.
                 sampled_token_ids_tensor[
                     long_sample_indices] = greedy_samples.unsqueeze(-1)
@@ -522,7 +523,7 @@ def _sample_with_torch(
                 probs[long_sample_indices], max_best_of_in_batch,
                 **seeded_args)
 
-            if include_gpu_probs_tensor:
+            if sampled_token_ids_tensor is not None:
                 # Store sampled tokens in output tensor.
                 sampled_token_ids_tensor[
                     long_sample_indices] = multinomial_samples[sampling_type]
@@ -571,7 +572,9 @@ def _sample_with_triton_kernel(
         categorized_seq_group_ids[sampling_type].append(i)
 
     sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {}
-    sample_metadata = {}
+    sample_metadata: Dict[SamplingType,
+                          Tuple[List[int], List[SequenceGroupToSample],
+                                torch.Tensor, torch.Tensor]] = {}
     max_best_of_in_batch = 1
 
     # Counterintiutively, having two loops here is actually faster.
@@ -1008,14 +1011,14 @@ def _build_sampler_output(
             speculative decoding rejection sampling.
     """
 
-    sampler_output = []
+    sampler_output: List[CompletionSequenceGroupOutput] = []
     for (seq_group, sample_result, group_prompt_logprobs,
          group_sample_logprobs) in zip(sampling_metadata.seq_groups,
                                        sample_results, prompt_logprobs,
                                        sample_logprobs):
         seq_ids = seq_group.seq_ids
         next_token_ids, parent_ids = sample_result
-        seq_outputs = []
+        seq_outputs: List[SequenceOutput] = []
         for parent_id, next_token_id, logprobs in zip(parent_ids,
                                                       next_token_ids,
                                                       group_sample_logprobs):
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 06de2fcc1..d3babcf9c 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -68,7 +68,7 @@ def _get_model_initialization_kwargs(
         vision_language_config: Optional[VisionLanguageConfig]
 ) -> Dict[str, Any]:
     """Get extra kwargs for model initialization."""
-    extra_kwargs = {}
+    extra_kwargs: Dict[str, Any] = {}
     if hasattr(model_class, "supported_lora_modules"):
         extra_kwargs["lora_config"] = lora_config
     elif lora_config:
@@ -446,7 +446,8 @@ class ShardedStateLoader(BaseModelLoader):
         Filter out all tensors that share the same memory or a subset of the
         memory of another tensor.
         """
-        same_storage_groups = collections.defaultdict(list)
+        same_storage_groups: Dict[Any, List[Tuple[
+            str, torch.Tensor]]] = collections.defaultdict(list)
         for key, tensor in tensors.items():
             if tensor.numel():
                 ptr = tensor.untyped_storage().data_ptr()
@@ -455,7 +456,7 @@ class ShardedStateLoader(BaseModelLoader):
         def get_end_ptr(tensor: torch.Tensor) -> int:
             return tensor.view(-1)[-1].data_ptr() + tensor.element_size()
 
-        result = {}
+        result: Dict[str, torch.Tensor] = {}
         for group in same_storage_groups.values():
             for k, t in group:
                 a, b = t.data_ptr(), get_end_ptr(t)
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 827591b22..943022a3f 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -329,7 +329,7 @@ def np_cache_weights_iterator(
     # dumping the same model weights to numpy at the same time.
     with get_lock(model_name_or_path, cache_dir):
         if not os.path.exists(weight_names_file):
-            weight_names = []
+            weight_names: List[str] = []
             for bin_file in hf_weights_files:
                 state = torch.load(bin_file, map_location="cpu")
                 for name, param in state.items():
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 4446914c6..bed6f518c 100755
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -72,11 +72,11 @@ _MODELS = {**_GENERATION_MODELS, **_EMBEDDING_MODELS}
 _OOT_MODELS: Dict[str, Type[nn.Module]] = {}
 
 # Models not supported by ROCm.
-_ROCM_UNSUPPORTED_MODELS = []
+_ROCM_UNSUPPORTED_MODELS: List[str] = []
 
 # Models partially supported by ROCm.
 # Architecture -> Reason.
-_ROCM_PARTIALLY_SUPPORTED_MODELS = {
+_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
     "Qwen2ForCausalLM":
     "Sliding window attention is not yet supported in ROCm's flash attention",
     "MistralForCausalLM":
diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py
index 313762b13..577761107 100644
--- a/vllm/model_executor/models/arctic.py
+++ b/vllm/model_executor/models/arctic.py
@@ -453,8 +453,8 @@ class ArcticForCausalLM(nn.Module):
             ("qkv_proj", "v_proj", "v"),
         ]
 
-        mlp_params_mapping = []
-        expert_params_mapping = []
+        mlp_params_mapping: List[Tuple[str, str, int]] = []
+        expert_params_mapping: List[Tuple[str, str, int]] = []
         num_layers = self.config.num_hidden_layers
 
         for layer in range(num_layers):
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 84786921c..11d88d45e 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -20,7 +20,7 @@
 
 # This file is based on the LLama model definition file in transformers
 """PyTorch Cohere model."""
-from typing import Iterable, List, Optional, Tuple
+from typing import Iterable, List, Optional, Set, Tuple
 
 import torch
 import torch.utils.checkpoint
@@ -352,7 +352,7 @@ class CohereForCausalLM(nn.Module):
             ("gate_up_proj", "up_proj", 1),
         ]
         params_dict = dict(self.named_parameters())
-        loaded_params = set()
+        loaded_params: Set[str] = set()
         for name, loaded_weight in weights:
             for param_name, shard_name, shard_id in stacked_params_mapping:
                 if shard_name not in name:
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 27dda00b6..65f4ebec5 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 """Inference-only Gemma model compatible with HuggingFace weights."""
 from functools import lru_cache
-from typing import Iterable, List, Optional, Tuple
+from typing import Iterable, List, Optional, Set, Tuple
 
 import torch
 from torch import nn
@@ -363,7 +363,7 @@ class GemmaForCausalLM(nn.Module):
             ("gate_up_proj", "up_proj", 1),
         ]
         params_dict = dict(self.named_parameters())
-        loaded_params = set()
+        loaded_params: Set[str] = set()
         for name, loaded_weight in weights:
             for (param_name, shard_name, shard_id) in stacked_params_mapping:
                 if shard_name not in name:
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 2f27bf33b..54243bfb1 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -123,7 +123,7 @@ class SequenceData:
             output_token_ids = []
 
         self.prompt_token_ids = prompt_token_ids
-        self._prompt_token_ids_tuple: Tuple[int, ...] = tuple(prompt_token_ids)
+        self._prompt_token_ids_tuple = tuple(prompt_token_ids)
         self.output_token_ids = output_token_ids
         self.cumulative_logprob = 0.0
         # The number of tokens that are computed (that run against the model).
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index fe15ea33b..668ceefe6 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -1,10 +1,10 @@
 import copy
 import weakref
-from typing import List, Tuple
+from typing import Dict, List, Tuple
 
 import torch
 
-from vllm.sequence import (ExecuteModelRequest, SamplerOutput,
+from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData,
                            SequenceGroupMetadata)
 from vllm.spec_decode.interfaces import SpeculativeProposals
 from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
@@ -71,7 +71,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase):
                                      sample_len)
 
         # Run model sample_len times.
-        model_outputs = []
+        model_outputs: List[SamplerOutput] = []
         for _ in range(sample_len):
             model_output = super().execute_model(
                 execute_model_req=copied_execute_model_req)
@@ -132,7 +132,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase):
 
         # Shallow-copy the list of SequenceGroupMetadata. This allows us to
         # append tokens and change is_prompt without external side-effects.
-        new_seq_group_metadata_list = []
+        new_seq_group_metadata_list: List[SequenceGroupMetadata] = []
 
         for old_seq_group_metadata in seq_group_metadata_list:
             # We must shallow-copy seq_group_metadata as is_prompt could change.
@@ -140,7 +140,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase):
             new_seq_group_metadata_list.append(seq_group_metadata)
 
             # We must shallow-copy seq_data as we will append token ids
-            new_seq_data = {}
+            new_seq_data: Dict[int, SequenceData] = {}
             for seq_id, old_seq_data in seq_group_metadata.seq_data.items():
                 new_seq_data[seq_id] = copy.copy(old_seq_data)
                 new_seq_data[
diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py
index 33af588d0..23a3e1649 100644
--- a/vllm/spec_decode/ngram_worker.py
+++ b/vllm/spec_decode/ngram_worker.py
@@ -48,7 +48,7 @@ class NGramWorker(NonLLMProposerWorkerBase, LoraNotSupportedWorkerBase):
         self,
         execute_model_req: ExecuteModelRequest,
         sample_len: int,
-    ) -> Tuple[Optional[List[SamplerOutput]], bool]:
+    ) -> Tuple[Optional[List[Optional[SamplerOutput]]], bool]:
         """NGram match algo to pick proposal candidate. Returns the list of
         sampler output, one per SequenceGroupMetadata.
 
@@ -58,8 +58,8 @@ class NGramWorker(NonLLMProposerWorkerBase, LoraNotSupportedWorkerBase):
         self._raise_if_unsupported(execute_model_req)
 
         has_spec_out = False
-        token_id_list = []
-        token_prob_list = []
+        token_id_list: List[Optional[torch.Tensor]] = []
+        token_prob_list: List[Optional[torch.Tensor]] = []
         for idx, seq_group_metadata in enumerate(
                 execute_model_req.seq_group_metadata_list):
             seq_data = next(iter(seq_group_metadata.seq_data.values()))
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 8b147c806..03fad5663 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -7,8 +7,8 @@ from vllm.config import SpeculativeConfig
 from vllm.distributed.communication_op import broadcast_tensor_dict
 from vllm.logger import init_logger
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
-from vllm.sequence import (ExecuteModelRequest, SamplerOutput,
-                           SequenceGroupMetadata)
+from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest,
+                           SamplerOutput, SequenceGroupMetadata)
 from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                          SpeculativeScorer, SpeculativeScores)
@@ -516,13 +516,13 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
         topk_indices_by_step = topk_indices_by_step.tolist()
 
         # Construct the output on a per-step, per-sequence basis.
-        sampler_output_list = []
+        sampler_output_list: List[SamplerOutput] = []
         for step_index in range(num_steps):
             if all(token_id == -1
                    for token_id in accepted_token_ids_by_step[step_index]):
                 break
 
-            step_output_token_ids = []
+            step_output_token_ids: List[CompletionSequenceGroupOutput] = []
             for sequence_index in range(batch_size):
                 # Each sequence may have a different num_logprobs; retrieve it.
                 num_logprobs = num_logprobs_per_seq[sequence_index]
diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py
index 60ed9d39e..9bbe3f8d1 100644
--- a/vllm/spec_decode/util.py
+++ b/vllm/spec_decode/util.py
@@ -26,10 +26,10 @@ def get_all_num_logprobs(
     sequence.
     """
 
-    all_num_logprobs = []
+    all_num_logprobs: List[int] = []
     for seq_group_metadata in seq_group_metadata_list:
         num_logprobs = seq_group_metadata.sampling_params.logprobs
-        if seq_group_metadata.sampling_params.logprobs is None:
+        if num_logprobs is None:
             num_logprobs = 0
         all_num_logprobs.append(num_logprobs)
 
diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py
index f064c26c3..e8e53f494 100644
--- a/vllm/transformers_utils/detokenizer.py
+++ b/vllm/transformers_utils/detokenizer.py
@@ -44,7 +44,7 @@ class Detokenizer:
         read_offset = 0
         next_iter_prefix_offset = 0
         next_iter_read_offset = 0
-        next_iter_tokens = []
+        next_iter_tokens: List[str] = []
         prev_tokens = None
 
         for token_position, prompt_logprobs_for_token in enumerate(
diff --git a/vllm/utils.py b/vllm/utils.py
index b5c42605b..9b39ca77a 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -20,12 +20,13 @@ from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic,
 import numpy as np
 import psutil
 import torch
+import torch.types
+from typing_extensions import ParamSpec
 
 import vllm.envs as envs
 from vllm import _custom_ops as ops
 from vllm.logger import enable_trace_function_call, init_logger
 
-T = TypeVar("T")
 logger = init_logger(__name__)
 
 STR_DTYPE_TO_TORCH_DTYPE = {
@@ -37,6 +38,10 @@ STR_DTYPE_TO_TORCH_DTYPE = {
     "fp8_e5m2": torch.uint8,
 }
 
+P = ParamSpec('P')
+K = TypeVar("K")
+T = TypeVar("T")
+
 
 class Device(enum.Enum):
     GPU = enum.auto()
@@ -176,7 +181,7 @@ def random_uuid() -> str:
 
 
 @lru_cache(maxsize=None)
-def get_vllm_instance_id():
+def get_vllm_instance_id() -> str:
     """
     If the environment variable VLLM_INSTANCE_ID is set, return it.
     Otherwise, return a random UUID.
@@ -192,7 +197,7 @@ def in_wsl() -> bool:
     return "microsoft" in " ".join(uname()).lower()
 
 
-def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]:
+def make_async(func: Callable[P, T]) -> Callable[P, Awaitable[T]]:
     """Take a blocking function, and run it on in an executor thread.
 
     This function prevents the blocking function from blocking the
@@ -200,7 +205,7 @@ def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]:
     The code in this function needs to be thread safe.
     """
 
-    def _async_wrapper(*args, **kwargs) -> asyncio.Future:
+    def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> asyncio.Future:
         loop = asyncio.get_event_loop()
         p_func = partial(func, *args, **kwargs)
         return loop.run_in_executor(executor=None, func=p_func)
@@ -325,7 +330,7 @@ def update_environment_variables(envs: Dict[str, str]):
         os.environ[k] = v
 
 
-def chunk_list(lst, chunk_size):
+def chunk_list(lst: List[T], chunk_size: int) -> List[List[T]]:
     """Yield successive chunk_size chunks from lst."""
     return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
 
@@ -336,7 +341,7 @@ def cdiv(a: int, b: int) -> int:
 
 
 def _generate_random_fp8(
-    tensor: torch.tensor,
+    tensor: torch.Tensor,
     low: float,
     high: float,
 ) -> None:
@@ -398,7 +403,10 @@ def create_kv_caches_with_random_flash(
     torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
     key_value_cache_shape = (num_blocks, 2, block_size, num_heads, head_size)
     scale = head_size**-0.5
-    key_caches, value_caches = [], []
+
+    key_caches: List[torch.Tensor] = []
+    value_caches: List[torch.Tensor] = []
+
     for _ in range(num_layers):
         key_value_cache = torch.empty(size=key_value_cache_shape,
                                       dtype=torch_dtype,
@@ -429,7 +437,7 @@ def create_kv_caches_with_random(
     scale = head_size**-0.5
     x = 16 // torch.tensor([], dtype=torch_dtype).element_size()
     key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
-    key_caches = []
+    key_caches: List[torch.Tensor] = []
     for _ in range(num_layers):
         key_cache = torch.empty(size=key_cache_shape,
                                 dtype=torch_dtype,
@@ -444,7 +452,7 @@ def create_kv_caches_with_random(
         key_caches.append(key_cache)
 
     value_cache_shape = (num_blocks, num_heads, head_size, block_size)
-    value_caches = []
+    value_caches: List[torch.Tensor] = []
     for _ in range(num_layers):
         value_cache = torch.empty(size=value_cache_shape,
                                   dtype=torch_dtype,
@@ -484,7 +492,7 @@ def is_pin_memory_available() -> bool:
 
 class CudaMemoryProfiler:
 
-    def __init__(self, device=None):
+    def __init__(self, device: Optional[torch.types.Device] = None):
         self.device = device
 
     def current_memory_usage(self) -> float:
@@ -560,13 +568,13 @@ def get_dtype_size(dtype: torch.dtype) -> int:
     return torch.tensor([], dtype=dtype).element_size()
 
 
-def merge_dicts(dict1: Dict[Any, List[Any]],
-                dict2: Dict[Any, List[Any]]) -> Dict[Any, List[Any]]:
+def merge_dicts(dict1: Dict[K, List[T]],
+                dict2: Dict[K, List[T]]) -> Dict[K, List[T]]:
     """Merge 2 dicts that have key -> List of items.
 
     When a key conflicts, the values in dict1 is prioritized.
     """
-    merged_dict = defaultdict(list)
+    merged_dict: Dict[K, List[T]] = defaultdict(list)
 
     for key, value in dict1.items():
         merged_dict[key].extend(value)
@@ -577,7 +585,7 @@ def merge_dicts(dict1: Dict[Any, List[Any]],
     return dict(merged_dict)
 
 
-def init_cached_hf_modules():
+def init_cached_hf_modules() -> None:
     """
     Lazy initialization of the Hugging Face modules.
     """
@@ -613,7 +621,7 @@ def find_library(lib_name: str) -> str:
     return locs[0]
 
 
-def find_nccl_library():
+def find_nccl_library() -> str:
     """
     We either use the library file specified by the `VLLM_NCCL_SO_PATH`
     environment variable, or we find the library file brought by PyTorch.
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 476e9ba3b..d0baa4337 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -779,8 +779,8 @@ class ModelRunner:
         # that will have unique loras, an therefore the max amount of memory
         # consumption create dummy lora request copies from the lora request
         # passed in, which contains a lora from the lora warmup path.
-        dummy_lora_requests = []
-        dummy_lora_requests_per_seq = []
+        dummy_lora_requests: List[LoRARequest] = []
+        dummy_lora_requests_per_seq: List[LoRARequest] = []
         if self.lora_config:
             assert self.lora_manager is not None
             with self.lora_manager.dummy_lora_cache():
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 258f31de1..3d52fd71e 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -99,8 +99,8 @@ class WorkerWrapperBase:
     """
 
     def __init__(self,
-                 worker_module_name=None,
-                 worker_class_name=None,
+                 worker_module_name: str,
+                 worker_class_name: str,
                  trust_remote_code: bool = False) -> None:
         self.worker_module_name = worker_module_name
         self.worker_class_name = worker_class_name
-- 
GitLab


From 81fbb3655f37e2b3ccbe0e17276c5d813b886417 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 15 Jun 2024 19:29:42 +0800
Subject: [PATCH 054/376] [CI/Build] Test both text and token IDs in batched
 OpenAI Completions API (#5568)

---
 tests/entrypoints/test_openai_server.py | 88 +++++++++++++------------
 1 file changed, 45 insertions(+), 43 deletions(-)

diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index d66b9b0fd..c22a675ff 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -655,50 +655,52 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,
     [MODEL_NAME, "zephyr-lora"],
 )
 async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
-    # test simple list
-    batch = await client.completions.create(
-        model=model_name,
-        prompt=["Hello, my name is", "Hello, my name is"],
-        max_tokens=5,
-        temperature=0.0,
-    )
-    assert len(batch.choices) == 2
-    assert batch.choices[0].text == batch.choices[1].text
-
-    # test n = 2
-    batch = await client.completions.create(
-        model=model_name,
-        prompt=["Hello, my name is", "Hello, my name is"],
-        n=2,
-        max_tokens=5,
-        temperature=0.0,
-        extra_body=dict(
-            # NOTE: this has to be true for n > 1 in vLLM, but not necessary
-            # for official client.
-            use_beam_search=True),
-    )
-    assert len(batch.choices) == 4
-    assert batch.choices[0].text != batch.choices[
-        1].text, "beam search should be different"
-    assert batch.choices[0].text == batch.choices[
-        2].text, "two copies of the same prompt should be the same"
-    assert batch.choices[1].text == batch.choices[
-        3].text, "two copies of the same prompt should be the same"
+    # test both text and token IDs
+    for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2):
+        # test simple list
+        batch = await client.completions.create(
+            model=model_name,
+            prompt=prompts,
+            max_tokens=5,
+            temperature=0.0,
+        )
+        assert len(batch.choices) == 2
+        assert batch.choices[0].text == batch.choices[1].text
 
-    # test streaming
-    batch = await client.completions.create(
-        model=model_name,
-        prompt=["Hello, my name is", "Hello, my name is"],
-        max_tokens=5,
-        temperature=0.0,
-        stream=True,
-    )
-    texts = [""] * 2
-    async for chunk in batch:
-        assert len(chunk.choices) == 1
-        choice = chunk.choices[0]
-        texts[choice.index] += choice.text
-    assert texts[0] == texts[1]
+        # test n = 2
+        batch = await client.completions.create(
+            model=model_name,
+            prompt=prompts,
+            n=2,
+            max_tokens=5,
+            temperature=0.0,
+            extra_body=dict(
+                # NOTE: this has to be true for n > 1 in vLLM, but not necessary
+                # for official client.
+                use_beam_search=True),
+        )
+        assert len(batch.choices) == 4
+        assert batch.choices[0].text != batch.choices[
+            1].text, "beam search should be different"
+        assert batch.choices[0].text == batch.choices[
+            2].text, "two copies of the same prompt should be the same"
+        assert batch.choices[1].text == batch.choices[
+            3].text, "two copies of the same prompt should be the same"
+
+        # test streaming
+        batch = await client.completions.create(
+            model=model_name,
+            prompt=prompts,
+            max_tokens=5,
+            temperature=0.0,
+            stream=True,
+        )
+        texts = [""] * 2
+        async for chunk in batch:
+            assert len(chunk.choices) == 1
+            choice = chunk.choices[0]
+            texts[choice.index] += choice.text
+        assert texts[0] == texts[1]
 
 
 @pytest.mark.asyncio
-- 
GitLab


From e691918e3bd75a05bc473c77577c494aa6442640 Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Sat, 15 Jun 2024 23:59:36 +0900
Subject: [PATCH 055/376] [misc] Do not allow to use lora with chunked prefill.
 (#5538)

Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 vllm/config.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/config.py b/vllm/config.py
index d9e4a619e..54f36e1d6 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1092,6 +1092,8 @@ class LoRAConfig:
                 "Due to limitations of the custom LoRA CUDA kernel, "
                 "max_num_batched_tokens must be <= 65528 when "
                 "LoRA is enabled.")
+        if scheduler_config.chunked_prefill_enabled:
+            raise ValueError("LoRA is not supported with chunked prefill yet.")
 
 
 @dataclass
-- 
GitLab


From d919ecc771ece6995a949c3d4284c534a2bd0890 Mon Sep 17 00:00:00 2001
From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com>
Date: Sat, 15 Jun 2024 13:38:16 -0400
Subject: [PATCH 056/376] add gptq_marlin test for bug report
 https://github.com/vllm-project/vllm/issues/5088 (#5145)

---
 tests/models/test_gptq_marlin.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py
index e30100d9b..4abbc41c9 100644
--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
@@ -40,6 +40,9 @@ MODELS = [
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"),
     # 8-bit, act_order==True, group_size=32
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"),
+
+    # 4-bit, act_order==True, group_size=128
+    ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")
 ]
 
 
-- 
GitLab


From 1c0afa13c57766641e75172ff1cac2e09f79a3b9 Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Sat, 15 Jun 2024 16:30:51 -0700
Subject: [PATCH 057/376] [BugFix] Don't start a Ray cluster when not using Ray
 (#5570)

---
 vllm/config.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 54f36e1d6..c0d294ce9 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -616,9 +616,14 @@ class ParallelConfig:
                                      "required for multi-node inference")
                 backend = "ray"
             elif ray_found:
-                from ray.util import get_current_placement_group
-                if self.placement_group or get_current_placement_group():
+                if self.placement_group:
                     backend = "ray"
+                else:
+                    from ray import is_initialized as ray_is_initialized
+                    if ray_is_initialized():
+                        from ray.util import get_current_placement_group
+                        if get_current_placement_group():
+                            backend = "ray"
             self.distributed_executor_backend = backend
             logger.info("Defaulting to use %s for distributed inference",
                         backend)
-- 
GitLab


From 3ce2c050dd919542ef5355635edf71349ea597f2 Mon Sep 17 00:00:00 2001
From: zifeitong <zifei.tong@parasail.io>
Date: Sat, 15 Jun 2024 16:57:54 -0700
Subject: [PATCH 058/376] [Fix] Correct OpenAI batch response format (#5554)

---
 vllm/entrypoints/openai/protocol.py  | 13 ++++++++++++-
 vllm/entrypoints/openai/run_batch.py | 17 +++++++++++++----
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 3b56ad63f..b57d79859 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -672,6 +672,17 @@ class BatchRequestInput(OpenAIBaseModel):
     body: Union[ChatCompletionRequest, ]
 
 
+class BatchResponseData(OpenAIBaseModel):
+    # HTTP status code of the response.
+    status_code: int = 200
+
+    # An unique identifier for the API request.
+    request_id: str
+
+    # The body of the response.
+    body: Union[ChatCompletionResponse, ]
+
+
 class BatchRequestOutput(OpenAIBaseModel):
     """
     The per-line object of the batch output and error files
@@ -683,7 +694,7 @@ class BatchRequestOutput(OpenAIBaseModel):
     # inputs.
     custom_id: str
 
-    response: Optional[ChatCompletionResponse]
+    response: Optional[BatchResponseData]
 
     # For requests that failed with a non-HTTP error, this will contain more
     # information on the cause of the failure.
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 91e567924..b0c0f4ad2 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -10,7 +10,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import (BatchRequestInput,
                                               BatchRequestOutput,
-                                              ChatCompletionResponse)
+                                              BatchResponseData,
+                                              ChatCompletionResponse,
+                                              ErrorResponse)
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
@@ -77,20 +79,27 @@ async def run_request(chat_serving: OpenAIServingChat,
                       request: BatchRequestInput) -> BatchRequestOutput:
     chat_request = request.body
     chat_response = await chat_serving.create_chat_completion(chat_request)
+
     if isinstance(chat_response, ChatCompletionResponse):
         batch_output = BatchRequestOutput(
             id=f"vllm-{random_uuid()}",
             custom_id=request.custom_id,
-            response=chat_response,
+            response=BatchResponseData(
+                body=chat_response, request_id=f"vllm-batch-{random_uuid()}"),
             error=None,
         )
-    else:
+    elif isinstance(chat_response, ErrorResponse):
         batch_output = BatchRequestOutput(
             id=f"vllm-{random_uuid()}",
             custom_id=request.custom_id,
-            response=None,
+            response=BatchResponseData(
+                status_code=chat_response.code,
+                request_id=f"vllm-batch-{random_uuid()}"),
             error=chat_response,
         )
+    else:
+        raise ValueError("Request must not be sent in stream mode")
+
     return batch_output
 
 
-- 
GitLab


From f31c1f90e381967d25591a8928782d8a6a13693e Mon Sep 17 00:00:00 2001
From: Antoni Baum <antoni.baum@protonmail.com>
Date: Sun, 16 Jun 2024 00:48:02 -0700
Subject: [PATCH 059/376] Add basic correctness 2 GPU tests to 4 GPU pipeline
 (#5518)

---
 .buildkite/test-pipeline.yaml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 6a2932db9..6439a315e 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -32,7 +32,7 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
 
-- label: Distributed Tests
+- label: Distributed Tests (2 GPUs)
   mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
@@ -50,12 +50,16 @@ steps:
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
 
-- label: Distributed Tests (Multiple Groups)
+- label: Distributed Tests (4 GPUs)
   #mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   commands:
   - pytest -v -s distributed/test_pynccl.py
+  # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
+  # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
 
 - label: Engine Test
   mirror_hardwares: [amd]
-- 
GitLab


From 4a6769053ab2616f7f490e6ec5b8241e76ef0c2a Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Sun, 16 Jun 2024 10:07:34 -0400
Subject: [PATCH 060/376] [CI][BugFix] Flip is_quant_method_supported condition
 (#5577)

---
 tests/quantization/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py
index 0c92d565d..29085916a 100644
--- a/tests/quantization/utils.py
+++ b/tests/quantization/utils.py
@@ -10,5 +10,5 @@ def is_quant_method_supported(quant_method: str) -> bool:
 
     capability = torch.cuda.get_device_capability()
     capability = capability[0] * 10 + capability[1]
-    return (capability <
+    return (capability >=
             QUANTIZATION_METHODS[quant_method].get_min_capability())
-- 
GitLab


From f07d5133202c08899eb5f51134af0f43b7791a33 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 16 Jun 2024 16:07:01 -0700
Subject: [PATCH 061/376] [build][misc] limit numpy version (#5582)

---
 requirements-common.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index bf9987e3a..32e2ebe8c 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -2,7 +2,7 @@ cmake >= 3.21
 ninja  # For faster builds.
 psutil
 sentencepiece  # Required for LLaMA tokenizer.
-numpy
+numpy < 2.0.0
 requests
 py-cpuinfo
 transformers >= 4.40.0  # Required for StarCoder2 & Llava, Llama 3.
-- 
GitLab


From 845a3f26f9706acafe8fa45ae452846d8cc3b97f Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 16 Jun 2024 19:08:01 -0700
Subject: [PATCH 062/376] [Doc] add debugging tips for crash and multi-node
 debugging (#5581)

---
 docs/source/getting_started/debugging.rst | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst
index ff37f4e62..a22bba147 100644
--- a/docs/source/getting_started/debugging.rst
+++ b/docs/source/getting_started/debugging.rst
@@ -24,6 +24,8 @@ If you have already taken care of the above issues, but the vLLM instance still
 
 With more logging, hopefully you can find the root cause of the issue.
 
+If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the ``LLM`` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error.
+
 Here are some common issues that can cause hangs:
 
 - **Incorrect network setup**: The vLLM instance cannot get the correct IP address. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``.
@@ -31,15 +33,26 @@ Here are some common issues that can cause hangs:
 
 .. code-block:: python
 
-    # save it as `test.py` , and run it with `NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py`
-    # adjust `--nproc-per-node` to the number of GPUs you want to use.
     import torch
     import torch.distributed as dist
     dist.init_process_group(backend="nccl")
-    data = torch.FloatTensor([1,] * 128).to(f"cuda:{dist.get_rank()}")
+    local_rank = dist.get_rank() % torch.cuda.device_count()
+    data = torch.FloatTensor([1,] * 128).to(f"cuda:{local_rank}")
     dist.all_reduce(data, op=dist.ReduceOp.SUM)
     torch.cuda.synchronize()
     value = data.mean().item()
     assert value == dist.get_world_size()
 
+.. tip::
+
+    Save the script as ``test.py``.
+    
+    If you are testing in a single-node, run it with ``NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py``, adjust ``--nproc-per-node`` to the number of GPUs you want to use.
+    
+    If you are testing with multi-nodes, run it with ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py``. Adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup. Make sure ``MASTER_ADDR``:
+  
+    - is the correct IP address of the master node
+    - is reachable from all nodes
+    - is set before running the script.
+
 If the problem persists, feel free to `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_, with a detailed description of the issue, your environment, and the logs.
-- 
GitLab


From e2b85cf86a522e734a38b1d0314cfe9625003ef9 Mon Sep 17 00:00:00 2001
From: Cody Yu <hao.yu.cody@gmail.com>
Date: Sun, 16 Jun 2024 23:48:06 -0700
Subject: [PATCH 063/376] Fix w8a8 benchmark and add Llama-3-8B (#5562)

---
 .../cutlass_benchmarks/w8a8_benchmarks.py     | 21 ++++++++++++-------
 .../cutlass_benchmarks/weight_shapes.py       |  6 ++++++
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
index 182105f0b..523e970c2 100644
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -46,7 +46,7 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
 # impl
 
 
-def pytorch_i8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
+def pytorch_mm_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
                     scale_b: torch.tensor,
                     out_dtype: torch.dtype) -> torch.tensor:
     return torch.mm(a, b)
@@ -115,7 +115,7 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     timers.append(
         bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
                  b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
-                 torch.bfloat16, label, sub_label, pytorch_i8_impl,
+                 torch.bfloat16, label, sub_label, pytorch_mm_impl,
                  "pytorch_bf16_bf16_bf16_matmul-no-scales"))
 
     # cutlass impl
@@ -136,6 +136,13 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
 
     timers = []
 
+    # pytorch impl w. bf16
+    timers.append(
+        bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
+                 b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
+                 torch.bfloat16, label, sub_label, pytorch_mm_impl,
+                 "pytorch_bf16_bf16_bf16_matmul-no-scales"))
+
     # pytorch impl: bf16 output, without fp8 fast accum
     timers.append(
         bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
@@ -160,14 +167,12 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
 
     # cutlass impl: bf16 output
     timers.append(
-        bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
-                 torch.bfloat16, label, sub_label, cutlass_impl,
-                 "cutlass_fp8_fp8_bf16_scaled_mm"))
+        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
+                 cutlass_impl, "cutlass_fp8_fp8_bf16_scaled_mm"))
     # cutlass impl: fp16 output
     timers.append(
-        bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
-                 torch.float16, label, sub_label, cutlass_impl,
-                 "cutlass_fp8_fp8_fp16_scaled_mm"))
+        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
+                 cutlass_impl, "cutlass_fp8_fp8_fp16_scaled_mm"))
     return timers
 
 
diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py
index 7ad4a53d3..25ec9d602 100644
--- a/benchmarks/cutlass_benchmarks/weight_shapes.py
+++ b/benchmarks/cutlass_benchmarks/weight_shapes.py
@@ -22,6 +22,12 @@ WEIGHT_SHAPES = {
         ([4096, 22016], 1),
         ([11008, 4096], 0),
     ],
+    "meta-llama/Llama-3-8b": [
+        ([4096, 6144], 1),
+        ([4096, 4096], 0),
+        ([4096, 28672], 1),
+        ([14336, 4096], 0),
+    ],
     "meta-llama/Llama-2-13b-hf": [
         ([5120, 15360], 1),
         ([5120, 5120], 0),
-- 
GitLab


From 9333fb8eb9ed6a62d33ef4d56d589f83a0f19233 Mon Sep 17 00:00:00 2001
From: Amit Garg <gargamit@microsoft.com>
Date: Mon, 17 Jun 2024 09:04:14 -0700
Subject: [PATCH 064/376] [Model] Rename Phi3 rope scaling type (#5595)

---
 vllm/config.py                                |  5 ++++-
 .../model_executor/layers/rotary_embedding.py | 19 ++++++++++++-------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index c0d294ce9..552d5033f 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1287,7 +1287,10 @@ def _get_and_verify_max_len(
         derived_max_model_len = default_max_len
 
     rope_scaling = getattr(hf_config, "rope_scaling", None)
-    if rope_scaling is not None and rope_scaling["type"] != "su":
+    # The correct one should be "longrope", kept "su" here
+    # to be backward compatible
+    if rope_scaling is not None and rope_scaling["type"] != "su" \
+        and rope_scaling["type"] != "longrope":
         if disable_sliding_window:
             # TODO(robertgshaw): Find a model that supports rope_scaling
             # with sliding window to see if this case should be allowed.
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 792c47293..5a4940acb 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -467,7 +467,7 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding):
         return cache
 
 
-class Phi3SuScaledRotaryEmbedding(nn.Module):
+class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
     """Phi3 family of models scaled rotary embedding.
 
     Based on the original RotaryEmbedding implementation.
@@ -491,11 +491,12 @@ class Phi3SuScaledRotaryEmbedding(nn.Module):
 
         if rotary_dim != head_size:
             raise ValueError(
-                f"`Phi3SuScaledRotaryEmbedding` does not support rotary_dim != \
-                    head_size ({rotary_dim}!={head_size}).")
+                f"`Phi3LongRoPEScaledRotaryEmbedding` does not support \
+                    rotary_dim != head_size ({rotary_dim}!={head_size}).")
         if is_neox_style is False:
             raise ValueError(
-                "`Phi3SuScaledRotaryEmbedding` only supports neox_style.")
+                "`Phi3LongRoPEScaledRotaryEmbedding` only supports neox_style."
+            )
 
         self.head_size = head_size
         self.max_position_embeddings = max_position_embeddings
@@ -608,7 +609,9 @@ def get_rope(
                                      is_neox_style, dtype)
     else:
         scaling_type = rope_scaling["type"]
-        if scaling_type != "su":
+        # The correct one should be "longrope" but keep "su" here
+        # for backward compatible
+        if scaling_type != "su" and scaling_type != "longrope":
             scaling_factor = rope_scaling["factor"]
         if scaling_type == "linear":
             rotary_emb = LinearScalingRotaryEmbedding(head_size, rotary_dim,
@@ -633,7 +636,9 @@ def get_rope(
                                                     base, is_neox_style,
                                                     scaling_factor, dtype,
                                                     **extra_kwargs)
-        elif scaling_type == "su":
+        # The correct one should be "longrope" but keep "su" here
+        # for backward compatible
+        elif scaling_type == "su" or scaling_type == "longrope":
             short_factor = rope_scaling["short_factor"]
             long_factor = rope_scaling["long_factor"]
             original_max_position = rope_scaling[
@@ -643,7 +648,7 @@ def get_rope(
                 for k, v in rope_scaling.items()
                 if k in ("short_mscale", "long_mscale")
             }
-            rotary_emb = Phi3SuScaledRotaryEmbedding(
+            rotary_emb = Phi3LongRoPEScaledRotaryEmbedding(
                 head_size, rotary_dim, max_position, original_max_position,
                 base, is_neox_style, dtype, short_factor, long_factor,
                 **extra_kwargs)
-- 
GitLab


From 9e74d9d003d546c17dca472c3f4b48be651f1d7c Mon Sep 17 00:00:00 2001
From: Charles Riggins <liqianchen123@foxmail.com>
Date: Tue, 18 Jun 2024 00:05:33 +0800
Subject: [PATCH 065/376] Correct alignment in the seq_len diagram. (#5592)

Co-authored-by: Liqian Chen <liqian.chen@deeplang.ai>
---
 vllm/attention/backends/flash_attn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 300bab728..1c48e2a0b 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -83,7 +83,7 @@ class FlashAttentionMetadata(AttentionMetadata):
     # |---------------- N iteration ---------------------|
     # |- tokenA -|......................|-- newTokens ---|
     # |---------- context_len ----------|
-    # |-------------------- seq_len ----------------------|
+    # |-------------------- seq_len ---------------------|
     #                                   |-- query_len ---|
 
     # Maximum query length in the batch. None for decoding.
-- 
GitLab


From 890d8d960bb441b4ac46588492db7f16b6da78d7 Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Mon, 17 Jun 2024 12:32:48 -0400
Subject: [PATCH 066/376] [Kernel] `compressed-tensors` marlin 24 support
 (#5435)

---
 tests/quantization/test_compressed_tensors.py |  23 ++-
 .../compressed_tensors/compressed_tensors.py  |  48 ++++---
 .../compressed_tensors/schemes/__init__.py    |   2 +
 .../schemes/compressed_tensors_w4a16_24.py    | 134 ++++++++++++++++++
 .../quantization/compressed_tensors/utils.py  |   8 ++
 5 files changed, 196 insertions(+), 19 deletions(-)
 create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py

diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 5670498f2..611c6b8b7 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -9,7 +9,8 @@ import torch
 from vllm import SamplingParams
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
     CompressedTensorsLinearMethod, CompressedTensorsW4A16,
-    CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor)
+    CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8DynamicToken,
+    CompressedTensorsW8A8StaticTensor)
 
 
 def test_compressed_tensors_w8a8_static_setup(vllm_runner):
@@ -51,8 +52,7 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):
 
 def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
     model_path = "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"
-    with vllm_runner(model_path, enforce_eager=True,
-                     dtype=torch.float16) as llm:
+    with vllm_runner(model_path, dtype=torch.float16) as llm:
         model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
         layer = model.model.layers[0]
 
@@ -83,3 +83,20 @@ def test_compressed_tensors_w4a16(vllm_runner, w4a16_args):
         assert qkv_proj.weight_packed.dtype is torch.int32
         assert qkv_proj.weight_scale.dtype is torch.float16
         assert qkv_proj.weight_packed.pack_factor == 8
+
+
+def test_compressed_tensors_w4a16_marlin24(vllm_runner):
+    model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
+    with vllm_runner(model_path) as llm:
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
+        layer = model.model.layers[0]
+
+        qkv_proj = layer.self_attn.qkv_proj
+
+        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
+        assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24)
+        assert qkv_proj.weight_packed.dtype is torch.int32
+
+        sampling_params = SamplingParams()
+        output = llm.generate("Hello world!", sampling_params=sampling_params)
+        assert output
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index e134a26ef..92a84b3c0 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -8,16 +8,20 @@ from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme, CompressedTensorsW4A16,
-    CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor)
+    CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8DynamicToken,
+    CompressedTensorsW8A8StaticTensor)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    QuantizationArgs, QuantizationStrategy, find_first_name_or_class_match)
+    CompressionFormat, QuantizationArgs, QuantizationStrategy,
+    find_first_name_or_class_match)
 
 
 class CompressedTensorsConfig(QuantizationConfig):
 
-    def __init__(self, layer_quant_details: Dict[str, Any], ignore: List[str]):
+    def __init__(self, layer_quant_details: Dict[str, Any], ignore: List[str],
+                 quant_format: str):
         self.ignore = ignore
         self.layer_quant_details = layer_quant_details
+        self.quant_format = quant_format
 
     def get_linear_method(self) -> "CompressedTensorsLinearMethod":
         return CompressedTensorsLinearMethod(self)
@@ -46,6 +50,7 @@ class CompressedTensorsConfig(QuantizationConfig):
     def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig":
         layer_quant_details: Dict[str, Any] = dict()
         ignore: List[str] = config.get("ignore", None)
+        quant_format: str = config.get("format", None)
 
         # The quant_config has multiple config_groups, each containing
         # an input_activations key with details about how the activations are
@@ -69,7 +74,9 @@ class CompressedTensorsConfig(QuantizationConfig):
                 except Exception:
                     layer_quant_details[target]["input_activations"] = None
 
-        return cls(layer_quant_details=layer_quant_details, ignore=ignore)
+        return cls(layer_quant_details=layer_quant_details,
+                   ignore=ignore,
+                   quant_format=quant_format)
 
     @classmethod
     def get_config_filenames(cls) -> List[str]:
@@ -110,17 +117,26 @@ class CompressedTensorsConfig(QuantizationConfig):
                     input_quant: BaseModel) -> "CompressedTensorsScheme":
 
         if self._is_w4a16(weight_quant, input_quant):
-            return CompressedTensorsW4A16(num_bits=weight_quant.num_bits,
-                                          strategy=weight_quant.strategy,
-                                          group_size=weight_quant.group_size)
-
-        if self._is_static_tensor_w8a8(weight_quant, input_quant):
-            return CompressedTensorsW8A8StaticTensor()
-
-        if self._is_dynamic_token_w8a8(weight_quant, input_quant):
-            return CompressedTensorsW8A8DynamicToken()
-
-        raise NotImplementedError("Scheme not supported.")
+            if self.quant_format == CompressionFormat.marlin_24.value:
+                return CompressedTensorsW4A16Sparse24(
+                    strategy=weight_quant.strategy,
+                    num_bits=weight_quant.num_bits,
+                    group_size=weight_quant.group_size)
+            if self.quant_format == CompressionFormat.pack_quantized.value:
+                return CompressedTensorsW4A16(
+                    num_bits=weight_quant.num_bits,
+                    strategy=weight_quant.strategy,
+                    group_size=weight_quant.group_size)
+
+        if self.quant_format == CompressionFormat.int_quantized.value:
+            if self._is_static_tensor_w8a8(weight_quant, input_quant):
+                return CompressedTensorsW8A8StaticTensor()
+
+            if self._is_dynamic_token_w8a8(weight_quant, input_quant):
+                return CompressedTensorsW8A8DynamicToken()
+
+        raise NotImplementedError(
+            "No compressed-tensors compatible scheme was found.")
 
     def get_scheme(self, layer: torch.nn.Module) -> "CompressedTensorsScheme":
 
@@ -165,9 +181,9 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
         scheme = self.quantization_config.get_scheme(layer=layer)
         scheme.create_weights(
             layer=layer,
+            input_size=input_size,
             input_size_per_partition=input_size_per_partition,
             output_partition_sizes=output_partition_sizes,
-            input_size=input_size,
             output_size=output_size,
             params_dtype=params_dtype,
             weight_loader=weight_loader)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
index dc84d0008..3c95aa11f 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -2,6 +2,8 @@ from .compressed_tensors_scheme import CompressedTensorsScheme  # noqa: F401
 from .compressed_tensors_unquantized import (  # noqa: F401
     CompressedTensorsUnquantized)
 from .compressed_tensors_w4a16 import CompressedTensorsW4A16  # noqa: F401
+from .compressed_tensors_w4a16_24 import (  # noqa: F401
+    CompressedTensorsW4A16Sparse24)
 from .compressed_tensors_w8a8_dynamictoken import (  # noqa: F401, E501
     CompressedTensorsW8A8DynamicToken)
 from .compressed_tensors_w8a8_statictensor import (  # noqa: F401, E501
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
new file mode 100644
index 000000000..d7e04ddb8
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
@@ -0,0 +1,134 @@
+from typing import Callable, List, Optional
+
+import torch
+from torch.nn import Parameter
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
+    GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N)
+from vllm.model_executor.utils import set_weight_attrs
+
+__all__ = ["CompressedTensorsW4A16Sparse24"]
+
+
+class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme):
+
+    def __init__(self,
+                 strategy: str,
+                 num_bits: int,
+                 group_size: Optional[int] = None):
+        self.strategy = strategy
+        self.group_size = group_size
+        self.num_bits = num_bits
+        self.tile_size = 16
+
+        if self.strategy == "group" and self.group_size is None:
+            raise ValueError(
+                "group_size must be given when using strategy group")
+
+    def create_weights(self, layer: torch.nn.Module, input_size: int,
+                       output_partition_sizes: List[int],
+                       input_size_per_partition: int,
+                       params_dtype: torch.dtype, weight_loader: Callable,
+                       **kwargs):
+
+        pack_factor = 32 // self.num_bits
+        output_size_per_partition = sum(output_partition_sizes)
+
+        qweight = Parameter(
+            torch.empty(
+                input_size_per_partition // self.tile_size // 2,
+                output_size_per_partition * self.tile_size // pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            qweight,
+            {
+                "input_dim": 0,
+                "output_dim": 1,
+                "packed_dim": 1,
+                "pack_factor": pack_factor,
+                "marlin_tile_size": self.tile_size,
+                "weight_loader": weight_loader
+            },
+        )
+
+        layer.register_parameter("weight_packed", qweight)
+
+        input_groups = (1 if self.group_size is None else
+                        input_size_per_partition // self.group_size)
+
+        scales = Parameter(
+            torch.empty(
+                input_groups,
+                output_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            scales,
+            {
+                "output_dim": 1,
+                "input_dim": None if input_groups == 1 else 0,
+                "weight_loader": weight_loader
+            },
+        )
+        layer.register_parameter("scale_packed", scales)
+
+        weight_shape = Parameter(torch.empty(2, dtype=torch.int64),
+                                 requires_grad=False)
+
+        layer.register_parameter("weight_shape", weight_shape)
+        set_weight_attrs(weight_shape, {"weight_loader": weight_loader})
+
+        meta = Parameter(
+            torch.empty(
+                input_size_per_partition // 8 // 2 // 2,
+                output_size_per_partition * 2,
+                dtype=torch.int16,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            meta,
+            {
+                "input_dim": 0,
+                "packed_dim": 1,
+                "pack_factor": 1,
+                "output_dim": 1,
+                "marlin_tile_size": 2,
+                "weight_loader": weight_loader
+            },
+        )
+        layer.register_parameter("meta", meta)
+
+        max_workspace_size = (
+            output_size_per_partition //
+            GPTQ_MARLIN_24_MIN_THREAD_N) * GPTQ_MARLIN_24_MAX_PARALLEL
+        workspace = Parameter(torch.zeros(max_workspace_size, dtype=torch.int),
+                              requires_grad=False)
+        layer.workspace = workspace
+
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
+        qweight = layer.weight_packed
+        meta = layer.meta
+        scales = layer.scale_packed
+        workspace = layer.workspace
+
+        x_2d = x.view(-1, x.shape[-1])
+
+        size_m = x_2d.shape[0]
+        size_k = x_2d.shape[1]
+        size_n = scales.shape[1]
+
+        output_2d = ops.gptq_marlin_24_gemm(x_2d, qweight, meta, scales,
+                                            workspace, self.num_bits, size_m,
+                                            size_n, size_k)
+
+        output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], ))
+        return output
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index fcc664910..b2bec9b60 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -6,6 +6,14 @@ from pydantic import BaseModel, Field
 from torch.nn import Module
 
 
+class CompressionFormat(Enum):
+    dense = "dense"
+    sparse_bitmask = "sparse-bitmask"
+    int_quantized = "int-quantized"
+    pack_quantized = "pack-quantized"
+    marlin_24 = "marlin-24"
+
+
 class QuantizationType(str, Enum):
     """
     Enum storing quantization type options
-- 
GitLab


From 1f12122b1714c855c02699775bcd2fb2b34f2577 Mon Sep 17 00:00:00 2001
From: zhyncs <me@zhyncs.com>
Date: Tue, 18 Jun 2024 00:40:35 +0800
Subject: [PATCH 067/376] [Misc] use AutoTokenizer for benchmark serving when
 vLLM not installed (#5588)

---
 benchmarks/backend_request_func.py | 29 ++++++++++++++++++++++++++++-
 benchmarks/benchmark_serving.py    |  5 ++++-
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 52386b8cd..4350b96b0 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -4,10 +4,13 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import List, Optional, Union
 
 import aiohttp
+import huggingface_hub.constants
 from tqdm.asyncio import tqdm
+from transformers import (AutoTokenizer, PreTrainedTokenizer,
+                          PreTrainedTokenizerFast)
 
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
 
@@ -388,6 +391,30 @@ def remove_prefix(text: str, prefix: str) -> str:
     return text
 
 
+def get_model(pretrained_model_name_or_path: str):
+    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
+        from modelscope import snapshot_download
+    else:
+        from huggingface_hub import snapshot_download
+
+    model_path = snapshot_download(
+        model_id=pretrained_model_name_or_path,
+        local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+        ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
+    return model_path
+
+
+def get_tokenizer(
+    pretrained_model_name_or_path: str, trust_remote_code: bool
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    if pretrained_model_name_or_path is not None and not os.path.exists(
+            pretrained_model_name_or_path):
+        pretrained_model_name_or_path = get_model(
+            pretrained_model_name_or_path)
+    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
+                                         trust_remote_code=trust_remote_code)
+
+
 ASYNC_REQUEST_FUNCS = {
     "tgi": async_request_tgi,
     "vllm": async_request_openai_completions,
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index c136ee572..eef03e7d8 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -39,7 +39,10 @@ from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 
-from vllm.transformers_utils.tokenizer import get_tokenizer
+try:
+    from vllm.transformers_utils.tokenizer import get_tokenizer
+except ImportError:
+    from backend_request_func import get_tokenizer
 
 
 @dataclass
-- 
GitLab


From 728c4c8a063c25e7a20d6eda20a3f30873bda4c6 Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Tue, 18 Jun 2024 02:01:25 +0800
Subject: [PATCH 068/376] [Hardware][Intel GPU] Add Intel GPU(XPU) inference
 backend (#3814)

Co-authored-by: Jiang Li <jiang1.li@intel.com>
Co-authored-by: Abhilash Majumder <abhilash.majumder@intel.com>
Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
---
 .buildkite/run-xpu-test.sh                    |  14 +
 .buildkite/test-template.j2                   |   5 +
 Dockerfile.xpu                                |  22 +
 benchmarks/benchmark_latency.py               |   2 +-
 benchmarks/benchmark_throughput.py            |   2 +-
 .../getting_started/xpu-installation.rst      |  61 +++
 docs/source/index.rst                         |   1 +
 requirements-xpu.txt                          |  11 +
 setup.py                                      |   8 +
 vllm/_custom_ops.py                           |   3 +-
 vllm/_ipex_ops.py                             | 241 ++++++++++
 vllm/attention/backends/ipex_attn.py          | 355 +++++++++++++++
 vllm/attention/selector.py                    |  15 +-
 vllm/config.py                                |   4 +-
 vllm/distributed/parallel_state.py            |   2 +-
 vllm/engine/arg_utils.py                      |  11 +-
 vllm/engine/async_llm_engine.py               |  11 +
 vllm/engine/llm_engine.py                     |   8 +
 vllm/executor/ray_utils.py                    |   4 +-
 vllm/executor/ray_xpu_executor.py             | 401 +++++++++++++++++
 vllm/executor/xpu_executor.py                 |  98 ++++
 vllm/model_executor/custom_op.py              |   8 +-
 vllm/model_executor/layers/activation.py      |  35 ++
 vllm/model_executor/layers/layernorm.py       |  24 +
 .../model_executor/layers/rotary_embedding.py |  23 +
 .../layers/vocab_parallel_embedding.py        |   2 +-
 vllm/utils.py                                 |  31 +-
 vllm/worker/cache_engine.py                   |   7 +-
 vllm/worker/worker.py                         |   3 +-
 vllm/worker/xpu_model_runner.py               | 417 ++++++++++++++++++
 vllm/worker/xpu_worker.py                     | 193 ++++++++
 31 files changed, 1998 insertions(+), 24 deletions(-)
 create mode 100644 .buildkite/run-xpu-test.sh
 create mode 100644 Dockerfile.xpu
 create mode 100644 docs/source/getting_started/xpu-installation.rst
 create mode 100644 requirements-xpu.txt
 create mode 100644 vllm/_ipex_ops.py
 create mode 100644 vllm/attention/backends/ipex_attn.py
 create mode 100644 vllm/executor/ray_xpu_executor.py
 create mode 100644 vllm/executor/xpu_executor.py
 create mode 100644 vllm/worker/xpu_model_runner.py
 create mode 100644 vllm/worker/xpu_worker.py

diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh
new file mode 100644
index 000000000..22a7e7693
--- /dev/null
+++ b/.buildkite/run-xpu-test.sh
@@ -0,0 +1,14 @@
+# This script build the CPU docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t xpu-test -f Dockerfile.xpu .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f xpu-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image and launch offline inference
+docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
index 4a20a462b..3bd1e90c2 100644
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -45,6 +45,11 @@ steps:
       queue: intel
     command: bash .buildkite/run-cpu-test.sh
 
+  - label: "XPU Test"
+    agents:
+      queue: intel
+    command: bash .buildkite/run-xpu-test.sh
+
   {% for step in steps %}
   - label: "{{ step.label }}"
     agents:
diff --git a/Dockerfile.xpu b/Dockerfile.xpu
new file mode 100644
index 000000000..c39e55167
--- /dev/null
+++ b/Dockerfile.xpu
@@ -0,0 +1,22 @@
+FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04
+
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+    rm /etc/apt/sources.list.d/intel-graphics.list && \
+    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+    chmod 644 /usr/share/keyrings/intel-graphics.gpg
+
+RUN apt-get update  -y \
+&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+
+RUN pip install -v -r requirements-xpu.txt
+
+RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
+
+CMD ["/bin/bash"]
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 9937f8333..11d1bf7a4 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -191,7 +191,7 @@ if __name__ == '__main__':
         "--device",
         type=str,
         default="cuda",
-        choices=["cuda", "cpu", "tpu"],
+        choices=["cuda", "cpu", "tpu", "xpu"],
         help='device type for vLLM execution, supporting CUDA and CPU.')
     parser.add_argument('--block-size',
                         type=int,
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 48dfce428..ed65002bc 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -349,7 +349,7 @@ if __name__ == "__main__":
         "--device",
         type=str,
         default="cuda",
-        choices=["cuda", "cpu", "tpu"],
+        choices=["cuda", "cpu", "tpu", "xpu"],
         help='device type for vLLM execution, supporting CUDA and CPU.')
     parser.add_argument(
         "--enable-prefix-caching",
diff --git a/docs/source/getting_started/xpu-installation.rst b/docs/source/getting_started/xpu-installation.rst
new file mode 100644
index 000000000..4f0d2da25
--- /dev/null
+++ b/docs/source/getting_started/xpu-installation.rst
@@ -0,0 +1,61 @@
+.. _installation_xpu:
+
+Installation with XPU
+========================
+
+vLLM initially supports basic model inferencing and serving on Intel GPU platform.
+
+Table of contents:
+
+#. :ref:`Requirements <xpu_backend_requirements>`
+#. :ref:`Quick start using Dockerfile <xpu_backend_quick_start_dockerfile>`
+#. :ref:`Build from source <build_xpu_backend_from_source>`
+
+.. _xpu_backend_requirements:
+
+Requirements
+------------
+
+* OS: Linux
+* Supported Hardware: Intel Data Center GPU (Intel ARC GPU WIP)
+* OneAPI requirements: oneAPI 2024.1 
+
+.. _xpu_backend_quick_start_dockerfile:
+
+Quick start using Dockerfile
+----------------------------
+
+.. code-block:: console
+
+    $ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
+    $ docker run -it \
+                 --rm \
+                 --network=host \
+                 --device /dev/dri \
+                 -v /dev/dri/by-path:/dev/dri/by-path \
+                 vllm-xpu-env
+
+.. _build_xpu_backend_from_source:
+
+Build from source
+-----------------
+
+- First, install required driver and intel OneAPI 2024.1.
+
+- Second, install Python packages for vLLM XPU backend building:
+
+.. code-block:: console
+
+    $ pip install --upgrade pip
+    $ pip install -v -r requirements-xpu.txt 
+
+- Finally, build and install vLLM XPU backend: 
+
+.. code-block:: console
+
+    $ VLLM_TARGET_DEVICE=xpu python setup.py install
+
+.. note::
+    - FP16 is the default data type in the current XPU backend. The BF16 data
+      type will be supported in the future.
+
diff --git a/docs/source/index.rst b/docs/source/index.rst
index f5d862759..8795a865c 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -66,6 +66,7 @@ Documentation
    getting_started/cpu-installation
    getting_started/neuron-installation
    getting_started/tpu-installation
+   getting_started/xpu-installation
    getting_started/quickstart
    getting_started/debugging
    getting_started/examples/examples_index
diff --git a/requirements-xpu.txt b/requirements-xpu.txt
new file mode 100644
index 000000000..48d899ec7
--- /dev/null
+++ b/requirements-xpu.txt
@@ -0,0 +1,11 @@
+# Common dependencies
+-r requirements-common.txt
+
+setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed.
+
+torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl
+intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl
+oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl
+
+triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+
diff --git a/setup.py b/setup.py
index 12a704e08..b2ae6def8 100644
--- a/setup.py
+++ b/setup.py
@@ -233,6 +233,10 @@ def _is_cpu() -> bool:
     return VLLM_TARGET_DEVICE == "cpu"
 
 
+def _is_xpu() -> bool:
+    return VLLM_TARGET_DEVICE == "xpu"
+
+
 def _build_custom_ops() -> bool:
     return _is_cuda() or _is_hip() or _is_cpu()
 
@@ -337,6 +341,8 @@ def get_vllm_version() -> str:
         version += "+tpu"
     elif _is_cpu():
         version += "+cpu"
+    elif _is_xpu():
+        version += "+xpu"
     else:
         raise RuntimeError("Unknown runtime environment")
 
@@ -386,6 +392,8 @@ def get_requirements() -> List[str]:
         requirements = _read_requirements("requirements-tpu.txt")
     elif _is_cpu():
         requirements = _read_requirements("requirements-cpu.txt")
+    elif _is_xpu():
+        requirements = _read_requirements("requirements-xpu.txt")
     else:
         raise ValueError(
             "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 2f84b8bde..ab2a67950 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -373,7 +373,8 @@ def reshape_and_cache_flash(
                                                    kv_cache_dtype)
 
 
-def copy_blocks(key_caches: torch.Tensor, value_caches: torch.Tensor,
+def copy_blocks(key_caches: List[torch.Tensor],
+                value_caches: List[torch.Tensor],
                 block_mapping: torch.Tensor) -> None:
     torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping)
 
diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py
new file mode 100644
index 000000000..1e60e0848
--- /dev/null
+++ b/vllm/_ipex_ops.py
@@ -0,0 +1,241 @@
+from typing import List, Optional, Tuple
+
+import torch
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+try:
+    import intel_extension_for_pytorch as ipex
+except ImportError as e:
+    logger.warning("Import error msg: %s", e.msg)
+
+
+class ipex_ops:
+
+    @staticmethod
+    def _reshape_activation_tensor(
+            x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        num = x.size(0)
+        d = x.size(1) // 2
+        x = x.reshape(num, 2, d)
+        x1, x2 = torch.chunk(x, chunks=2, dim=1)
+        x1 = x1.reshape(num, d)
+        x2 = x2.reshape(num, d)
+        return x1, x2
+
+    def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
+        x1, x2 = ipex_ops._reshape_activation_tensor(x)
+        ipex.llm.functional.silu_mul(x1, x2, out)
+
+    def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
+        x1, x2 = ipex_ops._reshape_activation_tensor(x)
+        ipex.llm.functional.gelu_mul(x1, x2, out, "none")
+
+    def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
+        x1, x2 = ipex_ops._reshape_activation_tensor(x)
+        ipex.llm.functional.gelu_mul(x1, x2, out, "tanh")
+
+    def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None:
+        out.copy_(torch.nn.functional.gelu(x))
+
+    def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
+        out.copy_(torch.nn.functional.gelu(x))
+
+    def paged_attention_v1(
+        out: torch.Tensor,
+        query: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        num_kv_heads: int,
+        scale: float,
+        block_tables: torch.Tensor,
+        context_lens: torch.Tensor,
+        block_size: int,
+        max_context_len: int,
+        alibi_slopes: Optional[torch.Tensor],
+        kv_cache_dtype: str,
+        kv_scale: float,
+        tp_rank: int = 0,
+        blocksparse_local_blocks: int = 0,
+        blocksparse_vert_stride: int = 0,
+        blocksparse_block_size: int = 64,
+        blocksparse_head_sliding_step: int = 0,
+    ) -> None:
+        assert kv_cache_dtype == "auto"
+        num_heads = out.size(1)
+        num_queries_per_tokens = num_heads // num_kv_heads
+        head_mapping = torch.arange(
+            0,
+            num_kv_heads,
+            device=query.device,
+            dtype=torch.int32,
+        ).view(num_kv_heads,
+               1).repeat_interleave(num_queries_per_tokens).flatten()
+        # todo: ipex will refactor namespace
+        torch.xpu.paged_attention_v1(out, query.contiguous(),
+                                     key_cache.view_as(value_cache),
+                                     value_cache, head_mapping, scale,
+                                     block_tables, context_lens, block_size,
+                                     max_context_len, alibi_slopes)
+
+    def paged_attention_v2(
+        out: torch.Tensor,
+        exp_sum: torch.Tensor,
+        max_logits: torch.Tensor,
+        tmp_out: torch.Tensor,
+        query: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        num_kv_heads: int,
+        scale: float,
+        block_tables: torch.Tensor,
+        context_lens: torch.Tensor,
+        block_size: int,
+        max_context_len: int,
+        alibi_slopes: Optional[torch.Tensor],
+        kv_cache_dtype: str,
+        kv_scale: float,
+        tp_rank: int = 0,
+        blocksparse_local_blocks: int = 0,
+        blocksparse_vert_stride: int = 0,
+        blocksparse_block_size: int = 64,
+        blocksparse_head_sliding_step: int = 0,
+    ) -> None:
+        assert kv_cache_dtype == "auto"
+        num_heads = out.size(1)
+        num_queries_per_tokens = num_heads // num_kv_heads
+        head_mapping = torch.arange(
+            0,
+            num_kv_heads,
+            dtype=torch.int32,
+            device=query.device,
+        ).view(num_kv_heads,
+               1).repeat_interleave(num_queries_per_tokens).flatten()
+        # todo: ipex will refactor namespace
+        torch.xpu.paged_attention_v2(out, exp_sum, max_logits, tmp_out,
+                                     query.contiguous(),
+                                     key_cache.view_as(value_cache),
+                                     value_cache, head_mapping, block_tables,
+                                     context_lens, scale, block_size,
+                                     max_context_len, alibi_slopes)
+
+    def rotary_embedding(
+        positions: torch.Tensor,  # [batch_size, seq_len]
+        query: torch.Tensor,  # [batch_size, seq_len, num_heads*head_size]
+        key: torch.Tensor,  # [batch_size, seq_len, num_kv_heads*head_size]
+        head_size: int,
+        cos_sin_cache: torch.Tensor,  # [cos_sin_dim, rot_dim]
+        is_neox: bool,
+    ) -> None:
+        if positions.dim() == 1:
+            positions = positions.unsqueeze(0)
+            query = query.unsqueeze(0)
+            key = key.unsqueeze(0)
+
+        rotary_dim = cos_sin_cache.size(1)
+        query = query.view(*query.shape[:-1], -1, head_size)
+        key = key.view(*key.shape[:-1], -1, head_size)
+
+        query_rot = query[..., :rotary_dim]
+        key_rot = key[..., :rotary_dim]
+
+        cos_sin = cos_sin_cache[positions.long()]
+        cos, sin = cos_sin.chunk(2, dim=-1)
+
+        if is_neox:
+            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
+            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
+        else:
+            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
+            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
+        ipex.llm.functional.rotary_embedding(query_rot, key_rot, sin, cos,
+                                             rotary_dim, is_neox, positions)
+
+    def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor,
+                                 key: torch.Tensor, head_size: int,
+                                 cos_sin_cache: torch.Tensor, is_neox: bool,
+                                 rot_dim: int,
+                                 cos_sin_cache_offsets: torch.Tensor) -> None:
+        if positions.dim() == 1:
+            positions = positions.unsqueeze(0)
+            query = query.unsqueeze(0)
+            key = key.unsqueeze(0)
+        cos_sin_cache_offsets = cos_sin_cache_offsets.view_as(positions)
+        rotary_dim = cos_sin_cache.size(1)
+        query = query.view(*query.shape[:-1], -1, head_size)
+        key = key.view(*key.shape[:-1], -1, head_size)
+
+        query_rot = query[..., :rotary_dim]
+        key_rot = key[..., :rotary_dim]
+
+        cos_sin = cos_sin_cache[torch.add(positions,
+                                          cos_sin_cache_offsets).long()]
+        cos, sin = cos_sin.chunk(2, dim=-1)
+
+        if is_neox:
+            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
+            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
+        else:
+            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
+            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
+
+        ipex.llm.functional.rotary_embedding(query_rot, key_rot, sin, cos,
+                                             rotary_dim, is_neox, positions)
+
+    def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor,
+                 epsilon: float) -> None:
+        tmp = ipex.llm.functional.rms_norm(input, weight, epsilon)
+        out.copy_(tmp)
+
+    def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
+                           weight: torch.Tensor, epsilon: float) -> None:
+        tmp = ipex.llm.functional.add_rms_norm(residual, input, weight, None,
+                                               epsilon, True)
+        input.copy_(tmp)
+
+    def varlen_attention(
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        out: torch.Tensor,
+        seqlen_q: torch.Tensor,
+        seqlen_k: torch.Tensor,
+        max_seqlen_q: int,
+        max_seqlen_k: int,
+        pdropout: float,
+        softmax_scale: float,
+        zero_tensors: bool,
+        is_causal: bool,
+        return_softmax: bool,
+        gen_: torch.Generator,
+    ) -> None:
+        ipex.llm.functional.varlen_attention(query, key, value, out, seqlen_q,
+                                             seqlen_k, max_seqlen_q,
+                                             max_seqlen_k, pdropout,
+                                             softmax_scale, zero_tensors,
+                                             is_causal, return_softmax, gen_)
+
+    def reshape_and_cache(
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        slot_mapping: torch.Tensor,
+        kv_cache_dtype: str,
+        kv_scale: float,
+    ) -> None:
+        assert kv_cache_dtype == "auto"
+        ipex.llm.modules.PagedAttention.reshape_and_cache(
+            key, value, key_cache, value_cache, slot_mapping)
+
+    @staticmethod
+    def copy_blocks(key_caches: List[torch.Tensor],
+                    value_caches: List[torch.Tensor],
+                    block_mapping: torch.Tensor) -> None:
+        torch.xpu.copy_blocks(key_caches, value_caches, block_mapping)
+
+    def swap_blocks(src: torch.Tensor, dst: torch.Tensor,
+                    block_mapping: torch.Tensor) -> None:
+        torch.xpu.swap_blocks(src, dst, block_mapping)
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
new file mode 100644
index 000000000..f09b24f2a
--- /dev/null
+++ b/vllm/attention/backends/ipex_attn.py
@@ -0,0 +1,355 @@
+""" Attention layer with torch scaled_dot_product_attention
+    and PagedAttention."""
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Type
+
+import torch
+
+from vllm._ipex_ops import ipex_ops
+from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
+                                              AttentionMetadata)
+from vllm.attention.ops.paged_attn import (PagedAttention,
+                                           PagedAttentionMetadata)
+
+_PARTITION_SIZE = 512
+
+
+class IpexAttnBackend(AttentionBackend):
+
+    @staticmethod
+    def get_name() -> str:
+        return "ipex-attn"
+
+    @staticmethod
+    def get_impl_cls() -> Type["IpexAttnBackendImpl"]:
+        return IpexAttnBackendImpl
+
+    @staticmethod
+    def make_metadata(*args, **kwargs) -> "IpexAttnMetadata":
+        return IpexAttnMetadata(*args, **kwargs)
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> Tuple[int, ...]:
+        return PagedAttention.get_kv_cache_shape(num_blocks, block_size,
+                                                 num_kv_heads, head_size)
+
+    @staticmethod
+    def swap_blocks(
+        src_kv_cache: torch.Tensor,
+        dst_kv_cache: torch.Tensor,
+        src_to_dst: torch.Tensor,
+    ) -> None:
+        PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
+
+    @staticmethod
+    def copy_blocks(
+        kv_caches: List[torch.Tensor],
+        src_to_dists: torch.Tensor,
+    ) -> None:
+        PagedAttention.copy_blocks(kv_caches, src_to_dists)
+
+
+@dataclass
+class IpexAttnMetadata(AttentionMetadata, PagedAttentionMetadata):
+    """Metadata for IpexAttnBackend.
+    """
+    # Currently, input sequences can only contain all prompts
+    # or all decoding. True if all sequences are prompts.
+    is_prompt: bool
+    slot_mapping: torch.Tensor
+    seq_lens: Optional[List[int]]
+    seqlen_q: Optional[torch.Tensor]
+    max_seqlen: Optional[int]
+
+    def __post_init__(self):
+        # Set during the execution of the first attention op.
+        # It is a list because it is needed to set per prompt
+        # when alibi slopes is used. It is because of the limitation
+        # from xformer API.
+        # will not appear in the __repr__ and __init__
+        self.attn_bias: Optional[List[torch.Tensor]] = None
+
+    @property
+    def prefill_metadata(self) -> Optional["IpexAttnMetadata"]:
+        # Currently chunked prefill is not supported
+        if self.num_decode_tokens == 0:
+            assert self.num_prefills > 0
+            return self
+
+        return None
+
+    @property
+    def decode_metadata(self) -> Optional["IpexAttnMetadata"]:
+        # Currently chunked prefill is not supported
+        if self.num_prefills > 0:
+            assert self.num_decode_tokens == 0
+            return None
+
+        return self
+
+
+class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: Optional[List[float]],
+        sliding_window: Optional[int],
+        kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        assert blocksparse_params is None, ValueError(
+            "Torch SPDA does not support block-sparse attention.")
+        self.num_heads = num_heads
+        self.head_size = head_size
+        self.scale = float(scale)
+        self.num_kv_heads = num_kv_heads
+        if alibi_slopes is not None:
+            alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
+        self.alibi_slopes = alibi_slopes
+        self.sliding_window = sliding_window
+        self.kv_cache_dtype = kv_cache_dtype
+
+        assert self.num_heads % self.num_kv_heads == 0
+        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
+        self.need_mask = (self.alibi_slopes is not None
+                          or self.sliding_window is not None)
+
+        supported_head_sizes = PagedAttention.get_supported_head_sizes()
+        if head_size not in supported_head_sizes:
+            raise ValueError(
+                f"Head size {head_size} is not supported by PagedAttention. "
+                f"Supported head sizes are: {supported_head_sizes}.")
+        if kv_cache_dtype != "auto":
+            raise NotImplementedError(
+                "IPEX backend does not support FP8 KV cache. "
+                "Please use xFormers backend instead.")
+
+    def split_kv_cache(
+        self,
+        kv_cache: torch.Tensor,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        x = 1
+        num_blocks = kv_cache.shape[1]
+
+        key_cache = kv_cache[0]
+        key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x,
+                                   -1, x)
+        value_cache = kv_cache[1]
+        value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1)
+        return key_cache, value_cache
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kv_cache: Optional[torch.Tensor],
+        attn_metadata: IpexAttnMetadata,  # type: ignore
+        kv_scale: float = 1.0,
+    ) -> torch.Tensor:
+        """Forward pass with IPEX varlen_attention and PagedAttention.
+
+        Args:
+            query: shape = [num_tokens, num_heads * head_size]
+            key: shape = [num_tokens, num_kv_heads * head_size]
+            value: shape = [num_tokens, num_kv_heads * head_size]
+            kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
+            attn_metadata: Metadata for attention.
+        Returns:
+            shape = [num_tokens, num_heads * head_size]
+        """
+        assert kv_scale == 1.0
+        num_tokens, hidden_size = query.shape
+        # Reshape the query, key, and value tensors.
+        query = query.view(-1, self.num_heads, self.head_size)
+        key = key.view(-1, self.num_kv_heads, self.head_size)
+        value = value.view(-1, self.num_kv_heads, self.head_size)
+
+        if kv_cache is not None:
+            key_cache, value_cache = self.split_kv_cache(
+                kv_cache, self.num_kv_heads, self.head_size)
+            ipex_ops.reshape_and_cache(
+                key,
+                value,
+                key_cache,
+                value_cache,
+                attn_metadata.slot_mapping.flatten(),
+                self.kv_cache_dtype,
+                kv_scale,
+            )
+
+        if attn_metadata.is_prompt:
+            assert attn_metadata.seq_lens is not None
+            if (kv_cache is None or attn_metadata.block_tables.numel() == 0):
+                if self.num_kv_heads != self.num_heads:
+                    key = key.repeat_interleave(self.num_queries_per_kv, dim=1)
+                    value = value.repeat_interleave(self.num_queries_per_kv,
+                                                    dim=1)
+
+                if attn_metadata.attn_bias is None:
+                    if self.alibi_slopes is not None:
+                        att_masks = _make_alibi_bias(
+                            self.alibi_slopes, query.dtype,
+                            attn_metadata.seq_lens)  # type: ignore
+                    elif self.sliding_window is not None:
+                        att_masks = _make_sliding_window_bias(
+                            attn_metadata.seq_lens, self.sliding_window,
+                            query.dtype)  # type: ignore
+                    else:
+                        att_masks = _make_sliding_window_bias(
+                            attn_metadata.seq_lens, None, dtype=query.dtype)
+                    attn_metadata.attn_bias = att_masks
+
+                output = torch.empty(
+                    (num_tokens, self.num_heads, self.head_size),
+                    dtype=query.dtype,
+                    device=query.device)
+                ipex_ops.varlen_attention(query,
+                                          key,
+                                          value,
+                                          output,
+                                          attn_metadata.seqlen_q,
+                                          attn_metadata.seqlen_q,
+                                          attn_metadata.max_seqlen,
+                                          attn_metadata.max_seqlen,
+                                          pdropout=0.0,
+                                          softmax_scale=self.scale,
+                                          zero_tensors=False,
+                                          is_causal=True,
+                                          return_softmax=False,
+                                          gen_=None)
+            else:
+                # prefix-enabled attention
+                raise RuntimeError(
+                    "IPEX backend doesn't support prefix decoding.")
+
+        else:
+            # Decoding run.
+            max_seq_len = attn_metadata.max_decode_seq_len
+            output = torch.empty_like(query)
+            block_size = value_cache.shape[3]
+            num_seqs, num_heads, head_size = query.shape
+            max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) //
+                                  _PARTITION_SIZE)
+            # NOTE(woosuk): We use a simple heuristic to decide whether to use
+            # PagedAttention V1 or V2. If the number of partitions is 1, we use
+            # V1 to avoid the overhead of reduction. Also, if the number of
+            # sequences or heads is large, we use V1 since there is enough work
+            # to parallelize.
+            # TODO(woosuk): Tune this heuristic.
+            # For context len > 8192, use V2 kernel to avoid shared memory
+            # shortage.
+            use_v1 = (max_seq_len <= 8192 and
+                      (max_num_partitions == 1 or num_seqs * num_heads > 512))
+            if use_v1:
+                # Run PagedAttention V1.
+                ipex_ops.paged_attention_v1(
+                    output,
+                    query,
+                    key_cache,
+                    value_cache,
+                    self.num_kv_heads,
+                    self.scale,
+                    attn_metadata.block_tables,
+                    attn_metadata.seq_lens_tensor,
+                    block_size,
+                    max_seq_len,
+                    self.alibi_slopes,
+                    self.kv_cache_dtype,
+                    kv_scale,
+                )
+            else:
+                # Run PagedAttention V2.
+                assert _PARTITION_SIZE % block_size == 0
+                tmp_output = torch.empty(
+                    size=(num_seqs, num_heads, max_num_partitions, head_size),
+                    dtype=output.dtype,
+                    device=output.device,
+                )
+                exp_sums = torch.empty(
+                    size=(num_seqs, num_heads, max_num_partitions),
+                    dtype=torch.float32,
+                    device=output.device,
+                )
+                max_logits = torch.empty_like(exp_sums)
+                ipex_ops.paged_attention_v2(
+                    output,
+                    exp_sums,
+                    max_logits,
+                    tmp_output,
+                    query,
+                    key_cache,
+                    value_cache,
+                    self.num_kv_heads,
+                    self.scale,
+                    attn_metadata.block_tables,
+                    attn_metadata.seq_lens_tensor,
+                    block_size,
+                    max_seq_len,
+                    self.alibi_slopes,
+                    self.kv_cache_dtype,
+                    kv_scale,
+                )
+
+            # Reshape the output tensor.
+        return output.view(-1, self.num_heads * self.head_size)
+
+
+def _make_alibi_bias(
+    alibi_slopes: torch.Tensor,
+    dtype: torch.dtype,
+    seq_lens: List[int],
+) -> List[torch.Tensor]:
+    attn_biases = []
+    for seq_len in seq_lens:
+        bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device)
+        # NOTE(zhuohan): HF uses
+        #     `bias = bias[None, :].repeat(seq_len, 1)`
+        # here. We find that both biases give the same results, but
+        # the bias below more accurately follows the original ALiBi
+        # paper.
+        bias = bias[None, :] - bias[:, None]
+
+        num_heads = alibi_slopes.shape[0]
+        bias = bias[None, :].repeat((num_heads, 1, 1))
+        bias.mul_(alibi_slopes[:, None, None])
+        inf_mask = torch.empty(
+            (1, seq_len, seq_len),
+            dtype=bias.dtype,
+            device=alibi_slopes.device).fill_(-torch.inf).triu_(diagonal=1)
+        attn_biases.append((bias + inf_mask).to(dtype))
+
+    return attn_biases
+
+
+def _make_sliding_window_bias(
+    seq_lens: List[int],
+    window_size: Optional[int],
+    dtype: torch.dtype,
+) -> List[torch.Tensor]:
+    attn_biases = []
+    for seq_len in seq_lens:
+        tensor = torch.full(
+            (1, seq_len, seq_len),
+            dtype=dtype,
+            fill_value=1,
+        )
+        shift = 0
+        mask = torch.tril(tensor, diagonal=shift).to(dtype)  # type: ignore
+        if window_size is not None:
+            mask = torch.triu(mask, diagonal=shift - window_size + 1)
+        mask = torch.log(mask)
+        attn_biases.append(mask.to(dtype))
+
+    return attn_biases
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 8b07fb2d7..1d56d87cc 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -7,7 +7,7 @@ import torch
 import vllm.envs as envs
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.logger import init_logger
-from vllm.utils import is_cpu, is_hip, is_tpu
+from vllm.utils import is_cpu, is_hip, is_tpu, is_xpu
 
 logger = init_logger(__name__)
 
@@ -19,6 +19,7 @@ class _Backend(enum.Enum):
     TORCH_SDPA = enum.auto()
     FLASHINFER = enum.auto()
     PALLAS = enum.auto()
+    IPEX = enum.auto()
 
 
 @lru_cache(maxsize=None)
@@ -58,12 +59,17 @@ def get_attn_backend(
             ROCmFlashAttentionBackend)
         return ROCmFlashAttentionBackend
     elif backend == _Backend.TORCH_SDPA:
-        # TODO: make XPU backend available here.
         assert is_cpu(), RuntimeError(
             "Torch SDPA backend is only used for the CPU device.")
         logger.info("Using Torch SDPA backend.")
         from vllm.attention.backends.torch_sdpa import TorchSDPABackend
         return TorchSDPABackend
+    elif backend == _Backend.IPEX:
+        assert is_xpu(), RuntimeError(
+            "IPEX attention backend is only used for the XPU device.")
+        logger.info("Using IPEX attention backend.")
+        from vllm.attention.backends.ipex_attn import IpexAttnBackend
+        return IpexAttnBackend
     elif backend == _Backend.FLASHINFER:
         logger.info("Using Flashinfer backend.")
         logger.warning("Eager mode is required for the Flashinfer backend. "
@@ -107,6 +113,11 @@ def which_attn_to_use(
             logger.info("Cannot use %s backend on CPU.", selected_backend)
         return _Backend.TORCH_SDPA
 
+    if is_xpu():
+        if selected_backend != _Backend.IPEX:
+            logger.info("Cannot use %s backend on XPU.", selected_backend)
+        return _Backend.IPEX
+
     if is_tpu():
         if selected_backend != _Backend.PALLAS:
             logger.info("Cannot use %s backend on TPU.", selected_backend)
diff --git a/vllm/config.py b/vllm/config.py
index 552d5033f..b1a3a82f5 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -12,7 +12,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.model_executor.models import ModelRegistry
 from vllm.transformers_utils.config import get_config, get_hf_text_config
 from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu,
-                        is_hip, is_neuron, is_tpu)
+                        is_hip, is_neuron, is_tpu, is_xpu)
 
 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
@@ -757,6 +757,8 @@ class DeviceConfig:
                 self.device_type = "tpu"
             elif is_cpu():
                 self.device_type = "cpu"
+            elif is_xpu():
+                self.device_type = "xpu"
             else:
                 # We don't call torch.cuda.is_available() here to
                 # avoid initializing CUDA before workers are forked
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 16c5297af..02b0dcbcb 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -58,7 +58,7 @@ def _split_tensor_dict(
             # because it contains not only the device type but also the device
             # index (e.g. "cuda:0"). We only need the device type.
             # receiving side will set the device index.
-            device = "cpu" if value.is_cpu else "cuda"
+            device = value.device.type
             metadata_list.append(
                 (key, TensorMetadata(device, value.dtype, value.size())))
             tensor_list.append(value)
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index ba53b5c86..9d04f1dc5 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -501,11 +501,12 @@ class EngineArgs:
                   'Enabling this will use the fully sharded layers. '
                   'At high sequence length, max rank or '
                   'tensor parallel size, this is likely faster.'))
-        parser.add_argument("--device",
-                            type=str,
-                            default=EngineArgs.device,
-                            choices=["auto", "cuda", "neuron", "cpu", "tpu"],
-                            help='Device type for vLLM execution.')
+        parser.add_argument(
+            "--device",
+            type=str,
+            default=EngineArgs.device,
+            choices=["auto", "cuda", "neuron", "cpu", "tpu", "xpu"],
+            help='Device type for vLLM execution.')
 
         # Related to Vision-language models such as llava
         parser = EngineArgs.add_cli_args_for_vlm(parser)
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 03b6d03a9..ab312850b 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -383,6 +383,17 @@ class AsyncLLMEngine:
                 "Distributed execution is not supported with the CPU backend.")
             from vllm.executor.cpu_executor import CPUExecutorAsync
             executor_class = CPUExecutorAsync
+        elif engine_config.device_config.device_type == "xpu":
+            if distributed_executor_backend is None:
+                from vllm.executor.xpu_executor import XPUExecutorAsync
+                executor_class = XPUExecutorAsync
+            elif distributed_executor_backend == "ray":
+                initialize_ray_cluster(engine_config.parallel_config)
+                from vllm.executor.ray_xpu_executor import RayXPUExecutorAsync
+                executor_class = RayXPUExecutorAsync
+            else:
+                raise RuntimeError(
+                    "Not supported distributed execution model on XPU device.")
         elif distributed_executor_backend == "ray":
             initialize_ray_cluster(engine_config.parallel_config)
             from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index fd64337d4..eed9a17e4 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -347,6 +347,14 @@ class LLMEngine:
         elif engine_config.device_config.device_type == "cpu":
             from vllm.executor.cpu_executor import CPUExecutor
             executor_class = CPUExecutor
+        elif engine_config.device_config.device_type == "xpu":
+            if distributed_executor_backend == "ray":
+                initialize_ray_cluster(engine_config.parallel_config)
+                from vllm.executor.ray_xpu_executor import RayXPUExecutor
+                executor_class = RayXPUExecutor
+            else:
+                from vllm.executor.xpu_executor import XPUExecutor
+                executor_class = XPUExecutor
         elif distributed_executor_backend == "ray":
             initialize_ray_cluster(engine_config.parallel_config)
             from vllm.executor.ray_gpu_executor import RayGPUExecutor
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 4704f5f1b..495fddd17 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -3,7 +3,7 @@ from typing import List, Optional, Tuple
 
 from vllm.config import ParallelConfig
 from vllm.logger import init_logger
-from vllm.utils import get_ip, is_hip
+from vllm.utils import get_ip, is_hip, is_xpu
 from vllm.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
@@ -71,7 +71,7 @@ def initialize_ray_cluster(
             "serving.")
 
     # Connect to a ray cluster.
-    if is_hip():
+    if is_hip() or is_xpu():
         ray.init(address=ray_address,
                  ignore_reinit_error=True,
                  num_gpus=parallel_config.world_size)
diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py
new file mode 100644
index 000000000..dd7c82289
--- /dev/null
+++ b/vllm/executor/ray_xpu_executor.py
@@ -0,0 +1,401 @@
+import asyncio
+import os
+import pickle
+from collections import defaultdict
+from itertools import islice, repeat
+from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Set,
+                    Tuple, Union)
+
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+                         ModelConfig, ParallelConfig, SchedulerConfig,
+                         SpeculativeConfig, VisionLanguageConfig)
+from vllm.executor.distributed_gpu_executor import (  # yapf: disable
+    DistributedGPUExecutor, DistributedGPUExecutorAsync)
+from vllm.executor.ray_utils import RayWorkerWrapper, ray
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        make_async)
+
+if ray is not None:
+    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+
+if TYPE_CHECKING:
+    from ray.util.placement_group import PlacementGroup
+
+logger = init_logger(__name__)
+
+# If the env var is set, it uses the Ray's compiled DAG API
+# which optimizes the control plane overhead.
+# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
+USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0))
+
+
+class RayXPUExecutor(DistributedGPUExecutor):
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        cache_config: CacheConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        load_config: LoadConfig,
+        lora_config: Optional[LoRAConfig],
+        vision_language_config: Optional[VisionLanguageConfig],
+        speculative_config: Optional[SpeculativeConfig],
+    ) -> None:
+        assert device_config.device_type == "xpu"
+        assert (not speculative_config
+                ), "Speculative decoding not yet supported for XPU backend"
+
+        self.model_config = model_config
+        self.cache_config = cache_config
+        self.load_config = load_config
+        self.lora_config = lora_config
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+        self.device_config = device_config
+        self.vision_language_config = vision_language_config
+
+        placement_group = self.parallel_config.placement_group
+
+        # Disable Ray usage stats collection.
+        ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
+        if ray_usage != "1":
+            os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
+
+        # Create the parallel GPU workers.
+        self._init_workers_ray(placement_group)
+
+        # Profile the memory usage and initialize the cache.
+        self.forward_dag = None
+        if USE_RAY_COMPILED_DAG:
+            self.forward_dag = self._compiled_ray_dag()
+
+        # This is non-None when the execute model loop is running
+        # in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
+        self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
+        # Updated by implementations that require additional args to be passed
+        # to the _run_workers execute_model call
+        self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {}
+
+    def _init_executor(self) -> None:
+        pass
+
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Determine the number of available KV blocks.
+
+        This invokes `determine_num_available_blocks` on each worker and takes
+        the min of the results, guaranteeing that the selected cache sizes are
+        compatible with all workers.
+
+        Returns:
+            - Tuple[num_gpu_blocks, num_cpu_blocks]
+        """
+        # Get the maximum number of blocks that can be allocated on GPU and CPU.
+        num_blocks = self._run_workers("determine_num_available_blocks", )
+
+        # Since we use a shared centralized controller, we take the minimum
+        # number of blocks across all workers to make sure all the memory
+        # operators can be applied to all workers.
+        num_gpu_blocks = min(b[0] for b in num_blocks)
+        num_cpu_blocks = min(b[1] for b in num_blocks)
+
+        return num_gpu_blocks, num_cpu_blocks
+
+    def _init_workers_ray(self, placement_group: "PlacementGroup",
+                          **ray_remote_kwargs):
+        if self.parallel_config.tensor_parallel_size == 1:
+            # For single GPU case, we use a ray worker with constrained memory.
+            num_gpus = self.cache_config.gpu_memory_utilization
+        else:
+            # Otherwise, the ray workers are allocated with a full GPU.
+            num_gpus = 1
+
+        # The driver dummy worker does not actually use any resources.
+        # It holds the resource for the driver worker.
+        self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
+        # The remaining workers are the actual ray actors.
+        self.workers: List[RayWorkerWrapper] = []
+
+        # Create the workers.
+        driver_ip = get_ip()
+        for bundle_id, bundle in enumerate(placement_group.bundle_specs):
+            if not bundle.get("GPU", 0):
+                continue
+            scheduling_strategy = PlacementGroupSchedulingStrategy(
+                placement_group=placement_group,
+                placement_group_capture_child_tasks=True,
+                placement_group_bundle_index=bundle_id,
+            )
+            worker = ray.remote(
+                num_cpus=0,
+                num_gpus=num_gpus,
+                scheduling_strategy=scheduling_strategy,
+                **ray_remote_kwargs,
+            )(RayWorkerWrapper).remote(
+                worker_module_name="vllm.worker.xpu_worker",
+                worker_class_name="XPUWorker",
+                trust_remote_code=self.model_config.trust_remote_code,
+            )
+
+            worker_ip = ray.get(worker.get_node_ip.remote())
+            if worker_ip == driver_ip and self.driver_dummy_worker is None:
+                # If the worker is on the same node as the driver, we use it
+                # as the resource holder for the driver process.
+                self.driver_dummy_worker = worker
+                self.driver_worker = RayWorkerWrapper(
+                    worker_module_name="vllm.worker.xpu_worker",
+                    worker_class_name="XPUWorker",
+                    trust_remote_code=self.model_config.trust_remote_code,
+                )
+            else:
+                # Else, added to the list of workers.
+                self.workers.append(worker)
+        if self.driver_dummy_worker is None:
+            raise ValueError(
+                "Ray does not allocate any GPUs on the driver node. Consider "
+                "adjusting the Ray placement group or running the driver on a "
+                "GPU node.")
+
+        # Get the set of GPU IDs used on each node.
+        worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
+                                                    use_dummy_driver=True)
+
+        node_workers = defaultdict(list)
+        node_gpus = defaultdict(list)
+
+        for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
+            node_workers[node_id].append(i)
+            node_gpus[node_id].extend(gpu_ids)
+        for node_id, gpu_ids in node_gpus.items():
+            node_gpus[node_id] = sorted(gpu_ids)
+
+        # TODO: add env var for xpu
+
+        distributed_init_method = get_distributed_init_method(
+            driver_ip, get_open_port())
+
+        def collect_arg_helper_func(**kwargs):
+            # avoid writing `{"name": value}` manually
+            return kwargs
+
+        init_worker_all_kwargs = []
+
+        # Initialize the actual workers inside worker wrapper.
+        for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids, ):
+            local_rank = node_workers[node_id].index(rank)
+            init_worker_all_kwargs.append(
+                collect_arg_helper_func(
+                    model_config=self.model_config,
+                    parallel_config=self.parallel_config,
+                    scheduler_config=self.scheduler_config,
+                    device_config=self.device_config,
+                    cache_config=self.cache_config,
+                    load_config=self.load_config,
+                    local_rank=local_rank,
+                    rank=rank,
+                    distributed_init_method=distributed_init_method,
+                    lora_config=self.lora_config,
+                    vision_language_config=self.vision_language_config,
+                    is_driver_worker=rank == 0,
+                ))
+        self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
+
+        self._run_workers("init_device")
+        self._run_workers(
+            "load_model",
+            max_concurrent_workers=self.parallel_config.
+            max_parallel_loading_workers,
+        )
+
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        """Initialize the KV cache in all workers.
+        """
+
+        # NOTE: We log here to avoid multiple logs when number of workers is
+        # greater than one. We could log in the engine, but not all executors
+        # have GPUs.
+        logger.info("# GPU blocks: %d, "
+                    "# CPU blocks: %d", num_gpu_blocks, num_cpu_blocks)
+
+        self.cache_config.num_gpu_blocks = num_gpu_blocks
+        self.cache_config.num_cpu_blocks = num_cpu_blocks
+
+        self._run_workers("initialize_cache",
+                          num_gpu_blocks=num_gpu_blocks,
+                          num_cpu_blocks=num_cpu_blocks)
+
+    def _driver_execute_model(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None
+    ) -> List[SamplerOutput]:
+        """Run execute_model in the driver worker.
+
+        Passing None will cause the driver to stop the model execution
+        loop running in each of the remote workers.
+        """
+        return self.driver_worker.execute_method("execute_model",
+                                                 execute_model_req)
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
+        return self._run_workers(
+            "add_lora",
+            lora_request=lora_request,
+        )
+
+    def remove_lora(self, lora_id: int) -> bool:
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return self._run_workers(
+            "remove_lora",
+            lora_id=lora_id,
+        )
+
+    def list_loras(self) -> Set[int]:
+        return self._run_workers("list_loras")
+
+    def _run_workers(
+        self,
+        method: str,
+        *args,
+        async_run_remote_workers_only: bool = False,
+        all_args: Optional[List[Tuple[Any, ...]]] = None,
+        all_kwargs: Optional[List[Dict[str, Any]]] = None,
+        use_dummy_driver: bool = False,
+        max_concurrent_workers: Optional[int] = None,
+        use_ray_compiled_dag: bool = False,
+        **kwargs,
+    ) -> Any:
+        """Runs the given method on all workers. Can be used in the following
+        ways:
+
+        - args/kwargs: All workers share the same args/kwargs
+        - args/kwargs and driver_args/driver_kwargs: Driver worker has
+          different args
+        - all_args/all_kwargs: args/kwargs for each worker are specified
+          individually
+        """
+
+        if max_concurrent_workers:
+            raise NotImplementedError(
+                "max_concurrent_workers is not supported yet.")
+
+        count = len(self.workers)
+        all_worker_args = repeat(args, count) if all_args is None \
+            else islice(all_args, 1, None)
+        all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
+            else islice(all_kwargs, 1, None)
+
+        if use_ray_compiled_dag:
+            # Right now, compiled DAG can only accept a single
+            # input. TODO(sang): Fix it.
+            assert self.forward_dag is not None
+            output_channels = self.forward_dag.execute(1)
+        else:
+            # Start the ray workers first.
+            ray_worker_outputs = [
+                worker.execute_method.remote(method, *worker_args,
+                                             **worker_kwargs)
+                for (worker, worker_args, worker_kwargs
+                     ) in zip(self.workers, all_worker_args, all_worker_kwargs)
+            ]
+        if async_run_remote_workers_only:
+            # Just return futures
+            return ray_worker_outputs
+
+        driver_args = args if all_args is None else all_args[0]
+        driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
+
+        # Start the driver worker after all the ray workers.
+        if not use_dummy_driver:
+            driver_worker_output = self.driver_worker.execute_method(
+                method, *driver_args, **driver_kwargs)
+        else:
+            assert self.driver_dummy_worker is not None
+            driver_worker_output = ray.get(
+                self.driver_dummy_worker.execute_method.remote(
+                    method, *driver_args, **driver_kwargs))
+        # Get the results of the ray workers.
+        if self.workers:
+            if use_ray_compiled_dag:
+                try:
+                    ray_worker_outputs = [
+                        pickle.loads(chan.begin_read())
+                        for chan in output_channels
+                    ]
+                finally:
+                    # Has to call end_read in order to reuse the DAG.
+                    for chan in output_channels:
+                        chan.end_read()
+            else:
+                ray_worker_outputs = ray.get(ray_worker_outputs)
+
+        return [driver_worker_output] + ray_worker_outputs
+
+    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
+        """Wait for futures returned from _run_workers() with
+        async_run_remote_workers_only to complete."""
+        ray.get(parallel_worker_tasks)
+
+    def _compiled_ray_dag(self):
+        import pkg_resources
+        required_version = "2.9"
+        current_version = pkg_resources.get_distribution("ray").version
+        if current_version < required_version:
+            raise ValueError(f"Ray version {required_version} or greater is "
+                             f"required, but found {current_version}")
+
+        from ray.dag import InputNode, MultiOutputNode
+        assert self.parallel_config.worker_use_ray
+
+        # Right now, compiled DAG requires at least 1 arg. We send
+        # a dummy value for now. It will be fixed soon.
+        with InputNode() as input_data:
+            forward_dag = MultiOutputNode([
+                worker.execute_model_compiled_dag_remote.
+                bind(  # type: ignore[attr-defined]
+                    input_data) for worker in self.workers
+            ])
+        return forward_dag.experimental_compile()
+
+    def check_health(self) -> None:
+        """Raises an error if engine is unhealthy."""
+        self._check_if_any_actor_is_dead()
+
+    def _check_if_any_actor_is_dead(self):
+        if not self.workers:
+            return
+
+        dead_actors = []
+        for actor in self.workers:
+            actor_state = ray.state.actors(actor._ray_actor_id.hex())  # pylint: disable=protected-access
+            if actor_state["State"] == "DEAD":
+                dead_actors.append(actor)
+        if dead_actors:
+            raise RuntimeError("At least one Worker is dead. "
+                               f"Dead Workers: {dead_actors}. ")
+
+
+class RayXPUExecutorAsync(RayXPUExecutor, DistributedGPUExecutorAsync):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.driver_exec_method = make_async(self.driver_worker.execute_method)
+
+    async def _driver_execute_model_async(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None
+    ) -> List[SamplerOutput]:
+        return await self.driver_exec_method("execute_model",
+                                             execute_model_req)
+
+    async def _start_worker_execution_loop(self):
+        coros = [
+            worker.execute_method.remote("start_worker_execution_loop")
+            for worker in self.workers
+        ]
+        return await asyncio.gather(*coros)
diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py
new file mode 100644
index 000000000..d37200bd0
--- /dev/null
+++ b/vllm/executor/xpu_executor.py
@@ -0,0 +1,98 @@
+from typing import List, Optional
+
+import torch
+
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+                         ModelConfig, ParallelConfig, SchedulerConfig,
+                         SpeculativeConfig, VisionLanguageConfig)
+from vllm.executor.executor_base import ExecutorAsyncBase
+from vllm.executor.gpu_executor import GPUExecutor
+from vllm.logger import init_logger
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.utils import make_async
+from vllm.worker.worker_base import WorkerWrapperBase
+
+logger = init_logger(__name__)
+
+
+class XPUExecutor(GPUExecutor):
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        cache_config: CacheConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        load_config: LoadConfig,
+        lora_config: Optional[LoRAConfig],
+        vision_language_config: Optional[VisionLanguageConfig],
+        speculative_config: Optional[SpeculativeConfig],
+    ) -> None:
+        assert device_config.device_type == "xpu"
+        assert (not speculative_config
+                ), "Speculative decoding not yet supported for XPU backend"
+
+        model_config = _verify_and_get_model_config(model_config)
+
+        self.model_config = model_config
+        self.cache_config = cache_config
+        self.load_config = load_config
+        self.lora_config = lora_config
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+        self.device_config = device_config
+        self.vision_language_config = vision_language_config
+        self.speculative_config = None
+
+        # Instantiate the worker and load the model to GPU.
+        self._init_executor()
+
+    def _create_worker(self,
+                       local_rank: int = 0,
+                       rank: int = 0,
+                       distributed_init_method: Optional[str] = None):
+        if self.speculative_config is None:
+            worker_module_name = "vllm.worker.xpu_worker"
+            worker_class_name = "XPUWorker"
+        else:
+            raise NotImplementedError(
+                "XPU does not support speculative decoding")
+
+        wrapper = WorkerWrapperBase(
+            worker_module_name=worker_module_name,
+            worker_class_name=worker_class_name,
+        )
+        wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank,
+                                                      distributed_init_method))
+        return wrapper.worker
+
+    def execute_model(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        output = self.driver_worker.execute_model(execute_model_req)
+        return output
+
+
+class XPUExecutorAsync(XPUExecutor, ExecutorAsyncBase):
+
+    async def execute_model_async(
+        self,
+        execute_model_req: ExecuteModelRequest,
+    ) -> List[SamplerOutput]:
+        output = await make_async(self.driver_worker.execute_model
+                                  )(execute_model_req=execute_model_req)
+        return output
+
+
+def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
+    if config.dtype == torch.bfloat16:
+        logger.warning(
+            "bfloat16 is not fully supported on XPU, casting to float16.")
+        config.dtype = torch.float16
+    if not config.enforce_eager:
+        logger.warning(
+            "CUDA graph is not supported on XPU, fallback to the eager "
+            "mode.")
+        config.enforce_eager = True
+    return config
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index 56aa629ae..0db72d8d9 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -1,6 +1,6 @@
 import torch.nn as nn
 
-from vllm.utils import is_cpu, is_hip, is_tpu
+from vllm.utils import is_cpu, is_hip, is_tpu, is_xpu
 
 
 class CustomOp(nn.Module):
@@ -29,9 +29,7 @@ class CustomOp(nn.Module):
         return self.forward_cuda(*args, **kwargs)
 
     def forward_xpu(self, *args, **kwargs):
-        # By default, we assume that XPU ops are compatible with CUDA ops.
-        # NOTE(woosuk): This is a placeholder for future extensions.
-        return self.forward_cuda(*args, **kwargs)
+        raise NotImplementedError
 
     def forward_cpu(self, *args, **kwargs):
         # By default, we assume that CPU ops are compatible with CUDA ops.
@@ -58,5 +56,7 @@ class CustomOp(nn.Module):
             return self.forward_cpu
         elif is_tpu():
             return self.forward_tpu
+        elif is_xpu():
+            return self.forward_xpu
         else:
             return self.forward_cuda
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 4d076421f..eb0606948 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -37,6 +37,15 @@ class SiluAndMul(CustomOp):
         ops.silu_and_mul(out, x)
         return out
 
+    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm._ipex_ops import ipex_ops as ops
+
+        d = x.shape[-1] // 2
+        output_shape = (x.shape[:-1] + (d, ))
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.silu_and_mul(out, x)
+        return out
+
 
 class GeluAndMul(CustomOp):
     """An activation function for GeGLU.
@@ -71,6 +80,18 @@ class GeluAndMul(CustomOp):
             ops.gelu_tanh_and_mul(out, x)
         return out
 
+    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm._ipex_ops import ipex_ops as ops
+
+        d = x.shape[-1] // 2
+        output_shape = (x.shape[:-1] + (d, ))
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        if self.approximate == "none":
+            ops.gelu_and_mul(out, x)
+        elif self.approximate == "tanh":
+            ops.gelu_tanh_and_mul(out, x)
+        return out
+
     def extra_repr(self) -> str:
         return f'approximate={repr(self.approximate)}'
 
@@ -90,6 +111,13 @@ class NewGELU(CustomOp):
         ops.gelu_new(out, x)
         return out
 
+    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm._ipex_ops import ipex_ops as ops
+
+        out = torch.empty_like(x)
+        ops.gelu_new(out, x)
+        return out
+
 
 class FastGELU(CustomOp):
 
@@ -105,6 +133,13 @@ class FastGELU(CustomOp):
         ops.gelu_fast(out, x)
         return out
 
+    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm._ipex_ops import ipex_ops as ops
+
+        out = torch.empty_like(x)
+        ops.gelu_fast(out, x)
+        return out
+
 
 class ScaledActivation(nn.Module):
     """An activation function with post-scale parameters.
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index 4533adf8f..14f5e2378 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -67,6 +67,30 @@ class RMSNorm(CustomOp):
         )
         return out
 
+    def forward_xpu(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        from vllm._ipex_ops import ipex_ops as ops
+
+        if residual is not None:
+            ops.fused_add_rms_norm(
+                x,
+                residual,
+                self.weight.data,
+                self.variance_epsilon,
+            )
+            return x, residual
+        out = torch.empty_like(x)
+        ops.rms_norm(
+            out,
+            x,
+            self.weight.data,
+            self.variance_epsilon,
+        )
+        return out
+
     def extra_repr(self) -> str:
         s = f"hidden_size={self.weight.data.size(0)}"
         s += f", eps={self.variance_epsilon}"
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 5a4940acb..9c0a74cda 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -221,6 +221,29 @@ class RotaryEmbedding(CustomOp):
                                  self.cos_sin_cache, self.is_neox_style)
         return query, key
 
+    def forward_xpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        from vllm._ipex_ops import ipex_ops as ops
+
+        self.cos_sin_cache = self.cos_sin_cache.to(positions.device,
+                                                   dtype=query.dtype)
+        # ops.rotary_embedding()/batched_rotary_embedding()
+        # are in-place operations that update the query and key tensors.
+        if offsets is not None:
+            ops.batched_rotary_embedding(positions, query, key, self.head_size,
+                                         self.cos_sin_cache,
+                                         self.is_neox_style, self.rotary_dim,
+                                         offsets)
+        else:
+            ops.rotary_embedding(positions, query, key, self.head_size,
+                                 self.cos_sin_cache, self.is_neox_style)
+        return query, key
+
     def forward_tpu(
         self,
         positions: torch.Tensor,
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index 60eb5b404..1a26c5c63 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -307,7 +307,7 @@ class VocabParallelEmbedding(torch.nn.Module):
         else:
             masked_input = input_
             # Get the embeddings.
-        output_parallel = F.embedding(masked_input, self.weight)
+        output_parallel = F.embedding(masked_input.long(), self.weight)
         # Mask the output embedding.
         if self.tp_size > 1:
             output_parallel.masked_fill_(input_mask.unsqueeze(1), 0)
diff --git a/vllm/utils.py b/vllm/utils.py
index 9b39ca77a..1adfa9218 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -160,6 +160,26 @@ def is_tpu() -> bool:
     return libtpu is not None
 
 
+@lru_cache(maxsize=None)
+def is_xpu() -> bool:
+    from importlib.metadata import version
+    is_xpu_flag = "xpu" in version("vllm")
+    # vllm is not build with xpu
+    if not is_xpu_flag:
+        return False
+    try:
+        import intel_extension_for_pytorch as ipex  # noqa: F401
+        _import_ipex = True
+    except ImportError as e:
+        logger.warning("Import Error for IPEX: %s", e.msg)
+        _import_ipex = False
+    # ipex dependency is not ready
+    if not _import_ipex:
+        logger.warning("not found ipex lib")
+        return False
+    return hasattr(torch, "xpu") and torch.xpu.is_available()
+
+
 @lru_cache(maxsize=None)
 def get_max_shared_memory_bytes(gpu: int = 0) -> int:
     """Returns the maximum shared memory per thread block in bytes."""
@@ -482,6 +502,9 @@ def is_pin_memory_available() -> bool:
         print_warning_once("Using 'pin_memory=False' as WSL is detected. "
                            "This may slow down the performance.")
         return False
+    elif is_xpu():
+        print_warning_once("Pin memory is not supported on XPU.")
+        return False
     elif is_neuron():
         print_warning_once("Pin memory is not supported on Neuron.")
         return False
@@ -497,8 +520,12 @@ class CudaMemoryProfiler:
 
     def current_memory_usage(self) -> float:
         # Return the memory usage in bytes.
-        torch.cuda.reset_peak_memory_stats(self.device)
-        mem = torch.cuda.max_memory_allocated(self.device)
+        if torch.cuda.is_available():
+            torch.cuda.reset_peak_memory_stats(self.device)
+            mem = torch.cuda.max_memory_allocated(self.device)
+        elif is_xpu():
+            torch.xpu.reset_peak_memory_stats(self.device)
+            mem = torch.xpu.max_memory_allocated(self.device)
         return mem
 
     def __enter__(self):
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py
index 341b177d4..fbd1343fe 100644
--- a/vllm/worker/cache_engine.py
+++ b/vllm/worker/cache_engine.py
@@ -4,7 +4,7 @@ from typing import List
 import torch
 
 from vllm.attention import get_attn_backend
-from vllm.config import CacheConfig, ModelConfig, ParallelConfig
+from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig
 from vllm.logger import init_logger
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size,
                         is_pin_memory_available)
@@ -25,10 +25,12 @@ class CacheEngine:
         cache_config: CacheConfig,
         model_config: ModelConfig,
         parallel_config: ParallelConfig,
+        device_config: DeviceConfig,
     ) -> None:
         self.cache_config = cache_config
         self.model_config = model_config
         self.parallel_config = parallel_config
+        self.device_config = device_config
 
         self.head_size = model_config.get_head_size()
         self.num_layers = model_config.get_num_layers(parallel_config)
@@ -55,7 +57,8 @@ class CacheEngine:
         )
 
         # Initialize the cache.
-        self.gpu_cache = self._allocate_kv_cache(self.num_gpu_blocks, "cuda")
+        self.gpu_cache = self._allocate_kv_cache(
+            self.num_gpu_blocks, self.device_config.device_type)
         self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks, "cpu")
 
     def _allocate_kv_cache(
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 7a378a862..f9b8a065a 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -205,7 +205,8 @@ class Worker(WorkerBase):
     def _init_cache_engine(self):
         assert self.cache_config.num_gpu_blocks is not None
         self.cache_engine = CacheEngine(self.cache_config, self.model_config,
-                                        self.parallel_config)
+                                        self.parallel_config,
+                                        self.device_config)
         self.gpu_cache = self.cache_engine.gpu_cache
 
     def _warm_up_model(self) -> None:
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
new file mode 100644
index 000000000..f30de703e
--- /dev/null
+++ b/vllm/worker/xpu_model_runner.py
@@ -0,0 +1,417 @@
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+
+from vllm.attention import get_attn_backend
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+                         ModelConfig, ParallelConfig, SchedulerConfig,
+                         VisionLanguageConfig)
+from vllm.distributed import broadcast_tensor_dict
+from vllm.logger import init_logger
+from vllm.model_executor.model_loader import get_model
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
+from vllm.utils import CudaMemoryProfiler, make_tensor_with_pad
+from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata
+
+logger = init_logger(__name__)
+
+_PAD_SLOT_ID = -1
+_BATCH_SIZE_ALIGNMENT = 8
+_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [
+    _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33)
+]
+
+
+class XPUModelRunner:
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        cache_config: CacheConfig,
+        load_config: LoadConfig,
+        lora_config: Optional[LoRAConfig],
+        vision_language_config: Optional[VisionLanguageConfig],
+        kv_cache_dtype: Optional[str] = "auto",
+        is_driver_worker: bool = False,
+        *args,
+        **kwargs,
+    ):
+        self.model_config = model_config
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+        self.lora_config = lora_config
+        self.load_config = load_config
+        self.cache_config = cache_config
+        self.vision_language_config = vision_language_config
+        self.is_driver_worker = is_driver_worker
+
+        self.sliding_window = model_config.get_sliding_window()
+        self.device_config = device_config
+        self.device = self.device_config.device
+
+        self.kv_cache_dtype = kv_cache_dtype
+        self.block_size = cache_config.block_size
+        self.max_context_len_to_capture = (
+            self.model_config.max_context_len_to_capture
+            if self.model_config is not None else 0)
+
+        self.attn_backend = get_attn_backend(
+            self.model_config.get_num_attention_heads(self.parallel_config),
+            self.model_config.get_head_size(),
+            self.model_config.get_num_kv_heads(self.parallel_config),
+            self.model_config.get_sliding_window(),
+            self.model_config.dtype,
+            self.kv_cache_dtype,
+            self.block_size,
+        )
+
+        # Lazy initialization.
+        self.model: nn.Module  # Set after init_Model
+
+    def load_model(self) -> None:
+        with CudaMemoryProfiler() as m:
+            self.model = get_model(
+                model_config=self.model_config,
+                device_config=self.device_config,
+                load_config=self.load_config,
+                lora_config=self.lora_config,
+                vision_language_config=self.vision_language_config,
+                parallel_config=self.parallel_config,
+                scheduler_config=self.scheduler_config,
+                cache_config=self.cache_config,
+            )
+
+        self.model_memory_usage = m.consumed_memory
+        logger.info("Loading model weights took %.4f GB",
+                    self.model_memory_usage / float(2**30))
+
+    @property
+    def vocab_size(self) -> int:
+        return self.model_config.get_vocab_size()
+
+    @torch.inference_mode()
+    def profile_run(self) -> None:
+        # Enable top-k sampling to reflect the accurate memory usage.
+        sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
+        max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
+        max_num_seqs = self.scheduler_config.max_num_seqs
+
+        # Profile memory usage with max_num_sequences sequences and the total
+        # number of tokens equal to max_num_batched_tokens.
+        seqs: List[SequenceGroupMetadata] = []
+        # Additional GPU memory may be needed for vision encoding, which needs
+        # to be accounted for when calculating the GPU blocks for
+        # vLLM blocker manager.
+        # To exercise the worst scenario for GPU memory consumption,
+        # the number of seqs (batch_size) is chosen to maximize the number
+        # of images processed.
+        for group_id in range(max_num_seqs):
+            seq_len = (max_num_batched_tokens // max_num_seqs +
+                       (group_id < max_num_batched_tokens % max_num_seqs))
+
+            seq_data = SequenceData([0] * seq_len)
+            dummy_multi_modal_data = None
+            seq = SequenceGroupMetadata(
+                request_id=str(group_id),
+                is_prompt=True,
+                seq_data={group_id: seq_data},
+                sampling_params=sampling_params,
+                block_tables=None,
+                lora_request=None,
+                multi_modal_data=dummy_multi_modal_data,
+            )
+            seqs.append(seq)
+
+        # Run the model with the dummy inputs.
+        num_layers = self.model_config.get_num_layers(self.parallel_config)
+        kv_caches = [None] * num_layers
+        self.execute_model(seqs, kv_caches)
+        torch.xpu.synchronize()
+        return
+
+    def prepare_input_tensors(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata,
+               Optional[torch.Tensor]]:
+        multi_modal_input = None
+        if self.is_driver_worker:
+            # NOTE: We assume that all sequences in the group are all prompts or
+            # all decodes.
+            is_prompt = seq_group_metadata_list[0].is_prompt
+            # Prepare input tensors.
+            if is_prompt:
+                (input_tokens, input_positions, attn_metadata, seq_lens,
+                 multi_modal_input
+                 ) = self._prepare_prompt(seq_group_metadata_list)
+            else:
+                (input_tokens, input_positions,
+                 attn_metadata) = self._prepare_decode(seq_group_metadata_list)
+                seq_lens = []
+            sampling_metadata = SamplingMetadata.prepare(
+                seq_group_metadata_list,
+                seq_lens,
+                # subquery_lens is not needed if chunked prefill is not
+                # supported. Since CPU worker doesn't support chunked prefill
+                # just use seq_lens instead.
+                seq_lens,
+                self.device,
+                pin_memory=False)
+            # Broadcast the metadata.
+            metadata_dict = {
+                "input_tokens": input_tokens,
+                "input_positions": input_positions,
+                "selected_token_indices":
+                sampling_metadata.selected_token_indices,
+            }
+            metadata_dict.update(attn_metadata.asdict_zerocopy())
+            broadcast_tensor_dict(metadata_dict, src=0)
+        else:
+            metadata_dict = broadcast_tensor_dict(src=0)
+            input_tokens = metadata_dict.pop("input_tokens")
+            input_positions = metadata_dict.pop("input_positions")
+            selected_token_indices = metadata_dict.pop(
+                "selected_token_indices")
+            attn_metadata = self.attn_backend.make_metadata(**metadata_dict)
+            sampling_metadata = SamplingMetadata(
+                seq_groups=None,
+                selected_token_indices=selected_token_indices,
+                categorized_sample_indices=None,
+                num_prompts=0,
+            )
+
+        return (input_tokens, input_positions, attn_metadata,
+                sampling_metadata, multi_modal_input)
+
+    def _prepare_decode(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata]:
+        assert len(seq_group_metadata_list) > 0
+        input_tokens: List[int] = []
+        input_positions: List[int] = []
+        slot_mapping: List[int] = []
+        seq_lens: List[int] = []
+        block_tables: List[List[int]] = []
+
+        for seq_group_metadata in seq_group_metadata_list:
+            assert not seq_group_metadata.is_prompt
+            assert seq_group_metadata.token_chunk_size == 1
+
+            seq_ids = list(seq_group_metadata.seq_data.keys())
+
+            for seq_id in seq_ids:
+                seq_data = seq_group_metadata.seq_data[seq_id]
+                generation_token = seq_data.get_last_token_id()
+                input_tokens.append(generation_token)
+
+                seq_len = seq_data.get_len()
+                position = seq_len - 1
+                input_positions.append(position)
+
+                seq_len = seq_len if self.sliding_window is None else min(
+                    seq_len, self.sliding_window)
+                seq_lens.append(seq_len)
+
+                block_table = seq_group_metadata.block_tables[seq_id]
+                block_number = block_table[position // self.block_size]
+                block_offset = position % self.block_size
+                slot = block_number * self.block_size + block_offset
+                slot_mapping.append(slot)
+
+                if self.sliding_window is not None:
+                    sliding_window_blocks = (self.sliding_window //
+                                             self.block_size)
+                    block_table = block_table[-sliding_window_blocks:]
+                block_tables.append(block_table)
+
+        max_decode_seq_len = max(seq_lens)
+
+        input_tokens = torch.tensor(input_tokens,
+                                    dtype=torch.long,
+                                    device=self.device)
+        input_positions = torch.tensor(input_positions,
+                                       dtype=torch.long,
+                                       device=self.device)
+        slot_mapping = torch.tensor(slot_mapping,
+                                    dtype=torch.long,
+                                    device=self.device)
+        seq_lens_tensor = torch.tensor(seq_lens,
+                                       dtype=torch.int,
+                                       device=self.device)
+
+        max_block_table_len = max(
+            len(block_table) for block_table in block_tables)
+        block_tables = make_tensor_with_pad(
+            block_tables,
+            max_len=max_block_table_len,
+            pad=0,
+            dtype=torch.int,
+            device=self.device,
+        )
+
+        attn_metadata = self.attn_backend.make_metadata(
+            is_prompt=False,
+            slot_mapping=slot_mapping,
+            seq_lens=seq_lens,
+            seqlen_q=None,
+            max_seqlen=None,
+            seq_lens_tensor=seq_lens_tensor,
+            max_decode_seq_len=max_decode_seq_len,
+            num_prefill_tokens=0,
+            num_decode_tokens=len(input_tokens),
+            num_prefills=0,
+            block_tables=block_tables,
+        )
+        return (
+            input_tokens,
+            input_positions,
+            attn_metadata,
+        )
+
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        kv_caches: List[torch.Tensor],
+    ) -> Optional[SamplerOutput]:
+        (input_tokens, input_positions, attn_metadata, sampling_metadata,
+         multi_modal_input
+         ) = self.prepare_input_tensors(seq_group_metadata_list)
+
+        model_executable = self.model
+        execute_model_kwargs = {
+            "input_ids": input_tokens,
+            "positions": input_positions,
+            "kv_caches": kv_caches,
+            "attn_metadata": attn_metadata,
+        }
+        if self.vision_language_config:
+            execute_model_kwargs.update({"image_input": multi_modal_input})
+
+        hidden_states = model_executable(**execute_model_kwargs)
+
+        # Compute the logits.
+        logits = self.model.compute_logits(hidden_states, sampling_metadata)
+
+        # Only perform sampling in the driver worker.
+        if not self.is_driver_worker:
+            return None
+
+        # Sample the next token.
+        output = self.model.sample(
+            logits=logits,
+            sampling_metadata=sampling_metadata,
+        )
+        return output
+
+    def _prepare_prompt(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int],
+               Optional[torch.Tensor]]:
+        assert len(seq_group_metadata_list) > 0
+        input_tokens: List[int] = []
+        input_positions: List[int] = []
+        slot_mapping: List[int] = []
+        seq_lens: List[int] = []
+        multi_modal_input_list: List[torch.Tensor] = []
+
+        for seq_group_metadata in seq_group_metadata_list:
+            assert seq_group_metadata.is_prompt
+            seq_ids = list(seq_group_metadata.seq_data.keys())
+            assert len(seq_ids) == 1
+            seq_id = seq_ids[0]
+
+            seq_data = seq_group_metadata.seq_data[seq_id]
+            prompt_tokens = seq_data.get_token_ids()
+            computed_len = seq_data.get_num_computed_tokens()
+            seq_len = len(prompt_tokens)
+
+            seq_lens.append(seq_len)  # Prompt token num
+            input_tokens.extend(prompt_tokens)  # Token ids
+
+            # Token position ids
+            # NOTE(woosuk): Here we assume that the first token in the prompt
+            # is always the first token in the sequence.
+            input_positions.extend(list(range(computed_len, seq_len)))
+
+            if seq_group_metadata.multi_modal_data:
+                multi_modal_input_list.append(
+                    seq_group_metadata.multi_modal_data.data)
+
+            if seq_group_metadata.block_tables is None:
+                # During memory profiling, the block tables are not initialized
+                # yet. In this case, we just use a dummy slot mapping.
+                slot_mapping.extend([_PAD_SLOT_ID] * seq_len)
+                continue
+
+            # Compute the slot mapping.
+            block_table = seq_group_metadata.block_tables[seq_id]
+            # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID,
+            # where start_idx is max(0, seq_len - sliding_window).
+            # For example, if the prompt len is 10, sliding window is 8, and
+            # block size is 4, the first two tokens are masked and the slot
+            # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1].
+            start_idx = 0
+            if self.sliding_window is not None:
+                start_idx = max(0, seq_len - self.sliding_window)
+
+            for i in range(computed_len, seq_len):
+                if i < start_idx:
+                    slot_mapping.append(_PAD_SLOT_ID)
+                    continue
+
+                block_number = block_table[i //
+                                           self.block_size]  # type: ignore
+                block_offset = i % self.block_size  # type: ignore
+                slot = block_number * self.block_size + block_offset
+                slot_mapping.append(slot)
+
+        if multi_modal_input_list:
+            assert self.vision_language_config, (
+                "Multi-modal inputs are only supported by "
+                "vision language models.")
+            multi_modal_input = torch.cat(multi_modal_input_list,
+                                          dim=0).to(self.device)
+        else:
+            multi_modal_input = None
+
+        num_prompt_tokens = len(input_tokens)
+
+        input_tokens = torch.tensor(input_tokens,
+                                    dtype=torch.long,
+                                    device=self.device)  # type: ignore
+        input_positions = torch.tensor(input_positions,
+                                       dtype=torch.long,
+                                       device=self.device)  # type: ignore
+        slot_mapping = torch.tensor(slot_mapping,
+                                    dtype=torch.long,
+                                    device=self.device)  # type: ignore
+
+        max_seqlen = max(seq_lens)
+        tmp = [0]
+        tmp.extend(seq_lens)
+        seqlen = torch.tensor(tmp)
+        seqlen_q = torch.cumsum(seqlen, dim=0).to(device=self.device)
+
+        attn_metadata = self.attn_backend.make_metadata(
+            is_prompt=True,
+            slot_mapping=slot_mapping,
+            seq_lens=seq_lens,
+            seqlen_q=seqlen_q,
+            max_seqlen=max_seqlen,
+            seq_lens_tensor=None,
+            max_decode_seq_len=None,
+            num_prefills=len(seq_lens),
+            num_prefill_tokens=num_prompt_tokens,
+            num_decode_tokens=0,
+            block_tables=torch.tensor([], device=self.device, dtype=torch.int),
+        )
+        return (input_tokens, input_positions, attn_metadata, seq_lens,
+                multi_modal_input)
diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
new file mode 100644
index 000000000..773ee9f81
--- /dev/null
+++ b/vllm/worker/xpu_worker.py
@@ -0,0 +1,193 @@
+"""A XPU worker class."""
+import gc
+import os
+from typing import List, Optional, Tuple
+
+import intel_extension_for_pytorch  # noqa: F401
+import oneccl_bindings_for_pytorch  # noqa: F401
+import torch
+import torch.distributed
+
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+                         ModelConfig, ParallelConfig, SchedulerConfig,
+                         SpeculativeConfig, VisionLanguageConfig)
+from vllm.distributed import (ensure_model_parallel_initialized,
+                              init_distributed_environment)
+from vllm.logger import init_logger
+from vllm.model_executor import set_random_seed
+from vllm.utils import is_xpu
+from vllm.worker.cache_engine import CacheEngine
+from vllm.worker.worker import Worker
+from vllm.worker.worker_base import LoraNotSupportedWorkerBase
+from vllm.worker.xpu_model_runner import XPUModelRunner
+
+logger = init_logger(__name__)
+
+
+class XPUWorker(LoraNotSupportedWorkerBase, Worker):
+    """A worker class that executes (a partition of) the model on a GPU.
+    
+    Each worker is associated with a single XPU device. The worker is 
+    responsible for maintaining the KV cache and executing the model on the 
+    XPU. In case of distributed inference, each worker is assigned a partition
+    of the model.
+    """
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        cache_config: CacheConfig,
+        load_config: LoadConfig,
+        local_rank: int,
+        rank: int,
+        distributed_init_method: str,
+        lora_config: Optional[LoRAConfig] = None,
+        vision_language_config: Optional[VisionLanguageConfig] = None,
+        speculative_config: Optional[SpeculativeConfig] = None,
+        is_driver_worker: bool = False,
+    ) -> None:
+        assert device_config.device_type == "xpu"
+        assert is_xpu()
+
+        self.model_config = model_config
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+        self.device_config = device_config
+        self.cache_config = cache_config
+        self.load_config = load_config
+        self.local_rank = local_rank
+        self.rank = rank
+        self.distributed_init_method = distributed_init_method
+        self.lora_config = lora_config
+        self.is_driver_worker = is_driver_worker
+        if self.is_driver_worker:
+            assert self.rank == 0, "The driver worker must have rank 0."
+
+        self.vision_language_config = vision_language_config
+        if self.vision_language_config:
+            assert not self.lora_config, (
+                "To be tested: vision language model with LoRA settings.")
+
+        self.model_runner = XPUModelRunner(  # type: ignore
+            model_config,
+            parallel_config,
+            scheduler_config,
+            device_config,
+            cache_config,
+            load_config=self.load_config,
+            lora_config=self.lora_config,
+            kv_cache_dtype=self.cache_config.cache_dtype,
+            is_driver_worker=is_driver_worker,
+            vision_language_config=vision_language_config,
+        )
+        # Uninitialized cache engine. Will be initialized by
+        # initialize_cache.
+        self.cache_engine: CacheEngine
+        self.gpu_cache: List[torch.Tensor]
+
+    def init_device(self) -> None:
+        if self.device_config.device.type == "xpu" and is_xpu():
+            self.device = torch.device(f"xpu:{self.local_rank}")
+            torch.xpu.set_device(self.device)
+            torch.xpu.empty_cache()
+            self.init_gpu_memory = torch.xpu.get_device_properties(
+                self.local_rank).total_memory
+        else:
+            raise RuntimeError(
+                f"Not support device type: {self.device_config.device}")
+        # Initialize the distributed environment.
+        self.init_worker_distributed_environment()
+        # Initialize the model.
+        set_random_seed(self.model_config.seed)
+
+    # keep this method for `empty_cache` and `synchronize` api
+    @torch.inference_mode()
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Profiles the peak memory usage of the model to determine how many
+        KV blocks may be allocated without OOMs.
+
+        The engine will first conduct a profiling of the existing memory usage.
+        Then, it calculate the maximum possible number of GPU and CPU blocks
+        that can be allocated with the remaining free memory.
+
+        .. tip::
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
+        """
+        # Profile the memory usage of the model and get the maximum number of
+        # cache blocks that can be allocated with the remaining free memory.
+        torch.xpu.empty_cache()
+
+        # Execute a forward pass with dummy inputs to profile the memory usage
+        # of the model.
+        self.model_runner.profile_run()
+
+        # Calculate the number of blocks that can be allocated with the
+        # profiled peak memory.
+        torch.xpu.synchronize()
+        used_memory = torch.xpu.memory_allocated()
+        total_gpu_memory = torch.xpu.get_device_properties(
+            self.local_rank).total_memory
+        free_gpu_memory = total_gpu_memory - used_memory
+
+        # NOTE(woosuk): Here we assume that the other processes using the same
+        # GPU did not change their memory usage during the profiling.
+        peak_memory = self.init_gpu_memory - free_gpu_memory
+        assert peak_memory > 0, (
+            "Error in memory profiling. This happens when the GPU memory was "
+            "not properly cleaned up before initializing the vLLM instance.")
+
+        cache_block_size = self.get_cache_block_size_bytes()
+        num_gpu_blocks = int(
+            (total_gpu_memory * self.cache_config.gpu_memory_utilization -
+             peak_memory) // cache_block_size)
+        num_cpu_blocks = int(self.cache_config.swap_space_bytes //
+                             cache_block_size)
+        num_gpu_blocks = max(num_gpu_blocks, 0)
+        num_cpu_blocks = max(num_cpu_blocks, 0)
+        gc.collect()
+        torch.xpu.empty_cache()
+        return num_gpu_blocks, num_cpu_blocks
+
+    def _warm_up_model(self) -> None:
+        # IPEX don't support capture graph yet
+        pass
+
+    def init_worker_distributed_environment(self) -> None:
+        """Initialize the distributed environment."""
+
+        parallel_config = self.parallel_config
+        rank = self.rank
+        distributed_init_method = self.distributed_init_method
+
+        if torch.distributed.is_initialized():
+            torch_world_size = torch.distributed.get_world_size()
+            if torch_world_size != parallel_config.world_size:
+                raise RuntimeError(
+                    "torch.distributed is already initialized but the torch "
+                    "world size does not match parallel_config.world_size "
+                    f"({torch_world_size} vs. {parallel_config.world_size}).")
+        elif not distributed_init_method:
+            raise ValueError(
+                "distributed_init_method must be set if torch.distributed "
+                "is not already initialized")
+        else:
+            # use sockets as default Level zero IPC exchange backend. By
+            # default oneccl will use `drmfd` as mechanism which need extra
+            # dependency (libdrm and drm headers) on your system.
+            ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE",
+                                                "sockets")
+            os.environ['CCL_ZE_IPC_EXCHANGE'] = ENV_CCL_ZE_IPC_EXCHANGE
+            init_distributed_environment(
+                world_size=parallel_config.world_size,
+                rank=rank,
+                distributed_init_method=distributed_init_method,
+                local_rank=self.local_rank,
+                backend="ccl")
+
+        ensure_model_parallel_initialized(
+            parallel_config.tensor_parallel_size,
+            parallel_config.pipeline_parallel_size)
-- 
GitLab


From ab66536dbfedff4ffcbb6dc9f9a21d0a9ac0ec91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= <jiefu@tencent.com>
Date: Tue, 18 Jun 2024 02:36:10 +0800
Subject: [PATCH 069/376] [CI/BUILD] Support non-AVX512 vLLM building and
 testing (#5574)

---
 .buildkite/run-cpu-test.sh |  5 ++++-
 Dockerfile.cpu             |  4 ++++
 cmake/cpu_extension.cmake  | 13 ++++++++++++-
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 5f9ca5d75..532d6ad88 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -4,17 +4,20 @@ set -ex
 
 # Try building the docker image
 docker build -t cpu-test -f Dockerfile.cpu .
+docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
 
 # Setup cleanup
-remove_docker_container() { docker rm -f cpu-test || true; }
+remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image
 docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
+docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2
 
 # offline inference
 docker exec cpu-test bash -c "python3 examples/offline_inference.py"
+docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 
 # Run basic model test
 docker exec cpu-test bash -c "cd tests;
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index 777bb0829..6e55203de 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -21,6 +21,10 @@ WORKDIR /workspace/vllm
 
 RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
 
+# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
+ARG VLLM_CPU_DISABLE_AVX512
+ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
+
 RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
 
 WORKDIR /workspace/
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index a644e5b6a..511e443f7 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -33,10 +33,21 @@ function (find_isa CPUINFO TARGET OUT)
     endif()
 endfunction()
 
+function (is_avx512_disabled OUT)
+    set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512})
+    if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true")
+        set(${OUT} ON PARENT_SCOPE)
+    else()
+        set(${OUT} OFF PARENT_SCOPE)
+    endif()
+endfunction()
+
+is_avx512_disabled(AVX512_DISABLED)
+
 find_isa(${CPUINFO} "avx2" AVX2_FOUND)
 find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
 
-if (AVX512_FOUND)
+if (AVX512_FOUND AND NOT AVX512_DISABLED)
     list(APPEND CXX_COMPILE_FLAGS
         "-mavx512f"
         "-mavx512vl"
-- 
GitLab


From 9e4e6fe2073ff5e4a747d5ce2a08d321268b7254 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 17 Jun 2024 11:41:08 -0700
Subject: [PATCH 070/376] [CI] the readability of benchmarking and prepare for
 dashboard (#5571)

[CI] Improve the readability of performance benchmarking results and prepare for upcoming performance dashboard (#5571)
---
 .buildkite/nightly-benchmarks/README.md       |  21 +-
 .../run-benchmarks-suite.sh                   |   6 +-
 .../convert-results-json-to-markdown.py       | 260 ++++++++++--------
 .../nightly-benchmarks/tests/descriptions.md  |  67 +++++
 .../{ => tests}/latency-tests.json            |   2 +-
 .../{ => tests}/serving-tests.json            |   2 +-
 .../{ => tests}/throughput-tests.json         |   2 +-
 benchmarks/benchmark_latency.py               |   2 +-
 8 files changed, 232 insertions(+), 130 deletions(-)
 create mode 100644 .buildkite/nightly-benchmarks/tests/descriptions.md
 rename .buildkite/nightly-benchmarks/{ => tests}/latency-tests.json (99%)
 rename .buildkite/nightly-benchmarks/{ => tests}/serving-tests.json (99%)
 rename .buildkite/nightly-benchmarks/{ => tests}/throughput-tests.json (99%)

diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index 6a18be947..4036b32a4 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -13,9 +13,17 @@ This benchmark will be *triggered* upon:
 
 **Benchmarking Duration**: about 1hr.
 
-## Configuring the workload for the quick benchmark
+**For benchmarking developers**: please try your best to constraint the duration of benchmarking to less than 1.5 hr so that it won't take forever to run.
 
-The workload of the quick benchmark contains two parts: latency tests in `latency-tests.json`, throughput tests in `throughput-tests.json` and serving tests in `serving-tests.json`.
+
+## Configuring the workload
+
+The benchmarking workload contains three parts:
+- Latency tests in `latency-tests.json`.
+- Throughput tests in `throughput-tests.json`.
+- Serving tests in `serving-tests.json`.
+
+See [descriptions.md](tests/descriptions.md) for detailed descriptions. 
 
 ### Latency test
 
@@ -23,7 +31,6 @@ Here is an example of one test inside `latency-tests.json`:
 
 ```json
 [
-    ...
     {
         "test_name": "latency_llama8B_tp1",
         "parameters": {
@@ -34,7 +41,6 @@ Here is an example of one test inside `latency-tests.json`:
             "num_iters": 15
         }
     },
-    ...
 ]
 ```
 
@@ -57,7 +63,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
 
 ```
 [
-    ...
     {
         "test_name": "serving_llama8B_tp1_sharegpt",
         "qps_list": [1, 4, 16, "inf"],
@@ -77,7 +82,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
             "num_prompts": 200
         }
     },
-    ...
 ]
 ```
 
@@ -92,7 +96,8 @@ The number of this test is less stable compared to the delay and latency benchma
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
 
 ## Visualizing the results
-The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table.
+The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
 You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
 If you do not see the table, please wait till the benchmark finish running.
-The JSON file is also attached within each buildkite job for further analysis.
\ No newline at end of file
+The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
+The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
diff --git a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
index 6cff6917f..021473f76 100644
--- a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
@@ -343,9 +343,9 @@ main() {
   QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
 
   # benchmarking
-  run_serving_tests $QUICK_BENCHMARK_ROOT/serving-tests.json
-  run_latency_tests $QUICK_BENCHMARK_ROOT/latency-tests.json
-  run_throughput_tests $QUICK_BENCHMARK_ROOT/throughput-tests.json
+  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
+  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
+  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
 
 
   # postprocess benchmarking results
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 75cff8434..9aa8162d1 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -1,4 +1,5 @@
 import json
+import os
 from pathlib import Path
 
 import pandas as pd
@@ -11,12 +12,13 @@ latency_results = []
 latency_column_mapping = {
     "test_name": "Test name",
     "gpu_type": "GPU",
-    "avg_latency": "Average latency (s)",
-    "P10": "P10 (s)",
-    "P25": "P25 (s)",
-    "P50": "P50 (s)",
-    "P75": "P75 (s)",
-    "P90": "P90 (s)",
+    "avg_latency": "Mean latency (ms)",
+    # "P10": "P10 (s)",
+    # "P25": "P25 (s)",
+    "P50": "Median",
+    # "P75": "P75 (s)",
+    # "P90": "P90 (s)",
+    "P99": "P99",
 }
 
 # thoughput tests and the keys that will be printed into markdown
@@ -24,11 +26,11 @@ throughput_results = []
 throughput_results_column_mapping = {
     "test_name": "Test name",
     "gpu_type": "GPU",
-    "num_requests": "# of req.",
-    "total_num_tokens": "Total # of tokens",
-    "elapsed_time": "Elapsed time (s)",
+    # "num_requests": "# of req.",
+    # "total_num_tokens": "Total # of tokens",
+    # "elapsed_time": "Elapsed time (s)",
     "requests_per_second": "Tput (req/s)",
-    "tokens_per_second": "Tput (tok/s)",
+    # "tokens_per_second": "Tput (tok/s)",
 }
 
 # serving results and the keys that will be printed into markdown
@@ -36,120 +38,148 @@ serving_results = []
 serving_column_mapping = {
     "test_name": "Test name",
     "gpu_type": "GPU",
-    "completed": "# of req.",
+    # "completed": "# of req.",
     "request_throughput": "Tput (req/s)",
-    "input_throughput": "Input Tput (tok/s)",
-    "output_throughput": "Output Tput (tok/s)",
+    # "input_throughput": "Input Tput (tok/s)",
+    # "output_throughput": "Output Tput (tok/s)",
     "mean_ttft_ms": "Mean TTFT (ms)",
     # do not say TTFT again to avoid the table getting too wide
     "median_ttft_ms": "Median",
     "p99_ttft_ms": "P99",
-    "mean_tpot_ms": "Mean TPOT (ms)",
-    "median_tpot_ms": "Median",
-    "p99_tpot_ms": "P99",
+    # "mean_tpot_ms": "Mean TPOT (ms)",
+    # "median_tpot_ms": "Median",
+    # "p99_tpot_ms": "P99",
     "mean_itl_ms": "Mean ITL (ms)",
     "median_itl_ms": "Median",
     "p99_itl_ms": "P99",
 }
 
-for test_file in results_folder.glob("*.json"):
-
-    with open(test_file, "r") as f:
-        raw_result = json.loads(f.read())
-
-    if "serving" in str(test_file):
-        # this result is generated via `benchmark_serving.py`
-
-        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
-            command = json.loads(f.read())
-        raw_result.update(command)
-
-        # update the test name of this result
-        raw_result.update({"test_name": test_file.stem})
-
-        # add the result to raw_result
-        serving_results.append(raw_result)
-        continue
-
-    elif "latency" in f.name:
-        # this result is generated via `benchmark_latency.py`
-
-        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
-            command = json.loads(f.read())
-        raw_result.update(command)
-
-        # update the test name of this result
-        raw_result.update({"test_name": test_file.stem})
-
-        # get different percentiles
-        for perc in [10, 25, 50, 75, 90]:
-            raw_result.update(
-                {f"P{perc}": raw_result["percentiles"][str(perc)]})
-
-        # add the result to raw_result
-        latency_results.append(raw_result)
-        continue
-
-    elif "throughput" in f.name:
-        # this result is generated via `benchmark_throughput.py`
-
-        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
-            command = json.loads(f.read())
-        raw_result.update(command)
-
-        # update the test name of this result
-        raw_result.update({"test_name": test_file.stem})
-
-        # add the result to raw_result
-        throughput_results.append(raw_result)
-        continue
-
-    print(f"Skipping {test_file}")
-
-latency_results = pd.DataFrame.from_dict(latency_results)
-serving_results = pd.DataFrame.from_dict(serving_results)
-throughput_results = pd.DataFrame.from_dict(throughput_results)
-
-# remapping the key, for visualization purpose
-if not latency_results.empty:
-    latency_results = latency_results[list(
-        latency_column_mapping.keys())].rename(columns=latency_column_mapping)
-if not serving_results.empty:
-    serving_results = serving_results[list(
-        serving_column_mapping.keys())].rename(columns=serving_column_mapping)
-if not throughput_results.empty:
-    throughput_results = throughput_results[list(
-        throughput_results_column_mapping.keys())].rename(
-            columns=throughput_results_column_mapping)
-
-# get markdown tables
-latency_md_table = tabulate(latency_results,
-                            headers='keys',
-                            tablefmt='pipe',
-                            showindex=False)
-serving_md_table = tabulate(serving_results,
-                            headers='keys',
-                            tablefmt='pipe',
-                            showindex=False)
-throughput_md_table = tabulate(throughput_results,
-                               headers='keys',
-                               tablefmt='pipe',
-                               showindex=False)
-
-# document the result
-with open(results_folder / "benchmark_results.md", "w") as f:
+
+def read_markdown(file):
+    if os.path.exists(file):
+        with open(file, "r") as f:
+            return f.read() + "\n"
+    else:
+        return f"{file} not found.\n"
+
+
+def results_to_json(latency, throughput, serving):
+    return json.dumps({
+        'latency': latency.to_dict(),
+        'throughput': throughput.to_dict(),
+        'serving': serving.to_dict()
+    })
+
+
+if __name__ == "__main__":
+
+    # collect results
+    for test_file in results_folder.glob("*.json"):
+
+        with open(test_file, "r") as f:
+            raw_result = json.loads(f.read())
+
+        if "serving" in str(test_file):
+            # this result is generated via `benchmark_serving.py`
+
+            # attach the benchmarking command to raw_result
+            with open(test_file.with_suffix(".commands"), "r") as f:
+                command = json.loads(f.read())
+            raw_result.update(command)
+
+            # update the test name of this result
+            raw_result.update({"test_name": test_file.stem})
+
+            # add the result to raw_result
+            serving_results.append(raw_result)
+            continue
+
+        elif "latency" in f.name:
+            # this result is generated via `benchmark_latency.py`
+
+            # attach the benchmarking command to raw_result
+            with open(test_file.with_suffix(".commands"), "r") as f:
+                command = json.loads(f.read())
+            raw_result.update(command)
+
+            # update the test name of this result
+            raw_result.update({"test_name": test_file.stem})
+
+            # get different percentiles
+            for perc in [10, 25, 50, 75, 90, 99]:
+                # Multiply 1000 to convert the time unit from s to ms
+                raw_result.update(
+                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
+            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
+
+            # add the result to raw_result
+            latency_results.append(raw_result)
+            continue
+
+        elif "throughput" in f.name:
+            # this result is generated via `benchmark_throughput.py`
+
+            # attach the benchmarking command to raw_result
+            with open(test_file.with_suffix(".commands"), "r") as f:
+                command = json.loads(f.read())
+            raw_result.update(command)
+
+            # update the test name of this result
+            raw_result.update({"test_name": test_file.stem})
+
+            # add the result to raw_result
+            throughput_results.append(raw_result)
+            continue
+
+        print(f"Skipping {test_file}")
+
+    latency_results = pd.DataFrame.from_dict(latency_results)
+    serving_results = pd.DataFrame.from_dict(serving_results)
+    throughput_results = pd.DataFrame.from_dict(throughput_results)
+
+    raw_results_json = results_to_json(latency_results, throughput_results,
+                                       serving_results)
+
+    # remapping the key, for visualization purpose
     if not latency_results.empty:
-        f.write("## Latency tests\n")
-        f.write(latency_md_table)
-        f.write("\n")
-    if not throughput_results.empty:
-        f.write("## Throughput tests\n")
-        f.write(throughput_md_table)
-        f.write("\n")
+        latency_results = latency_results[list(
+            latency_column_mapping.keys())].rename(
+                columns=latency_column_mapping)
     if not serving_results.empty:
-        f.write("## Serving tests\n")
-        f.write(serving_md_table)
-        f.write("\n")
+        serving_results = serving_results[list(
+            serving_column_mapping.keys())].rename(
+                columns=serving_column_mapping)
+    if not throughput_results.empty:
+        throughput_results = throughput_results[list(
+            throughput_results_column_mapping.keys())].rename(
+                columns=throughput_results_column_mapping)
+
+    processed_results_json = results_to_json(latency_results,
+                                             throughput_results,
+                                             serving_results)
+
+    # get markdown tables
+    latency_md_table = tabulate(latency_results,
+                                headers='keys',
+                                tablefmt='pipe',
+                                showindex=False)
+    serving_md_table = tabulate(serving_results,
+                                headers='keys',
+                                tablefmt='pipe',
+                                showindex=False)
+    throughput_md_table = tabulate(throughput_results,
+                                   headers='keys',
+                                   tablefmt='pipe',
+                                   showindex=False)
+
+    # document the result
+    with open(results_folder / "benchmark_results.md", "w") as f:
+
+        results = read_markdown(
+            "../.buildkite/nightly-benchmarks/tests/descriptions.md")
+        results = results.format(
+            latency_tests_markdown_table=latency_md_table,
+            throughput_tests_markdown_table=throughput_md_table,
+            serving_tests_markdown_table=serving_md_table,
+            benchmarking_results_in_json_string=processed_results_json)
+        f.write(results)
diff --git a/.buildkite/nightly-benchmarks/tests/descriptions.md b/.buildkite/nightly-benchmarks/tests/descriptions.md
new file mode 100644
index 000000000..891e49170
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/tests/descriptions.md
@@ -0,0 +1,67 @@
+
+## Latency tests
+
+This test suite aims to test vllm's end-to-end latency under a controlled setup.
+
+- Input length: 32 tokens.
+- Output length: 128 tokens.
+- Batch size: fixed (8).
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Evaluation metrics: end-to-end latency (mean, median, p99).
+
+### Latency benchmarking results
+
+{latency_tests_markdown_table}
+
+## Throughput tests
+
+This test suite aims to test vllm's throughput.
+
+- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
+- Output length: the corresponding output length of these 200 prompts.
+- Batch size: dynamically determined by vllm to achieve maximum throughput.
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Evaluation metrics: throughput.
+
+### Throughput benchmarking results
+
+{throughput_tests_markdown_table}
+
+## Serving tests
+
+This test suite aims to test vllm's real serving metrics.
+
+- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
+- Output length: the corresponding output length of these 200 prompts.
+- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
+- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
+
+### Serving benchmarking results
+
+{serving_tests_markdown_table}
+
+## json version of the benchmarking tables
+
+This section contains the data of the markdown tables above in JSON format. 
+You can load the benchmarking tables into pandas dataframes as follows:
+
+```python
+import json
+import pandas as pd
+
+benchmarking_results_json = """The json string"""
+benchmarking_results = json.loads(benchmarking_results_json)
+latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
+throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
+serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
+```
+
+The json string for all benchmarking tables:
+```json
+{benchmarking_results_in_json_string}
+```
+
+You can also check the raw experiment data in the Artifact tab of the Buildkite page.
+
diff --git a/.buildkite/nightly-benchmarks/latency-tests.json b/.buildkite/nightly-benchmarks/tests/latency-tests.json
similarity index 99%
rename from .buildkite/nightly-benchmarks/latency-tests.json
rename to .buildkite/nightly-benchmarks/tests/latency-tests.json
index 294a8c439..06488cd79 100644
--- a/.buildkite/nightly-benchmarks/latency-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json
@@ -29,4 +29,4 @@
             "num-iters": 15
         }
     }
-]
+]
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json
similarity index 99%
rename from .buildkite/nightly-benchmarks/serving-tests.json
rename to .buildkite/nightly-benchmarks/tests/serving-tests.json
index bb6746612..86a0fefa3 100644
--- a/.buildkite/nightly-benchmarks/serving-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json
@@ -56,4 +56,4 @@
             "num_prompts": 200
         }
     }
-]
+]
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/throughput-tests.json b/.buildkite/nightly-benchmarks/tests/throughput-tests.json
similarity index 99%
rename from .buildkite/nightly-benchmarks/throughput-tests.json
rename to .buildkite/nightly-benchmarks/tests/throughput-tests.json
index db4f908d7..41ac13574 100644
--- a/.buildkite/nightly-benchmarks/throughput-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests.json
@@ -32,4 +32,4 @@
             "backend": "vllm"
         }
     }
-]
+]
\ No newline at end of file
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 11d1bf7a4..767afd21a 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -98,7 +98,7 @@ def main(args: argparse.Namespace):
     for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
         latencies.append(run_to_completion(profile_dir=None))
     latencies = np.array(latencies)
-    percentages = [10, 25, 50, 75, 90]
+    percentages = [10, 25, 50, 75, 90, 99]
     percentiles = np.percentile(latencies, percentages)
     print(f'Avg latency: {np.mean(latencies)} seconds')
     for percentage, percentile in zip(percentages, percentiles):
-- 
GitLab


From 1b44aaf4e3559e4e321f32715b08f1aa7e4f3d50 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 17 Jun 2024 14:35:04 -0700
Subject: [PATCH 071/376] [bugfix][distributed] fix 16 gpus local rank
 arrangement (#5604)

---
 vllm/executor/ray_gpu_executor.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 89d1c4ac7..843332e5e 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -137,6 +137,12 @@ class RayGPUExecutor(DistributedGPUExecutor):
 
         for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
             node_workers[node_id].append(i)
+            # `gpu_ids` can be a list of strings or integers.
+            # convert them to integers for consistency.
+            # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs),
+            # string sorting is not sufficient.
+            # see https://github.com/vllm-project/vllm/issues/5590
+            gpu_ids = [int(x) for x in gpu_ids]
             node_gpus[node_id].extend(gpu_ids)
         for node_id, gpu_ids in node_gpus.items():
             node_gpus[node_id] = sorted(gpu_ids)
-- 
GitLab


From e441bad674e6dca076a145ca63f72100318c51e5 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 17 Jun 2024 15:08:05 -0700
Subject: [PATCH 072/376] [Optimization] use a pool to reuse
 LogicalTokenBlock.token_ids (#5584)

---
 vllm/block.py | 41 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/vllm/block.py b/vllm/block.py
index 2cc6b947f..e7fb29c8c 100644
--- a/vllm/block.py
+++ b/vllm/block.py
@@ -1,5 +1,7 @@
 """Token blocks."""
-from typing import List
+import weakref
+from collections import defaultdict
+from typing import Dict, List
 
 from vllm.utils import Device
 
@@ -7,6 +9,35 @@ _BLANK_TOKEN_ID = -1
 
 DEFAULT_LAST_ACCESSED_TIME = -1
 
+TokensBlock = List[int]
+
+
+class BlockPool:
+    """A pool of physical blocks.
+    When requests come, we create a lot of logical blocks;
+    when requests are done, we destroy a lot of logical blocks.
+    It turns out that creating and destroying logical blocks can be expensive,
+    especially for the `token_ids` field, which is a list of integers.
+    To avoid this overhead, we use a pool to manage the logical blocks.
+    When an old request is done and a new request comes, we can reuse the
+    logical blocks from the old request to feed the new request.
+    """
+
+    def __init__(self) -> None:
+        # block size to list of token blocks
+        self.pool: Dict[int, List[TokensBlock]] = defaultdict(list)
+
+    def alloc_block(self, block_size: int) -> TokensBlock:
+        if block_size in self.pool and self.pool[block_size]:
+            return self.pool[block_size].pop()
+        return [_BLANK_TOKEN_ID] * block_size
+
+    def del_block(self, block: TokensBlock) -> None:
+        self.pool[len(block)].append(block)
+
+
+_BLOCK_POOL = BlockPool()
+
 
 class LogicalTokenBlock:
     """A block that stores a contiguous chunk of tokens from left to right.
@@ -23,7 +54,13 @@ class LogicalTokenBlock:
         self.block_number = block_number
         self.block_size = block_size
 
-        self.token_ids = [_BLANK_TOKEN_ID] * block_size
+        self.token_ids = _BLOCK_POOL.alloc_block(block_size)
+        # this finalizer is used to return the block to the pool when the object is deleted # noqa
+        # NOTE: don't use __del__ because it cannot guarantee the order of finalization, # noqa
+        # i.e. `self.token_ids` may be deleted before `self`, and we lose
+        #  the opportunity to return the block to the pool
+        self._finalizer = weakref.finalize(self, _BLOCK_POOL.del_block,
+                                           self.token_ids)
         self.num_tokens = 0
 
     def is_empty(self) -> bool:
-- 
GitLab


From a3e8a05d4c1b79dd44eb92bb6f57eb40c3fbdb21 Mon Sep 17 00:00:00 2001
From: Bruce Fontaine <bruce@2.7182.net>
Date: Mon, 17 Jun 2024 15:26:41 -0700
Subject: [PATCH 073/376] [Bugfix] Fix KV head calculation for MPT models when
 using GQA (#5142)

---
 vllm/config.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/vllm/config.py b/vllm/config.py
index b1a3a82f5..d95faf52d 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -302,7 +302,11 @@ class ModelConfig:
             return 1
 
         # For DBRX and MPT
-        if self.hf_config.model_type in ["dbrx", "mpt"]:
+        if self.hf_config.model_type == "mpt":
+            if "kv_n_heads" in self.hf_config.attn_config:
+                return self.hf_config.attn_config["kv_n_heads"]
+            return self.hf_config.num_attention_heads
+        if self.hf_config.model_type == "dbrx":
             return getattr(self.hf_config.attn_config, "kv_n_heads",
                            self.hf_config.num_attention_heads)
 
-- 
GitLab


From 26e1188e51aca3b76184671d804a8b17c294b610 Mon Sep 17 00:00:00 2001
From: zifeitong <zifeitong@gmail.com>
Date: Mon, 17 Jun 2024 16:16:10 -0700
Subject: [PATCH 074/376] [Fix] Use utf-8 encoding in
 entrypoints/openai/run_batch.py (#5606)

---
 vllm/entrypoints/openai/run_batch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index b0c0f4ad2..2f1870187 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -58,7 +58,7 @@ async def read_file(path_or_url: str) -> str:
                    session.get(path_or_url) as resp:
             return await resp.text()
     else:
-        with open(path_or_url, "r") as f:
+        with open(path_or_url, "r", encoding="utf-8") as f:
             return f.read()
 
 
@@ -71,7 +71,7 @@ async def write_file(path_or_url: str, data: str) -> None:
         # We should make this async, but as long as this is always run as a
         # standalone program, blocking the event loop won't effect performance
         # in this particular case.
-        with open(path_or_url, "w") as f:
+        with open(path_or_url, "w", encoding="utf-8") as f:
             f.write(data)
 
 
-- 
GitLab


From fa9e3852290ecb6eaae45befbd629bb060f57fb7 Mon Sep 17 00:00:00 2001
From: sroy745 <142070531+sroy745@users.noreply.github.com>
Date: Mon, 17 Jun 2024 19:29:09 -0700
Subject: [PATCH 075/376] [Speculative Decoding 1/2 ] Add typical acceptance
 sampling as one of the sampling techniques in the verifier (#5131)

---
 .../test_typical_acceptance_sampler.py        | 464 ++++++++++++++++++
 .../layers/rejection_sampler.py               | 174 +------
 .../layers/spec_decode_base_sampler.py        | 206 ++++++++
 .../layers/typical_acceptance_sampler.py      | 186 +++++++
 4 files changed, 866 insertions(+), 164 deletions(-)
 create mode 100644 tests/samplers/test_typical_acceptance_sampler.py
 create mode 100644 vllm/model_executor/layers/spec_decode_base_sampler.py
 create mode 100644 vllm/model_executor/layers/typical_acceptance_sampler.py

diff --git a/tests/samplers/test_typical_acceptance_sampler.py b/tests/samplers/test_typical_acceptance_sampler.py
new file mode 100644
index 000000000..87cf37bc9
--- /dev/null
+++ b/tests/samplers/test_typical_acceptance_sampler.py
@@ -0,0 +1,464 @@
+"""Tests for rejection sampling."""
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.typical_acceptance_sampler import (
+    TypicalAcceptanceSampler)
+from vllm.model_executor.utils import set_random_seed
+
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1)]
+
+
+def get_zero_temperature_prob_dist(batch_size, k, vocab_size):
+    """
+    Generates a fake temperature zero probability distribution.
+    Returns:
+        1. A fake temperature zero probability distribution of shape
+           [batch_size, k, vocab_size]
+        2. Tensor of shape [batch_size, k] containing the token ids 
+           of the probability 1.0 tokens at each position.
+    """
+    # Simulate temperature 0 probability distribution for target probabilities
+    # and create target probabilities such that only 1 token id has
+    # probability 1.0
+    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
+    probs = torch.rand(batch_size, k, vocab_size)
+    _, zero_temperature_token_ids = torch.max(probs, dim=-1)
+    # set the probability of the tokens with ids in zero_temperature_token_ids
+    # to 1 and the rest to 0.
+    target_probs = torch.zeros_like(probs).scatter_(
+        -1, zero_temperature_token_ids.unsqueeze(-1), 1.0)
+    return target_probs, zero_temperature_token_ids
+
+
+def get_draft_token_ids(batch_size: int, k: int, vocab_size: int,
+                        token_ids_to_exclude: torch.Tensor):
+    """
+    Returns a tensor of shape [batch_size, k] of fake draft token ids
+    drawn randomly from a vocab of size vocab_size. We however ensure
+    that token_ids from token_ids_to_exclude are excluded at the 
+    corresponding positions.
+    """
+    draft_token_ids = torch.empty(batch_size, k, dtype=torch.long)
+    for i in range(batch_size):
+        for j in range(k):
+            # Generate a random token ID excluding token_ids_to_exclude[i, j]
+            while True:
+                token_id = torch.randint(0, vocab_size, (1, )).item()
+                if token_id != token_ids_to_exclude[i, j]:
+                    draft_token_ids[i, j] = token_id
+                    break
+    return draft_token_ids
+
+
+@pytest.mark.parametrize("k", list(range(1, 6)))
+@pytest.mark.parametrize("vocab_size", [30_000, 50_000])
+@pytest.mark.parametrize("batch_size", list(range(1, 32)))
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
+                                    device: str):
+    """
+    Tests that the TypicalAcceptancSampler forward succeeds for
+    different combinations of k, vocab_size, batch_size and num devices.
+    """
+    torch.set_default_device(device)
+    typical_acceptance_sampler = TypicalAcceptanceSampler()
+    typical_acceptance_sampler.init_gpu_tensors(rank=0)
+    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
+    bonus_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, 1),
+                                    dtype=torch.int64)
+    draft_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, k),
+                                    dtype=torch.int64)
+    # Verify that sampling succeeds for all cases.
+    typical_acceptance_sampler(target_probs, bonus_token_ids, draft_token_ids)
+
+
+@pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"])
+@pytest.mark.parametrize("which_token_ids",
+                         ["bonus_token_ids", "draft_token_ids"])
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
+                               which_token_ids: str, device: str):
+    """
+    Tests that we throw an exception of the token ids fall outside
+    the bound of the provided vocabulary.
+    """
+    k = 3
+    batch_size = 5
+    vocab_size = 30_000
+    torch.set_default_device(device)
+    typical_acceptance_sampler = TypicalAcceptanceSampler(strict_mode=True)
+    typical_acceptance_sampler.init_gpu_tensors(rank=0)
+    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
+    bonus_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, 1),
+                                    dtype=torch.int64)
+    draft_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, k),
+                                    dtype=torch.int64)
+    # Verify that appropriate exceptions are thrown for out
+    # of bound vocabs.
+    oob_token_ids = None
+    if which_token_ids == "bonus_token_ids":
+        oob_token_ids = bonus_token_ids
+    elif which_token_ids == "draft_token_ids":
+        oob_token_ids = draft_token_ids
+    else:
+        raise AssertionError()
+
+    if above_or_below_vocab_range == "above":
+        rogue_token_id = vocab_size + 1
+    elif above_or_below_vocab_range == "below":
+        rogue_token_id = -1
+    else:
+        raise AssertionError()
+
+    oob_token_ids[0][0] = rogue_token_id
+
+    with pytest.raises(AssertionError):
+        typical_acceptance_sampler(target_probs, bonus_token_ids,
+                                   draft_token_ids)
+
+
+@pytest.mark.parametrize("seed", list(range(10)))
+@pytest.mark.parametrize("disable_bonus_tokens", [True, False])
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_uniform_target_distribution_accepts_all_tokens(
+        seed: int, disable_bonus_tokens: bool, device: str):
+    """
+     Test the TypicalAcceptanceSampler with a uniform target probability 
+     distribution.
+    
+    This test verifies that when provided with a uniform target probability
+    distribution, the TypicalAcceptanceSampler accepts all draft tokens. The
+    entropy of the uniform target distribution being high should lead to all
+    draft tokens being accepted. The test also ensures that the behavior
+    regarding bonus tokens is consistent with the `disable_bonus_tokens`
+    flag.
+    """
+    set_random_seed(seed)
+    k = 3
+    batch_size = 5
+    vocab_size = 30_000
+    torch.set_default_device(device)
+    typical_acceptance_sampler = TypicalAcceptanceSampler(
+        strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
+    typical_acceptance_sampler.init_gpu_tensors(rank=0)
+    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
+    draft_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, k),
+                                    dtype=torch.int64)
+    bonus_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, 1),
+                                    dtype=torch.int64)
+    output_token_ids = typical_acceptance_sampler(target_probs,
+                                                  bonus_token_ids,
+                                                  draft_token_ids)
+    # We are using a uniform target probability distribution.
+    # For a uniform distribution the entropy is very high and it
+    # should lead to all draft tokens being accepted. Verify that.
+    assert output_token_ids.shape[0] == batch_size
+    assert output_token_ids.shape[1] == (k + 1)
+    if disable_bonus_tokens:
+        assert torch.all(output_token_ids[:, -1] == -1)
+    else:
+        assert torch.all(output_token_ids[:, -1] == bonus_token_ids.squeeze())
+
+    assert torch.all(output_token_ids[:, :k] == draft_token_ids)
+
+
+@pytest.mark.parametrize("seed", list(range(10)))
+@pytest.mark.parametrize("disable_bonus_tokens", [True, False])
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_temperature_zero_target_distribution(seed: int,
+                                              disable_bonus_tokens: bool,
+                                              device: str):
+    """
+    Test the TypicalAcceptanceSampler with a zero-temperature target
+    probability distribution.
+
+    This test verifies that when using a zero-temperature target probability
+    distribution, where only one token has a probability of 1.0, the
+    TypicalAcceptanceSampler correctly rejects all draft tokens that do not
+    match this probability. Additionally, it ensures that when all draft
+    tokens are rejected, the sampler falls back to greedy sampling to select a
+    single token from the target distribution.
+    """
+    set_random_seed(seed)
+    k = 3
+    batch_size = 5
+    vocab_size = 30_000
+    torch.set_default_device(device)
+
+    typical_acceptance_sampler = TypicalAcceptanceSampler(
+        strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
+    typical_acceptance_sampler.init_gpu_tensors(rank=0)
+    # Simulate temperature 0 probability distribution for target probabilities
+    # and create target probabilities such that only 1 token id has
+    # probability 1.0
+    target_probs, zero_temperature_token_ids = get_zero_temperature_prob_dist(
+        batch_size, k, vocab_size)
+    # Populate draft_token_ids such that they exclude the token_ids
+    # with probability = 1.0
+    draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size,
+                                          zero_temperature_token_ids)
+    bonus_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, 1),
+                                    dtype=torch.int64)
+    # The target probaility distribution is a temperature zero distribution
+    # with zero entroy. Since our draft token ids don't match the probability
+    # 1.0 tokens in the target distribution we will reject all of them and
+    # fallback to the greedy sampling for selecting 1 token for each sequence.
+    # Verify the same.
+    output_token_ids = typical_acceptance_sampler(target_probs,
+                                                  bonus_token_ids,
+                                                  draft_token_ids)
+    assert output_token_ids.shape[0] == batch_size
+    assert output_token_ids.shape[1] == (k + 1)
+    assert torch.all(output_token_ids[:, -1] == -1)
+    assert torch.all(output_token_ids[:, 0] == zero_temperature_token_ids[:,
+                                                                          0])
+
+
+@pytest.mark.parametrize("seed", list(range(10)))
+@pytest.mark.parametrize("disable_bonus_tokens", [True, False])
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_mixed_target_distribution(seed: int, disable_bonus_tokens: bool,
+                                   device: str):
+    """
+    Test the TypicalAcceptanceSampler with a mixed target probability
+    distribution.
+
+    This test ensures that the TypicalAcceptanceSampler handles a mixed
+    target probability distribution correctly. Specifically, it uses a 
+    zero-temperature distribution for some sequences and a uniform
+    distribution for others. The test verifies that:
+    
+    - For sequences with a zero-temperature distribution, only the token
+    with a probability of 1.0 is accepted, and all other tokens are rejected.
+    - For sequences with a uniform distribution, all draft tokens are
+    accepted.
+    - When `disable_bonus_tokens` is False, the bonus tokens are also accepted
+    for sequences with a uniform distribution.
+    """
+    set_random_seed(seed)
+    k = 3
+    batch_size = 4
+    vocab_size = 30_000
+    torch.set_default_device(device)
+    typical_acceptance_sampler = TypicalAcceptanceSampler(
+        strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
+    typical_acceptance_sampler.init_gpu_tensors(rank=0)
+    # For sequences 0 and 2 set the distribution to a temperature
+    # zero distribution. For sequences 1 and 3 set it to a uniform
+    # distribution.
+    target_probs, zero_temperature_token_ids = (get_zero_temperature_prob_dist(
+        batch_size, k, vocab_size))
+    draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size,
+                                          zero_temperature_token_ids)
+    uniform_probs = torch.rand(2, k, vocab_size, dtype=torch.float32)
+    target_probs[[1, 3]] = uniform_probs
+    bonus_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, 1),
+                                    dtype=torch.int64)
+    output_token_ids = typical_acceptance_sampler(target_probs,
+                                                  bonus_token_ids,
+                                                  draft_token_ids)
+    # verify the shape of output_token_ids
+    assert output_token_ids.shape[0] == batch_size
+    assert output_token_ids.shape[1] == (k + 1)
+    # For sequences 0 and 2 verify that only 1 token is accepted
+    # which is the token with probability 1.0 in the target distribution
+    # at position 0.
+    assert torch.all(output_token_ids[[0, 2], 1:] == -1)
+    assert (torch.all(output_token_ids[[0, 2],
+                                       0] == zero_temperature_token_ids[[0, 2],
+                                                                        0]))
+    # For sequences 1 and 3 verify that all tokens are accepted since the
+    # target probability distribution is uniform. In addition verify that
+    # if disable_bonus_tokens is false then we also accept the bonus tokens.
+    assert torch.all(
+        output_token_ids[[1, 3], :-1] == draft_token_ids[[1, 3], :])
+    if disable_bonus_tokens:
+        assert torch.all(output_token_ids[[1, 3], -1] == -1)
+    else:
+        assert torch.all(output_token_ids[[1, 3], -1] != -1)
+
+
+@pytest.mark.parametrize("seed", list(range(10)))
+@pytest.mark.parametrize("disable_bonus_tokens", [True, False])
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_accept_tokens_partially(seed: int, disable_bonus_tokens: bool,
+                                 device: str):
+    """
+    Test the TypicalAcceptanceSampler's behavior when only a subset of draft
+    tokens should be accepted.
+
+    This test verifies that the TypicalAcceptanceSampler correctly accepts or
+    rejects draft tokens based on a zero-temperature target probability
+    distribution. Specifically, it ensures that:
+    
+    - When all draft tokens match tokens with a probability of 1.0 in the
+    target distribution, all draft tokens are accepted.
+    - When only some draft tokens match tokens with a probability of 1.0 in
+    the target distribution, only those matching tokens are accepted, and the
+    rest are rejected.
+    """
+    set_random_seed(seed)
+    k = 5
+    batch_size = 1
+    vocab_size = 30_000
+    torch.set_default_device(device)
+    typical_acceptance_sampler = TypicalAcceptanceSampler(
+        strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
+    typical_acceptance_sampler.init_gpu_tensors(rank=0)
+    # Create a temperature zero target probability distribution and ensure
+    # all draft token ids correspond to the tokens with 1.0 probability.
+    # Verify that all of them are accepted.
+    target_probs, zero_temperature_token_ids = (get_zero_temperature_prob_dist(
+        batch_size, k, vocab_size))
+    draft_token_ids = zero_temperature_token_ids
+    bonus_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, 1),
+                                    dtype=torch.int64)
+    output_token_ids = typical_acceptance_sampler(target_probs,
+                                                  bonus_token_ids,
+                                                  draft_token_ids)
+    assert output_token_ids.shape[0] == batch_size
+    assert output_token_ids.shape[1] == (k + 1)
+    assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids)
+    if disable_bonus_tokens:
+        assert torch.all(output_token_ids[:, -1] == -1)
+    else:
+        assert torch.all(output_token_ids[:, -1] == bonus_token_ids)
+    # Next only keep the first 2 draft tokens same as the zero temperature
+    # tokens. For the remaining 3 choose some other tokens. In the
+    # response we will expect the first 2 tokens to be the same as the
+    # draft tokens and the rest as -1
+    draft_token_ids_to_replace = get_draft_token_ids(
+        batch_size, k, vocab_size, zero_temperature_token_ids)
+    draft_token_ids = torch.cat(
+        (draft_token_ids[:, :2], draft_token_ids_to_replace[:, -3:]), dim=1)
+    output_token_ids = typical_acceptance_sampler(target_probs,
+                                                  bonus_token_ids,
+                                                  draft_token_ids)
+    assert output_token_ids.shape[0] == batch_size
+    assert output_token_ids.shape[1] == (k + 1)
+    assert torch.all(output_token_ids[:, :2] == draft_token_ids[:, :2])
+    assert torch.all(output_token_ids[:, -3:] == -1)
+
+
+@pytest.mark.parametrize("seed", list(range(1)))
+@pytest.mark.parametrize("disable_bonus_tokens", [True, False])
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_accept_tokens_set_non_default_posteriors(seed: int,
+                                                  disable_bonus_tokens: bool,
+                                                  device: str):
+    """
+    Test the TypicalAcceptanceSampler with custom posterior thresholds and 
+    alpha values. This test verifies that by modifying the posterior
+    thresholds and alpha values we can change the acceptance behavior of the
+    sampler. 
+    """
+    set_random_seed(seed)
+    k = 5
+    batch_size = 1
+    vocab_size = 30_000
+    torch.set_default_device(device)
+    typical_acceptance_sampler = TypicalAcceptanceSampler(
+        strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
+    typical_acceptance_sampler.init_gpu_tensors(rank=0)
+    # Simulate temperature 0 probability distribution for target
+    # probabilities and create target probabilities such that only 1 token
+    # id has probability 1.0 and others have a very low probability of
+    # 0.00001. Populate draft_token_ids such that they exclude the token_ids
+    # with probability = 1.0. Without any changes to the posterior thresholds
+    # none of the draft tokens are accepted.
+    target_probs, zero_temperature_token_ids = (get_zero_temperature_prob_dist(
+        batch_size, k, vocab_size))
+    target_probs[target_probs == 0] = 0.00001
+    draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size,
+                                          zero_temperature_token_ids)
+    bonus_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, 1),
+                                    dtype=torch.int64)
+    output_token_ids = typical_acceptance_sampler(target_probs,
+                                                  bonus_token_ids,
+                                                  draft_token_ids)
+    assert output_token_ids.shape[0] == batch_size
+    assert output_token_ids.shape[1] == (k + 1)
+    assert torch.all(output_token_ids[:, 1:-1] == -1)
+
+    # Change the posterior threshold values to 0.0 so that we will
+    # now accept even draft tokens with very low probability in the
+    # target distribution. Simulate and verify the same.
+    typical_acceptance_sampler = TypicalAcceptanceSampler(
+        strict_mode=True,
+        disable_bonus_tokens=disable_bonus_tokens,
+        posterior_threshold=0.0,
+        posterior_alpha=0.0)
+    typical_acceptance_sampler.init_gpu_tensors(rank=0)
+    output_token_ids = typical_acceptance_sampler(target_probs,
+                                                  bonus_token_ids,
+                                                  draft_token_ids)
+    assert output_token_ids.shape[0] == batch_size
+    assert output_token_ids.shape[1] == (k + 1)
+    assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids)
+    if disable_bonus_tokens:
+        assert torch.all(output_token_ids[:, -1] == -1)
+    else:
+        assert torch.all(output_token_ids[:, -1] == bonus_token_ids)
+
+
+@pytest.mark.parametrize("seed", list(range(10)))
+@pytest.mark.parametrize("disable_bonus_tokens", [True, False])
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_replacement_token_ids(seed: int, disable_bonus_tokens: bool,
+                               device: str):
+    """
+    Test the TypicalAcceptanceSampler's method for generating
+    replacement token IDs.
+
+    This test verifies that the `_replacement_token_ids` method of the 
+    TypicalAcceptanceSampler correctly identifies the token IDs to be used
+    as replacements based on the target probability distribution.
+    Specifically, it ensures that the method correctly identifies the
+    tokens with the highest probability for each sequence in the batch.
+    """
+    set_random_seed(seed)
+    k = 10
+    batch_size = 5
+    vocab_size = 30_000
+    torch.set_default_device(device)
+    typical_acceptance_sampler = TypicalAcceptanceSampler(
+        strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
+    typical_acceptance_sampler.init_gpu_tensors(rank=0)
+    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
+    expected_replacement_tokens = -torch.ones(
+        (batch_size, k), dtype=torch.long)
+    expected_replacement_tokens[:, 0] = torch.argmax(target_probs[:, 0, :],
+                                                     dim=1)
+    actual_replacement_tokens = (
+        typical_acceptance_sampler._replacement_token_ids(target_probs))
+    assert torch.all(expected_replacement_tokens == actual_replacement_tokens)
diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py
index a80703155..fe9b2fac1 100644
--- a/vllm/model_executor/layers/rejection_sampler.py
+++ b/vllm/model_executor/layers/rejection_sampler.py
@@ -1,12 +1,15 @@
 from functools import cached_property
-from typing import Optional, Tuple
+from typing import Tuple
 
 import torch
 import torch.jit
 import torch.nn as nn
 
+from vllm.model_executor.layers.spec_decode_base_sampler import (
+    SpecDecodeBaseSampler)
 
-class RejectionSampler(nn.Module):
+
+class RejectionSampler(SpecDecodeBaseSampler, nn.Module):
     """Apply modified rejection sampling as described in "Accelerating Large
         Language Model Decoding with Speculative Sampling"
         https://arxiv.org/pdf/2302.01318.pdf.
@@ -22,39 +25,11 @@ class RejectionSampler(nn.Module):
             Require when bonus tokens will cause corrupt KV cache for
             proposal methods that require KV cache.
             strict_mode: Whether or not to perform shape/device/dtype checks
-                during sampling. This catches correctness issues but adds
-                nontrivial latency.
+            during sampling. This catches correctness issues but adds
+            nontrivial latency.
         """
-        super().__init__()
-        self._disable_bonus_tokens = disable_bonus_tokens
-        self._strict_mode = strict_mode
-
-        # NOTE: A "bonus token" is accepted iff all proposal tokens are
-        # accepted. There is always only one possible bonus token. We store this
-        # value in a variable for readability.
-        self._num_bonus_tokens = 1
-
-        self.num_accepted_tokens: Optional[torch.Tensor] = None
-        self.num_emitted_tokens: Optional[torch.Tensor] = None
-        self.num_draft_tokens: int = 0
-
-    def init_gpu_tensors(self, rank: int) -> None:
-        assert self.num_accepted_tokens is None
-        device = f"cuda:{rank}"
-        self.num_accepted_tokens = torch.tensor(0,
-                                                dtype=torch.long,
-                                                device=device)
-        self.num_emitted_tokens = torch.tensor(0,
-                                               dtype=torch.long,
-                                               device=device)
-
-    @property
-    def probs_dtype(self):
-        return torch.float32
-
-    @property
-    def token_id_dtype(self):
-        return torch.int64
+        SpecDecodeBaseSampler.__init__(self, disable_bonus_tokens, strict_mode)
+        nn.Module.__init__(self)
 
     def forward(
         self,
@@ -100,15 +75,8 @@ class RejectionSampler(nn.Module):
         # Only perform shape/dtype/device checking in strict mode, as it adds
         # overhead.
         if self._strict_mode:
-            self._raise_if_incorrect_shape(target_probs, bonus_token_ids,
-                                           draft_probs, draft_token_ids)
-            self._raise_if_incorrect_dtype(target_probs, bonus_token_ids,
+            self._raise_if_incorrect_input(target_probs, bonus_token_ids,
                                            draft_probs, draft_token_ids)
-            self._raise_if_inconsistent_device(target_probs, bonus_token_ids,
-                                               draft_probs, draft_token_ids)
-            self._raise_if_out_of_bounds_vocab(target_probs.shape[-1],
-                                               bonus_token_ids,
-                                               draft_token_ids)
 
         accepted, recovered_token_ids = self._batch_modified_rejection_sampling(
             target_probs,
@@ -272,128 +240,6 @@ class RejectionSampler(nn.Module):
         """
         return torch.finfo(self.probs_dtype).tiny
 
-    def _create_output(
-            self,
-            accepted: torch.Tensor,  # [batch_size, k]
-            recovered_token_ids: torch.Tensor,  # [batch_size, k]
-            draft_token_ids: torch.Tensor,  # [batch_size, k]
-            bonus_token_ids: torch.Tensor,  # [batch_size]
-    ) -> torch.Tensor:
-        """Format output. Returns a matrix of token ids. When
-        a token is rejected via rejection sampling, all subsequent
-        token ids are set to -1 for the sequence.
-
-        shape = [batch_size, k + num_bonus_tokens]
-        """
-        bonus_token_ids = bonus_token_ids.squeeze()
-        batch_size, k = recovered_token_ids.shape
-
-        # Determine the index of the first False value for each row.
-        limits = (accepted == 0).max(1).indices
-        limits[~(accepted == 0).any(1)] = k
-
-        # Create masks using the indices.
-        indices = torch.arange(k, device=accepted.device).unsqueeze(0)
-        accepted_mask = indices < limits.unsqueeze(1)
-        after_false_mask = indices == limits.unsqueeze(1)
-
-        # Create an extended output tensor
-        output_with_bonus_tokens = -torch.ones(
-            (batch_size, k + self._num_bonus_tokens),
-            dtype=self.token_id_dtype,
-            device=accepted.device)
-        output = output_with_bonus_tokens[:, :k]
-
-        # Fill in the first k columns of the output tensor using masks and data
-        # tensors.
-        torch.where(accepted_mask,
-                    draft_token_ids,
-                    -torch.ones_like(draft_token_ids),
-                    out=output)
-
-        # Fill the last column.
-        # We check output directly as accepted may have True values inconsistent
-        # with causal acceptance.
-        output_with_bonus_tokens[:, -1] = torch.where(output[:, -1] != -1,
-                                                      bonus_token_ids, -1)
-
-        # We disable bonus tokens because it causes corrupt KV cache for
-        # proposal methods that require KV cache. We can fix it by "prefilling"
-        # the bonus token in the proposer. The following issue tracks the fix.
-        # https://github.com/vllm-project/vllm/issues/4212
-        if self._disable_bonus_tokens:
-            output_with_bonus_tokens[:, -1] = -1
-
-        # Fill the recovered token ids.
-        output.mul_(~after_false_mask).add_(
-            recovered_token_ids.mul(after_false_mask))
-
-        self.num_accepted_tokens += accepted.sum()
-        self.num_emitted_tokens += (output_with_bonus_tokens != -1).sum()
-        self.num_draft_tokens += batch_size * k
-
-        return output_with_bonus_tokens
-
-    def _raise_if_incorrect_shape(
-        self,
-        target_probs: torch.Tensor,
-        bonus_token_ids: torch.Tensor,
-        draft_probs: torch.Tensor,
-        draft_token_ids: torch.Tensor,
-    ) -> None:
-        (target_batch_size, num_target_probs,
-         target_vocab_size) = target_probs.shape
-        bonus_batch_size, num_bonus_tokens = bonus_token_ids.shape
-        draft_batch_size, num_draft_probs, draft_vocab_size = draft_probs.shape
-        draft_token_ids_batch_size, num_draft_token_ids = draft_token_ids.shape
-
-        assert draft_batch_size == target_batch_size
-        assert num_draft_probs == num_target_probs
-        assert (draft_vocab_size == target_vocab_size
-                ), f"{draft_vocab_size=} {target_vocab_size=}"
-
-        assert draft_token_ids_batch_size == draft_batch_size
-        assert num_draft_token_ids == num_draft_probs
-
-        assert bonus_batch_size == target_batch_size
-        assert num_bonus_tokens == self._num_bonus_tokens
-
-    def _raise_if_incorrect_dtype(
-        self,
-        target_probs: torch.Tensor,
-        bonus_token_ids: torch.Tensor,
-        draft_probs: torch.Tensor,
-        draft_token_ids: torch.Tensor,
-    ) -> None:
-        assert all(probs.dtype == self.probs_dtype
-                   for probs in [target_probs, draft_probs])
-        assert all(token_ids.dtype == self.token_id_dtype
-                   for token_ids in [bonus_token_ids, draft_token_ids])
-
-    def _raise_if_inconsistent_device(
-        self,
-        target_probs: torch.Tensor,
-        bonus_token_ids: torch.Tensor,
-        draft_probs: torch.Tensor,
-        draft_token_ids: torch.Tensor,
-    ) -> None:
-        devices = [
-            t.device for t in
-            [target_probs, bonus_token_ids, draft_probs, draft_token_ids]
-        ]
-        assert all([devices[0] == device for device in devices])
-
-    def _raise_if_out_of_bounds_vocab(
-        self,
-        vocab_size: int,
-        bonus_token_ids: torch.Tensor,
-        draft_token_ids: torch.Tensor,
-    ) -> None:
-        assert torch.all(bonus_token_ids < vocab_size)
-        assert torch.all(bonus_token_ids >= 0)
-        assert torch.all(draft_token_ids < vocab_size)
-        assert torch.all(draft_token_ids >= 0)
-
 
 # torch.multinomial forces a GPU<->CPU sync.
 # Therefore, we use an optimized implementation instead that skips the sync.
diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py
new file mode 100644
index 000000000..9856a7e7d
--- /dev/null
+++ b/vllm/model_executor/layers/spec_decode_base_sampler.py
@@ -0,0 +1,206 @@
+from typing import Optional
+
+import torch
+
+
+class SpecDecodeBaseSampler():
+    """Base class for samplers used for Speculative Decoding verification
+        step.
+    """
+
+    def __init__(self,
+                 disable_bonus_tokens: bool = True,
+                 strict_mode: bool = False):
+        """Base class constructor.
+        Args:
+            disable_bonus_tokens: Whether or not to disable the bonus token.
+            Require when bonus tokens will cause corrupt KV cache for
+            proposal methods that require KV cache.
+            strict_mode: Whether or not to perform shape/device/dtype checks
+                during sampling. This catches correctness issues but adds
+                nontrivial latency.
+        """
+        super().__init__()
+        self._disable_bonus_tokens = disable_bonus_tokens
+        self._strict_mode = strict_mode
+
+        # NOTE: A "bonus token" is accepted iff all proposal tokens are
+        # accepted. There is always only one possible bonus token. We store this
+        # value in a variable for readability.
+        self._num_bonus_tokens = 1
+
+        self.num_accepted_tokens: Optional[torch.Tensor] = None
+        self.num_emitted_tokens: Optional[torch.Tensor] = None
+        self.num_draft_tokens: int = 0
+
+    def init_gpu_tensors(self, rank: int) -> None:
+        assert self.num_accepted_tokens is None
+        device = f"cuda:{rank}"
+        self.num_accepted_tokens = torch.tensor(0,
+                                                dtype=torch.long,
+                                                device=device)
+        self.num_emitted_tokens = torch.tensor(0,
+                                               dtype=torch.long,
+                                               device=device)
+
+    @property
+    def probs_dtype(self):
+        return torch.float32
+
+    @property
+    def token_id_dtype(self):
+        return torch.int64
+
+    def _create_output(
+            self,
+            accepted: torch.Tensor,  # [batch_size, k]
+            substitute_token_ids: torch.Tensor,  # [batch_size, k]
+            draft_token_ids: torch.Tensor,  # [batch_size, k]
+            bonus_token_ids: torch.Tensor,  # [batch_size]
+    ) -> torch.Tensor:
+        """Format output. Returns a matrix of token ids. When
+        a token is rejected via sampling, all subsequent token ids are 
+        set to -1 for the sequence.
+
+        Args:
+            accepted: A boolean tensor indicating if the corresponding
+            draft token in draft_token_ids should be accepted or not.
+            substitute_token_ids: A tensor of token_ids that can be used
+            as substitutes for the draft token ids if the proposed token
+            is rejected.
+            draft_token_ids: A tensor of token ids speculated by the 
+            draft model.
+            bonus_token_ids: Token ids to use as the bonus token if
+            all the draft tokens are accepted.
+        Returns:
+            A tensor containing the accepted token ids. The shape of the 
+            tensor is [batch_size, k + num_bonus_tokens]
+        """
+        batch_size, k = substitute_token_ids.shape
+        bonus_token_ids = bonus_token_ids.squeeze()
+        # Determine the index of the first False value for each row.
+        limits = (accepted == 0).max(1).indices
+        limits[~(accepted == 0).any(1)] = k
+
+        # Create masks using the indices.
+        indices = torch.arange(k, device=accepted.device).unsqueeze(0)
+        accepted_mask = indices < limits.unsqueeze(1)
+        after_false_mask = indices == limits.unsqueeze(1)
+
+        # Create an extended output tensor
+        output_with_bonus_tokens = -torch.ones(
+            (batch_size, k + self._num_bonus_tokens),
+            dtype=self.token_id_dtype,
+            device=accepted.device)
+        output = output_with_bonus_tokens[:, :k]
+
+        # Fill in the first k columns of the output tensor using masks and data
+        # tensors.
+        output[:, :k] = torch.where(accepted_mask, draft_token_ids,
+                                    -torch.ones_like(draft_token_ids))
+
+        # Fill the last column.
+        # We check output directly as accepted may have True values inconsistent
+        # with causal acceptance.
+        output_with_bonus_tokens[:, -1] = torch.where(output[:, -1] != -1,
+                                                      bonus_token_ids, -1)
+
+        # We disable bonus tokens because it causes corrupt KV cache for
+        # proposal methods that require KV cache. We can fix it by "prefilling"
+        # the bonus token in the proposer. The following issue tracks the fix.
+        # https://github.com/vllm-project/vllm/issues/4212
+        if self._disable_bonus_tokens:
+            output_with_bonus_tokens[:, -1] = -1
+
+        # Fill the recovered token ids.
+        output.mul_(~after_false_mask).add_(
+            substitute_token_ids.mul(after_false_mask))
+
+        self.num_accepted_tokens += accepted.sum()
+        self.num_emitted_tokens += (output_with_bonus_tokens != -1).sum()
+        self.num_draft_tokens += batch_size * k
+
+        return output_with_bonus_tokens
+
+    def _raise_if_incorrect_input(
+        self,
+        target_probs: torch.Tensor,
+        draft_token_ids: torch.Tensor,
+        bonus_token_ids: torch.Tensor,
+        draft_probs: Optional[torch.Tensor] = None,
+    ) -> None:
+        self._raise_if_incorrect_shape(target_probs, draft_token_ids,
+                                       bonus_token_ids, draft_probs)
+        self._raise_if_incorrect_dtype(target_probs, draft_token_ids,
+                                       bonus_token_ids, draft_probs)
+        self._raise_if_inconsistent_device(target_probs, draft_token_ids,
+                                           bonus_token_ids, draft_probs)
+        self._raise_if_out_of_bounds_vocab(target_probs.shape[-1],
+                                           draft_token_ids, bonus_token_ids)
+
+    def _raise_if_incorrect_shape(
+        self,
+        target_probs: torch.Tensor,
+        draft_token_ids: torch.Tensor,
+        bonus_token_ids: torch.Tensor,
+        draft_probs: Optional[torch.Tensor] = None,
+    ) -> None:
+        (target_batch_size, num_target_probs,
+         target_vocab_size) = target_probs.shape
+
+        # validate the shape of draft token ids.
+        draft_token_ids_batch_size, num_draft_token_ids = draft_token_ids.shape
+        assert draft_token_ids_batch_size == target_batch_size
+        assert num_draft_token_ids == num_target_probs
+
+        # validate the shape of bonus token ids
+        bonus_batch_size, num_bonus_tokens = bonus_token_ids.shape
+        assert bonus_batch_size == target_batch_size
+        assert num_bonus_tokens == self._num_bonus_tokens
+
+        # validate the shape of draft probs if it is set
+        if draft_probs is not None:
+            (draft_batch_size, num_draft_probs,
+             draft_vocab_size) = draft_probs.shape
+            assert draft_batch_size == target_batch_size
+            assert num_draft_probs == num_target_probs
+            assert (draft_vocab_size == target_vocab_size
+                    ), f"{draft_vocab_size=} {target_vocab_size=}"
+
+    def _raise_if_incorrect_dtype(
+        self,
+        target_probs: torch.Tensor,
+        draft_token_ids: torch.Tensor,
+        bonus_token_ids: torch.Tensor,
+        draft_probs: Optional[torch.Tensor] = None,
+    ) -> None:
+        assert target_probs.dtype == self.probs_dtype
+        assert draft_token_ids.dtype == self.token_id_dtype
+        assert bonus_token_ids.dtype == self.token_id_dtype
+        if draft_probs is not None:
+            assert draft_probs.dtype == self.probs_dtype
+
+    def _raise_if_inconsistent_device(
+        self,
+        target_probs: torch.Tensor,
+        draft_token_ids: torch.Tensor,
+        bonus_token_ids: torch.Tensor,
+        draft_probs: Optional[torch.Tensor] = None,
+    ) -> None:
+        devices = [
+            t.device for t in
+            [target_probs, bonus_token_ids, draft_probs, draft_token_ids]
+            if t is not None
+        ]
+        assert all([devices[0] == device for device in devices])
+
+    def _raise_if_out_of_bounds_vocab(
+        self,
+        vocab_size: int,
+        draft_token_ids: torch.Tensor,
+        bonus_token_ids: torch.Tensor,
+    ) -> None:
+        assert torch.all(bonus_token_ids < vocab_size)
+        assert torch.all(bonus_token_ids >= 0)
+        assert torch.all(draft_token_ids < vocab_size)
+        assert torch.all(draft_token_ids >= 0)
diff --git a/vllm/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py
new file mode 100644
index 000000000..f12d6a03b
--- /dev/null
+++ b/vllm/model_executor/layers/typical_acceptance_sampler.py
@@ -0,0 +1,186 @@
+import torch
+import torch.jit
+import torch.nn as nn
+
+from vllm.model_executor.layers.spec_decode_base_sampler import (
+    SpecDecodeBaseSampler)
+
+
+class TypicalAcceptanceSampler(SpecDecodeBaseSampler, nn.Module):
+    """Apply typical acceptance sampling as described in section 3.3.1 in 
+        "MEDUSA: Simple LLM Inference Acceleration Framework with 
+        Multiple Decoding Heads"
+        https://arxiv.org/pdf/2401.10774
+    """
+
+    def __init__(
+        self,
+        disable_bonus_tokens: bool = False,
+        strict_mode: bool = False,
+        posterior_threshold: float = 0.09,
+        posterior_alpha: float = 0.3,
+    ):
+        """Create a Typical Acceptance Sampler.
+
+        Args:
+            disable_bonus_tokens: Whether or not to disable the bonus token.
+            Require when bonus tokens will cause corrupt KV cache for
+            proposal methods that require KV cache.
+            strict_mode: Whether or not to perform shape/device/dtype checks
+            during sampling. This catches correctness issues but adds
+            nontrivial latency.
+            posterior_threshold : A threshold value that sets a lower bound 
+            on the posterior probability of a token in target model for it
+            to be accepted. Default is 0.09
+            posterior_alpha : A scaling factor for the entropy-based
+            threshold in typical acceptance sampling. Typically defaults to
+            sqrt of posterior_threshold and is set to 0.3.
+        """
+        SpecDecodeBaseSampler.__init__(
+            self,
+            disable_bonus_tokens=disable_bonus_tokens,
+            strict_mode=strict_mode)
+        nn.Module.__init__(self)
+        self._posterior_threshold = posterior_threshold
+        self._posterior_alpha = posterior_alpha
+
+    def forward(
+        self,
+        target_probs: torch.Tensor,
+        bonus_token_ids: torch.Tensor,
+        draft_token_ids: torch.Tensor,
+    ) -> torch.Tensor:
+        """Sample token ids using typical acceptance sampling. This accepts 
+        or rejects tokens proposed by the draft model using the probability
+        of each token according to the draft and target models.
+
+        In the worst case where all draft tokens are rejected, it is guaranteed
+        one token will be emitted.
+
+        In the case where all draft tokens are accepted, the bonus token will be
+        accepted conditioned on self._disable_bonus_tokens being false.
+
+        Args:
+            target_probs: The probability distribution over token ids given
+                context according to the target model.
+            shape = [batch_size, num_speculative_tokens, vocab_size]
+
+            bonus_token_ids: The "bonus" token ids that are accepted iff all
+                speculative tokens in a sequence are accepted.
+            shape = [batch_size, num_bonus_tokens]
+
+            draft_token_ids: The token ids that were sampled from the draft
+                probabilities.
+            shape = [batch_size, num_speculative_tokens]
+
+        Returns:
+            output_token_ids: The token ids sampled via rejection sampling,
+                or -1 if unable to sample a token because the previous token
+                was rejected.
+            shape = [batch_size, num_speculative_tokens + num_bonus_tokens]
+        """
+        # Only perform shape/dtype/device checking in strict mode, as it adds
+        # overhead.
+        if self._strict_mode:
+            self._raise_if_incorrect_input(target_probs, draft_token_ids,
+                                           bonus_token_ids)
+        accepted = self._evaluate_accepted_tokens(target_probs,
+                                                  draft_token_ids)
+        recovered_token_ids = self._replacement_token_ids(target_probs)
+        output_token_ids = self._create_output(accepted, recovered_token_ids,
+                                               draft_token_ids,
+                                               bonus_token_ids)
+        return output_token_ids
+
+    def _evaluate_accepted_tokens(self, target_probs, draft_token_ids):
+        r"""
+        Evaluates and returns a mask of accepted tokens based on the
+        posterior probabilities.
+
+        Parameters:
+        ----------
+        target_probs : torch.Tensor
+            A tensor of shape (batch_size, k, vocab_size) representing 
+            the probabilities of each token in the vocabulary for each
+            position in the proposed sequence. This is the distribution
+            generated by the target model.
+        draft_token_ids : torch.Tensor
+            A tensor of shape (batch_size, k) representing the proposed
+            token ids.
+
+        A draft token_id x_{n+k} is accepted if it satisfies the
+        following condition
+    
+        .. math::
+            p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) > 
+            \min \left( \epsilon, \delta * \exp \left(
+                -H(p_{\text{original}}(
+                    \cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right)
+        
+        where :math:`p_{\text{original}}` corresponds to target_probs 
+        and :math:`\epsilon` and :math:`\delta` correspond to hyperparameters
+        specified using self._posterior_threshold and self._posterior_alpha
+
+        This method computes the posterior probabilities for the given
+        draft token ids based on the provided target probabilities. It
+        calculates the entropy of the posterior distribution and determines
+        a dynamic threshold for each token position using the provided
+        posterior_threshold and posterior_alpha values. The method then
+        returns a boolean mask indicating which tokens can be accepted.
+
+        Returns:
+        -------
+        torch.Tensor
+            A boolean tensor of shape (batch_size, k) where each element
+            indicates whether the corresponding draft token has been accepted
+            or rejected. True indicates acceptance and false indicates
+            rejection.
+            
+        """
+        device = target_probs.device
+        candidates_prob = torch.gather(
+            target_probs, dim=-1,
+            index=draft_token_ids.unsqueeze(-1)).squeeze(-1)
+        # A small constant added to prevent computing the logarithm of zero,
+        # which can lead to undefined values.
+        epsilon = 1e-5
+        posterior_entropy = -torch.sum(
+            target_probs * torch.log(target_probs + epsilon), dim=-1)
+        threshold = torch.minimum(
+            torch.ones_like(posterior_entropy, device=device) *
+            self._posterior_threshold,
+            torch.exp(-posterior_entropy) * self._posterior_alpha,
+        )
+        accepted_mask = candidates_prob > threshold
+        return accepted_mask
+
+    def _replacement_token_ids(self, target_probs):
+        """
+        Generate one replacement token ID for each sequence based on target
+        probabilities. The replacement token is used as the fallback option
+        if typical acceptance sampling does not accept any draft tokens for
+        that particular sequence. 
+
+        This method computes the token IDs to be replaced by selecting the
+        token with the highest probability for each sequence in the first 
+        position. The rest of the output is filled with -1. 
+
+        Parameters
+        ----------
+        target_probs : torch.Tensor
+            A tensor of shape (batch_size, k, vocab_size) containing 
+            the target probability distribution
+
+        Returns
+        -------
+        torch.Tensor
+            A tensor of shape (batch_size, k) with the replacement 
+            token IDs. Only the first column is set, and the rest of the
+            columns are filled with -1.
+        """
+        max_indices = torch.argmax(target_probs[:, 0, :], dim=1)
+        output = -torch.ones((target_probs.shape[0], target_probs.shape[1]),
+                             dtype=self.token_id_dtype,
+                             device=target_probs.device)
+        output[:, 0] = max_indices
+        return output
-- 
GitLab


From daef218b5595a8c744ee143223f4f0544619ea9f Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Tue, 18 Jun 2024 10:34:33 +0800
Subject: [PATCH 076/376] [Model] Initialize Phi-3-vision support (#4986)

---
 docs/source/models/supported_models.rst |   4 +
 examples/phi3v_example.py               |  57 ++++
 requirements-test.txt                   |   1 +
 tests/conftest.py                       |   3 +
 tests/models/test_phi3v.py              | 124 ++++++++
 vllm/model_executor/models/__init__.py  |   1 +
 vllm/model_executor/models/phi3v.py     | 379 ++++++++++++++++++++++++
 vllm/multimodal/utils.py                |   2 +
 8 files changed, 571 insertions(+)
 create mode 100644 examples/phi3v_example.py
 create mode 100644 tests/models/test_phi3v.py
 create mode 100644 vllm/model_executor/models/phi3v.py

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 5d3f55be1..f4673dc27 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -135,6 +135,10 @@ Alongside each architecture, we include some popular models that use it.
     - Phi-3-Small
     - :code:`microsoft/Phi-3-small-8k-instruct`, :code:`microsoft/Phi-3-small-128k-instruct`, etc.
     -
+  * - :code:`Phi3VForCausalLM`
+    - Phi-3-Vision
+    - :code:`microsoft/Phi-3-vision-128k-instruct`, etc.
+    -
   * - :code:`QWenLMHeadModel`
     - Qwen
     - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py
new file mode 100644
index 000000000..d5e60ae1e
--- /dev/null
+++ b/examples/phi3v_example.py
@@ -0,0 +1,57 @@
+import os
+import subprocess
+
+from PIL import Image
+
+from vllm import LLM, SamplingParams
+from vllm.multimodal.image import ImagePixelData
+
+
+def run_phi3v():
+    model_path = "microsoft/Phi-3-vision-128k-instruct"
+    llm = LLM(
+        model=model_path,
+        trust_remote_code=True,
+        max_model_len=4096,
+        image_input_type="pixel_values",
+        image_token_id=32044,
+        image_input_shape="1,3,1008,1344",
+        image_feature_size=1921,
+        disable_image_processor=False,
+    )
+
+    image = Image.open("images/cherry_blossom.jpg")
+
+    # single-image prompt
+    prompt = "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n"  # noqa: E501
+    prompt = prompt.replace("<|image_1|>", "<|image|>" * 1921 + "<s>")
+
+    sampling_params = SamplingParams(temperature=0, max_tokens=64)
+
+    outputs = llm.generate({
+        "prompt": prompt,
+        "sampling_params": sampling_params,
+        "multi_modal_data": ImagePixelData(image),
+    })
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+
+
+if __name__ == "__main__":
+    s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
+    local_directory = "images"
+
+    # Make sure the local directory exists or create it
+    os.makedirs(local_directory, exist_ok=True)
+
+    # Use AWS CLI to sync the directory, assume anonymous access
+    subprocess.check_call([
+        "aws",
+        "s3",
+        "sync",
+        s3_bucket_path,
+        local_directory,
+        "--no-sign-request",
+    ])
+    run_phi3v()
diff --git a/requirements-test.txt b/requirements-test.txt
index 8b68e0e93..fef0ede7b 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -14,6 +14,7 @@ peft
 requests
 ray
 sentence-transformers # required for embedding
+torchvision # required for the image processor of phi3v
 
 # Benchmarking
 aiohttp
diff --git a/tests/conftest.py b/tests/conftest.py
index 18aea3702..f37c9883f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -144,6 +144,7 @@ class HfRunner:
         model_name: str,
         dtype: str = "half",
         *,
+        model_kwargs: Optional[Dict[str, Any]] = None,
         is_embedding_model: bool = False,
         is_vision_model: bool = False,
     ) -> None:
@@ -166,11 +167,13 @@ class HfRunner:
             else:
                 auto_cls = AutoModelForCausalLM
 
+            model_kwargs = model_kwargs if model_kwargs is not None else {}
             self.model = self.wrap_device(
                 auto_cls.from_pretrained(
                     model_name,
                     torch_dtype=torch_dtype,
                     trust_remote_code=True,
+                    **model_kwargs,
                 ))
 
         self.tokenizer = AutoTokenizer.from_pretrained(
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
new file mode 100644
index 000000000..607ad95e8
--- /dev/null
+++ b/tests/models/test_phi3v.py
@@ -0,0 +1,124 @@
+from typing import List, Tuple
+
+import pytest
+from transformers import AutoTokenizer
+
+from vllm.config import VisionLanguageConfig
+from vllm.utils import is_cpu
+
+from ..conftest import IMAGE_FILES
+
+pytestmark = pytest.mark.llava
+
+# The image token is placed before "user" on purpose so that the test can pass
+HF_IMAGE_PROMPTS = [
+    "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
+    "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n",
+]
+
+assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES)
+
+
+def iter_phi3v_configs(model_name: str):
+    image_hw_to_feature_size = {
+        (1008, 1344): 1921,
+    }
+
+    for (h, w), f in image_hw_to_feature_size.items():
+        for input_type, input_shape in [
+            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
+        ]:
+            yield (model_name,
+                   VisionLanguageConfig(image_input_type=input_type,
+                                        image_feature_size=f,
+                                        image_token_id=32044,
+                                        image_input_shape=input_shape,
+                                        image_processor=model_name,
+                                        image_processor_revision=None))
+
+
+model_and_vl_config = [
+    *iter_phi3v_configs("microsoft/Phi-3-vision-128k-instruct"),
+]
+
+
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
+                      vlm_config: VisionLanguageConfig, model_id: str):
+    """Sanitize vllm output to be comparable with hf output.
+    The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
+    x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
+    It also reduces `output_str` from "<image><image>bla" to "bla".
+    """
+    input_ids, output_str = vllm_output
+    image_token_id = vlm_config.image_token_id
+
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    image_token_str = tokenizer.decode(image_token_id)
+
+    hf_input_ids = [
+        input_id if input_id != image_token_id else 0
+        for idx, input_id in enumerate(input_ids)
+    ]
+    hf_output_str = output_str \
+        .replace(image_token_str * vlm_config.image_feature_size, "") \
+        .replace("<s>", " ").replace("<|user|>", "") \
+        .replace("<|end|>\n<|assistant|>", " ")
+
+    return hf_input_ids, hf_output_str
+
+
+target_dtype = "half"
+if is_cpu():
+    target_dtype = "bfloat16"
+
+
+# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
+# Since we use _attn_implementation="eager" for hf_runner, here is
+# numeric difference for longer context and test can't pass
+@pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [8])
+def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
+                model_and_config, dtype: str, max_tokens: int) -> None:
+    """Inference result should be the same between hf and vllm.
+
+    All the image fixtures for the test is under tests/images.
+    For huggingface runner, we provide the PIL images as input.
+    For vllm runner, we provide MultiModalData objects and corresponding
+    vision language config as input.
+    Note, the text input is also adjusted to abide by vllm contract.
+    The text output is sanitized to be able to compare with hf.
+    """
+    model_id, vlm_config = model_and_config
+
+    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
+    hf_model_kwargs = {"_attn_implementation": "eager"}
+    with hf_runner(model_id, dtype=dtype,
+                   model_kwargs=hf_model_kwargs) as hf_model:
+        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
+                                              max_tokens,
+                                              images=hf_images)
+
+    vllm_image_prompts = [
+        p.replace("<|image_1|>",
+                  "<|image|>" * vlm_config.image_feature_size + "<s>")
+        for p in HF_IMAGE_PROMPTS
+    ]
+
+    with vllm_runner(model_id,
+                     max_model_len=2048,
+                     dtype=dtype,
+                     enforce_eager=True,
+                     **vlm_config.as_cli_args_dict()) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
+                                                  max_tokens,
+                                                  images=vllm_images)
+
+    for i in range(len(HF_IMAGE_PROMPTS)):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        vllm_output_ids, vllm_output_str = vllm_to_hf_output(
+            vllm_outputs[i], vlm_config, model_id)
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index bed6f518c..f9ec72096 100755
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -49,6 +49,7 @@ _GENERATION_MODELS = {
     "OrionForCausalLM": ("orion", "OrionForCausalLM"),
     "PhiForCausalLM": ("phi", "PhiForCausalLM"),
     "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"),
+    "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
     "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
     "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
     "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"),
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
new file mode 100644
index 000000000..e8f190d3f
--- /dev/null
+++ b/vllm/model_executor/models/phi3v.py
@@ -0,0 +1,379 @@
+# coding=utf-8
+# Copyright 2024 The vLLM team.
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
+
+import torch
+import torch.nn as nn
+from transformers import CLIPVisionConfig, CLIPVisionModel, PretrainedConfig
+from transformers.utils import logging
+
+from vllm.attention import AttentionMetadata
+from vllm.config import CacheConfig, VisionLanguageConfig
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.llama import LlamaModel
+from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.image import get_dummy_image_data
+from vllm.sequence import SamplerOutput
+
+logger = logging.get_logger(__name__)
+
+_KEYS_TO_MODIFY_MAPPING = {
+    "model.vision_embed_tokens": "vision_embed_tokens",
+}
+
+CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(dropout=0.0,
+                                                     hidden_act="quick_gelu",
+                                                     hidden_size=1024,
+                                                     image_size=336,
+                                                     intermediate_size=4096,
+                                                     num_attention_heads=16,
+                                                     num_channels=3,
+                                                     num_hidden_layers=24,
+                                                     patch_size=14,
+                                                     projection_dim=768)
+
+
+class Phi3ImageEmbeddingBase(nn.Module):
+
+    def __init__(self, wte=None) -> None:
+        super().__init__()
+        self.wte = wte
+        self.layer_idx: int
+        self.type_feature: str
+        self.img_processor: CLIPVisionModel
+
+    def set_img_features(self, img_features: torch.FloatTensor) -> None:
+        self.img_features = img_features
+
+    def set_img_sizes(self, img_sizes: torch.LongTensor) -> None:
+        self.img_sizes = img_sizes
+
+    def get_img_features(self,
+                         img_embeds: torch.FloatTensor) -> torch.FloatTensor:
+        LAYER_IDX = self.layer_idx
+        TYPE_FEATURE = self.type_feature
+
+        img_processor_output = self.img_processor(img_embeds,
+                                                  output_hidden_states=True)
+        img_feature = img_processor_output.hidden_states[LAYER_IDX]
+
+        if TYPE_FEATURE == "patch":
+            patch_feature = img_feature[:, 1:]
+            return patch_feature
+
+        if TYPE_FEATURE == "cls_patch":
+            return img_feature
+
+        raise NotImplementedError
+
+
+# adapted from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_embedding_phi3_v.py
+class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
+    """Phi3 Image embedding with HD transform."""
+
+    def __init__(self,
+                 vision_language_config: VisionLanguageConfig,
+                 config: PretrainedConfig,
+                 wte=None) -> None:
+        super().__init__(wte)
+
+        self.image_token_id = vision_language_config.image_token_id
+        # n_embed or hidden_size
+        hidden_size = config.n_embd if hasattr(
+            config, 'n_embd') else config.hidden_size
+
+        clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG
+        self.img_processor = CLIPVisionModel(clip_config)
+        image_dim_out = config.img_processor['image_dim_out']
+        self.num_img_tokens = config.img_processor['num_img_tokens']
+
+        self.image_dim_out = image_dim_out
+        self.img_sizes = None
+
+        # global_gn and sub_gn for hd transform, serves as line separator
+        self.use_hd_transform = config.embd_layer.get('use_hd_transform',
+                                                      False)
+        self.with_learnable_separator = config.embd_layer.get(
+            'with_learnable_separator', False)
+        self.hd_transform_order = config.embd_layer.get(
+            'hd_transform_order', 'glb_sub')
+        # with_hd_transform and with_learnable_separator should have same value
+        assert self.use_hd_transform and self.with_learnable_separator
+
+        # 1024 * 4, merge spatial to channel dimension
+        self.glb_GN = nn.Parameter(torch.empty([1, 1, self.image_dim_out * 4]))
+        self.sub_GN = nn.Parameter(
+            torch.empty([1, 1, 1, self.image_dim_out * 4]))
+
+        dim_projection = hidden_size
+        depth = 2
+        layers = [nn.Linear(image_dim_out * 4, dim_projection)]
+        for _ in range(1, depth):
+            layers.extend(
+                [nn.GELU(),
+                 nn.Linear(dim_projection, dim_projection)])
+        self.img_projection = nn.Sequential(*layers)
+
+        self.vocab_size = config.vocab_size
+        self.img_features = None
+
+        self.layer_idx = config.img_processor.get('layer_idx', -2)
+        self.type_feature = config.img_processor.get('type_feature', 'patch')
+
+    def forward(self,
+                input_ids: torch.LongTensor,
+                pixel_values: torch.FloatTensor,
+                image_sizes=None) -> torch.FloatTensor:
+        """process and merge text embeddings with image embeddings."""
+
+        img_embeds = pixel_values
+        img_sizes = image_sizes
+
+        if self.img_features is not None:
+            img_embeds = self.img_features.clone()
+            self.img_features = None
+
+        if self.img_sizes is not None:
+            img_sizes = self.img_sizes
+
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_shape[-1])
+
+        positions = torch.nonzero(input_ids == self.image_token_id)
+
+        select = False
+
+        target_device = self.img_projection[0].bias.device
+        target_dtype = self.img_projection[0].bias.dtype
+
+        if len(positions.tolist()) > 0:
+            # if self.use_hd_transform and img_sizes:
+            # img_embeds: (num_images, max_num_crops, 3, H, W)
+            # img_sizes: (num_images, 2).view(1, -1)
+
+            bs = img_embeds.shape[0]
+            # Nx(HW)xC
+            img_features = self.get_img_features(img_embeds.flatten(0, 1))
+            base_feat_height = base_feat_width = int(
+                img_features.shape[1]**0.5)
+
+            # bs x max_num_crops x (24x24) x C
+            img_features = img_features.view(
+                bs, -1, base_feat_height * base_feat_width, self.image_dim_out)
+            C = self.image_dim_out
+            H = base_feat_height
+
+            output_imgs = []
+            output_len = []
+
+            if isinstance(img_sizes, torch.Tensor):
+                img_sizes.squeeze_(0)
+
+            for _bs in range(bs):
+                h, w = img_sizes
+                h = h // 336
+                w = w // 336
+                B_ = h * w
+
+                # 1 x (24x24) x 1024
+                global_img_feature = img_features[_bs, :1]
+
+                # 1 x 12 x 12 x 4096
+                glb_img = global_img_feature \
+                    .reshape(1, H // 2, 2, H // 2, 2,C) \
+                    .permute(0, 1, 3, 2, 4, 5) \
+                    .reshape(1, H // 2, H // 2, 4 * C)
+                temp_glb_GN = self.sub_GN.repeat(1, H // 2, 1, 1)
+
+                # 1 x 156 x 4096
+                glb_img = torch.cat([glb_img, temp_glb_GN],
+                                    dim=2).reshape(1, -1, 4 * C)
+
+                # (max_num_crops-1) x (12x12) x C
+                sub_img = img_features[_bs, 1:]
+                # 16x574x1024
+                # get rid of padding sub_img
+                sub_img = sub_img[:B_]
+
+                sub_img = sub_img.reshape(B_, H // 2, 2, H // 2, 2, C) \
+                    .permute(0, 1, 3, 2, 4, 5).reshape(B_, -1, 4 * C)
+                sub_img = sub_img.reshape(1, h, w, 12, 12, -1) \
+                    .permute(0, 1, 3, 2, 4, 5) \
+                    .reshape(1, h * 12, w * 12, 4 * C)
+                temp_sub_GN = self.sub_GN.repeat(1, h * 12, 1, 1)
+                sub_img = torch.cat([sub_img, temp_sub_GN],
+                                    dim=2).reshape(1, -1, 4 * C)
+                # (1, num_img_tokens, 1024*4)
+
+                # glb + sub
+                if self.hd_transform_order == 'glb_sub':
+                    output_imgs.append(
+                        torch.cat([glb_img, self.glb_GN, sub_img], dim=1))
+                elif self.hd_transform_order == 'sub_glb':
+                    output_imgs.append(
+                        torch.cat([sub_img, self.glb_GN, glb_img], dim=1))
+
+                temp_len = int((h * w + 1) * 144 + 1 + (h + 1) * 12)
+                output_len.append(temp_len)
+
+            num_img_tokens = output_len
+            img_set_tensor = []
+            for _output_img in output_imgs:
+                img_feature_proj = self.img_projection(
+                    _output_img.to(target_device, target_dtype))
+                img_set_tensor.append(img_feature_proj)
+            select = True
+
+        input_ids.clamp_min_(0).clamp_max_(self.vocab_size)
+
+        hidden_states = self.wte(input_ids)
+
+        if select:
+            idx = 0
+            for i, cnt in enumerate(num_img_tokens):
+                hidden_states[positions[idx, 0],
+                              positions[idx, 1]:positions[idx, 1] +
+                              cnt] = (img_set_tensor[i].to(
+                                  hidden_states.device, hidden_states.dtype))
+                idx += cnt
+
+        return hidden_states.squeeze(0)
+
+
+class Phi3VImagePixelInputs(TypedDict):
+    type: Literal["pixel_values"]
+    data: torch.Tensor
+    """Shape: (batch_size, 1 + num_patches, num_channels, height, width)"""
+
+    image_sizes: torch.Tensor
+    """Shape: (batch_size, 2)"""
+
+
+@MULTIMODAL_REGISTRY.register_image_pixel_input()
+@MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
+class Phi3VForCausalLM(VisionLanguageModelBase):
+
+    def __init__(self,
+                 config: PretrainedConfig,
+                 vision_language_config: VisionLanguageConfig,
+                 cache_config: Optional[CacheConfig] = None,
+                 quant_config: Optional[QuantizationConfig] = None) -> None:
+        super().__init__(vision_language_config)
+        self.config = config
+        self.model = LlamaModel(config, cache_config, quant_config)
+        self.vision_embed_tokens = Phi3HDImageEmbedding(
+            vision_language_config, config, self.model.embed_tokens)
+        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.sampler = Sampler()
+
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[Phi3VImagePixelInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_sizes = kwargs.pop("image_sizes", None)
+
+        expected_input_type = self.vision_language_config.image_input_type
+        ImageInputType = VisionLanguageConfig.ImageInputType
+
+        if expected_input_type != ImageInputType.PIXEL_VALUES:
+            raise ValueError(
+                f"Unexpected image input type: {expected_input_type}."
+                "Phi3v only support pixel_values input currently.")
+
+        if pixel_values is not None and image_sizes is not None:
+            return Phi3VImagePixelInputs(type="pixel_values",
+                                         data=pixel_values,
+                                         image_sizes=image_sizes)
+
+        return None
+
+    def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
+                kv_caches: List[torch.Tensor],
+                attn_metadata: AttentionMetadata, **kwargs: object):
+        image_input = self._parse_and_validate_image_input(**kwargs)
+
+        if image_input is not None:
+            inputs_embeds = self.vision_embed_tokens(
+                input_ids, image_input["data"], image_input["image_sizes"])
+
+            input_ids = None
+        else:
+            inputs_embeds = None
+
+        hidden_states = self.model(input_ids,
+                                   positions,
+                                   kv_caches,
+                                   attn_metadata,
+                                   inputs_embeds=inputs_embeds)
+
+        return hidden_states
+
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
+                if key_to_modify in name:
+                    name = name.replace(key_to_modify, new_key)
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                # We only do sharding for language model
+                # and not vision model for now.
+                if "vision_embed_tokens" in name and self.vision_embed_tokens:
+                    continue
+                if weight_name not in name:
+                    continue
+                param = params_dict[name.replace(weight_name, param_name)]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index c6311d60e..509f791d2 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -79,6 +79,8 @@ def get_full_image_text_prompt(image_prompt: str, text_prompt: str,
 
     if config.hf_config.model_type in ("llava", "llava_next"):
         full_prompt = f"{image_prompt}\n{text_prompt}"
+    elif config.hf_config.model_type == 'phi3_v':
+        full_prompt = f"{image_prompt}<s>\n{text_prompt}"
     else:
         raise ValueError(
             f"Unsupported model type: {config.hf_config.model_type}")
-- 
GitLab


From 5002175e801703c5b8a1411b490f6ff6c1747c8e Mon Sep 17 00:00:00 2001
From: Joe Runde <joe@joerun.de>
Date: Mon, 17 Jun 2024 21:54:11 -0600
Subject: [PATCH 077/376] [Kernel] Add punica dimensions for Granite 13b
 (#5559)

Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
---
 csrc/punica/bgmv/bgmv_config.h | 8 ++++++++
 tests/lora/test_punica.py      | 4 ++++
 2 files changed, 12 insertions(+)

diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h
index 4b376261d..0456b4bc2 100644
--- a/csrc/punica/bgmv/bgmv_config.h
+++ b/csrc/punica/bgmv/bgmv_config.h
@@ -20,6 +20,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 1152) \
     f(in_T, out_T, W_T, narrow, 1280) \
     f(in_T, out_T, W_T, narrow, 1536) \
+    f(in_T, out_T, W_T, narrow, 1664) \
     f(in_T, out_T, W_T, narrow, 1728) \
     f(in_T, out_T, W_T, narrow, 1792) \
     f(in_T, out_T, W_T, narrow, 2048) \
@@ -36,6 +37,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 5120) \
     f(in_T, out_T, W_T, narrow, 5504) \
     f(in_T, out_T, W_T, narrow, 5632) \
+    f(in_T, out_T, W_T, narrow, 5888) \
     f(in_T, out_T, W_T, narrow, 6144) \
     f(in_T, out_T, W_T, narrow, 6400) \
     f(in_T, out_T, W_T, narrow, 6848) \
@@ -45,6 +47,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 9216) \
     f(in_T, out_T, W_T, narrow, 10240) \
     f(in_T, out_T, W_T, narrow, 11008) \
+    f(in_T, out_T, W_T, narrow, 11264) \
     f(in_T, out_T, W_T, narrow, 12288) \
     f(in_T, out_T, W_T, narrow, 13696) \
     f(in_T, out_T, W_T, narrow, 13824) \
@@ -53,6 +56,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 16384) \
     f(in_T, out_T, W_T, narrow, 20480) \
     f(in_T, out_T, W_T, narrow, 22016) \
+    f(in_T, out_T, W_T, narrow, 22528) \
     f(in_T, out_T, W_T, narrow, 24576) \
     f(in_T, out_T, W_T, narrow, 27392) \
     f(in_T, out_T, W_T, narrow, 27648) \
@@ -91,6 +95,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, 1152, narrow) \
     f(in_T, out_T, W_T, 1280, narrow) \
     f(in_T, out_T, W_T, 1536, narrow) \
+    f(in_T, out_T, W_T, 1664, narrow) \
     f(in_T, out_T, W_T, 1728, narrow) \
     f(in_T, out_T, W_T, 1792, narrow) \
     f(in_T, out_T, W_T, 2048, narrow) \
@@ -107,6 +112,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, 5120, narrow) \
     f(in_T, out_T, W_T, 5504, narrow) \
     f(in_T, out_T, W_T, 5632, narrow) \
+    f(in_T, out_T, W_T, 5888, narrow) \
     f(in_T, out_T, W_T, 6144, narrow) \
     f(in_T, out_T, W_T, 6400, narrow) \
     f(in_T, out_T, W_T, 6848, narrow) \
@@ -116,6 +122,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, 9216, narrow) \
     f(in_T, out_T, W_T, 10240, narrow) \
     f(in_T, out_T, W_T, 11008, narrow) \
+    f(in_T, out_T, W_T, 11264, narrow) \
     f(in_T, out_T, W_T, 12288, narrow) \
     f(in_T, out_T, W_T, 13696, narrow) \
     f(in_T, out_T, W_T, 13824, narrow) \
@@ -124,6 +131,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, 16384, narrow) \
     f(in_T, out_T, W_T, 20480, narrow) \
     f(in_T, out_T, W_T, 22016, narrow) \
+    f(in_T, out_T, W_T, 22528, narrow) \
     f(in_T, out_T, W_T, 24576, narrow) \
     f(in_T, out_T, W_T, 27392, narrow) \
     f(in_T, out_T, W_T, 27648, narrow) \
diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py
index f021c003b..d87658e5d 100644
--- a/tests/lora/test_punica.py
+++ b/tests/lora/test_punica.py
@@ -53,6 +53,7 @@ H1 = H2 = [
     1152,
     1280,
     1536,
+    1664,
     2048,
     2304,
     2560,
@@ -66,6 +67,7 @@ H1 = H2 = [
     5120,
     5504,
     5632,
+    5888,
     6144,
     6400,
     6848,
@@ -75,10 +77,12 @@ H1 = H2 = [
     9216,
     10240,
     11008,
+    11264,
     13824,
     14336,
     15360,
     22016,
+    22528,
     24576,
     27392,
     27648,
-- 
GitLab


From 8eadcf0b90f126cf9b23f9583a53b19b6b58fd87 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 17 Jun 2024 20:54:57 -0700
Subject: [PATCH 078/376] [misc][typo] fix typo (#5620)

---
 vllm/block.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/block.py b/vllm/block.py
index e7fb29c8c..bd00c07ad 100644
--- a/vllm/block.py
+++ b/vllm/block.py
@@ -13,7 +13,7 @@ TokensBlock = List[int]
 
 
 class BlockPool:
-    """A pool of physical blocks.
+    """A pool of logical blocks.
     When requests come, we create a lot of logical blocks;
     when requests are done, we destroy a lot of logical blocks.
     It turns out that creating and destroying logical blocks can be expensive,
-- 
GitLab


From 32c86e494a49dff8d1d4b10c5922a36daa6e8faf Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 18 Jun 2024 11:58:30 +0800
Subject: [PATCH 079/376] [Misc] Fix typo (#5618)

---
 .../scripts/convert-results-json-to-markdown.py                 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 9aa8162d1..e1002213f 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -21,7 +21,7 @@ latency_column_mapping = {
     "P99": "P99",
 }
 
-# thoughput tests and the keys that will be printed into markdown
+# throughput tests and the keys that will be printed into markdown
 throughput_results = []
 throughput_results_column_mapping = {
     "test_name": "Test name",
-- 
GitLab


From 114d7270ffc2e5a66e0974b0d6d913c7f990afa7 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 17 Jun 2024 21:37:18 -0700
Subject: [PATCH 080/376] [CI] Avoid naming different metrics with the same
 name in performance benchmark (#5615)

---
 .../convert-results-json-to-markdown.py       | 21 ++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index e1002213f..534ecf179 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -15,10 +15,10 @@ latency_column_mapping = {
     "avg_latency": "Mean latency (ms)",
     # "P10": "P10 (s)",
     # "P25": "P25 (s)",
-    "P50": "Median",
+    "P50": "Median latency (ms)",
     # "P75": "P75 (s)",
     # "P90": "P90 (s)",
-    "P99": "P99",
+    "P99": "P99 latency (ms)",
 }
 
 # throughput tests and the keys that will be printed into markdown
@@ -43,15 +43,14 @@ serving_column_mapping = {
     # "input_throughput": "Input Tput (tok/s)",
     # "output_throughput": "Output Tput (tok/s)",
     "mean_ttft_ms": "Mean TTFT (ms)",
-    # do not say TTFT again to avoid the table getting too wide
-    "median_ttft_ms": "Median",
-    "p99_ttft_ms": "P99",
+    "median_ttft_ms": "Median TTFT (ms)",
+    "p99_ttft_ms": "P99 TTFT (ms)",
     # "mean_tpot_ms": "Mean TPOT (ms)",
     # "median_tpot_ms": "Median",
     # "p99_tpot_ms": "P99",
     "mean_itl_ms": "Mean ITL (ms)",
-    "median_itl_ms": "Median",
-    "p99_itl_ms": "P99",
+    "median_itl_ms": "Median ITL (ms)",
+    "p99_itl_ms": "P99 ITL (ms)",
 }
 
 
@@ -183,3 +182,11 @@ if __name__ == "__main__":
             serving_tests_markdown_table=serving_md_table,
             benchmarking_results_in_json_string=processed_results_json)
         f.write(results)
+
+    # document benchmarking results in json
+    with open(results_folder / "benchmark_results.json", "w") as f:
+
+        results = latency_results.to_dict(
+            orient='records') + throughput_results.to_dict(
+                orient='records') + serving_results.to_dict(orient='records')
+        f.write(json.dumps(results))
-- 
GitLab


From db5ec52ad7dc69dbe8dd9ba25fe8f2c6ce35a4cf Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 18 Jun 2024 00:21:05 -0700
Subject: [PATCH 081/376] [bugfix][distributed] improve p2p capability test
 (#5612)

[bugfix][distributed] do not error if two processes do not agree on p2p capability (#5612)
---
 .../device_communicators/custom_all_reduce_utils.py    | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index 75b7c374c..e0641a54c 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -71,6 +71,7 @@ def consumer(batch_tgt: Sequence[int],
         if open_success:
             # modify the memory
             lib.cudaMemset(pointer, 2, 1024)
+            lib.cudaDeviceSynchronize()
             # use two queues to simulate barrier
             producer_queue.get()
             consumer_queue.put(0)
@@ -142,8 +143,13 @@ def can_actually_p2p(
     for src, tgt in zip(batch_src, batch_tgt):
         a = result_queue.get()
         b = result_queue.get()
-        assert a == b
-        result.append(a)
+        if a != b:
+            logger.warning(
+                "Two processes do not agree on the P2P access"
+                " status on %d -> %d, treat as disabled.", src, tgt)
+            result.append(False)
+        else:
+            result.append(a)
     return result
 
 
-- 
GitLab


From f0cc0e68e3ceef6fe43f78bf36df88e6cad28766 Mon Sep 17 00:00:00 2001
From: Chang Su <chang.s.su@oracle.com>
Date: Tue, 18 Jun 2024 05:12:19 -0700
Subject: [PATCH 082/376] [Misc] Remove import from transformers logging
 (#5625)

---
 vllm/model_executor/models/phi3v.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index e8f190d3f..35f3b894f 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -18,7 +18,6 @@ from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
 import torch
 import torch.nn as nn
 from transformers import CLIPVisionConfig, CLIPVisionModel, PretrainedConfig
-from transformers.utils import logging
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, VisionLanguageConfig
@@ -35,8 +34,6 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import get_dummy_image_data
 from vllm.sequence import SamplerOutput
 
-logger = logging.get_logger(__name__)
-
 _KEYS_TO_MODIFY_MAPPING = {
     "model.vision_embed_tokens": "vision_embed_tokens",
 }
-- 
GitLab


From 4ad7b53e59b6600d050581329dfaba0222b13ae5 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Tue, 18 Jun 2024 06:10:04 -0700
Subject: [PATCH 083/376] [CI/Build][Misc] Update Pytest Marker for VLMs
 (#5623)

---
 .buildkite/run-cpu-test.sh      | 2 +-
 .buildkite/test-pipeline.yaml   | 6 +++---
 pyproject.toml                  | 2 +-
 tests/models/test_llava.py      | 2 +-
 tests/models/test_llava_next.py | 2 +-
 tests/models/test_phi3v.py      | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 532d6ad88..f4fa24be1 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -23,4 +23,4 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 docker exec cpu-test bash -c "cd tests;
   pip install pytest Pillow protobuf
   cd ../
-  pytest -v -s tests/models -m \"not llava\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
+  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 6439a315e..c1e433ec4 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -100,13 +100,13 @@ steps:
 - label: Models Test
   #mirror_hardwares: [amd]
   commands:
-    - pytest -v -s models -m \"not llava\"
+    - pytest -v -s models -m \"not vlm\"
 
-- label: Llava Test
+- label: Vision Language Models Test
   mirror_hardwares: [amd]
   commands:
     - bash ../.buildkite/download-images.sh
-    - pytest -v -s models -m llava
+    - pytest -v -s models -m vlm
 
 - label: Prefix Caching Test
   mirror_hardwares: [amd]
diff --git a/pyproject.toml b/pyproject.toml
index eb691c297..4958aae02 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -71,5 +71,5 @@ markers = [
     "skip_global_cleanup",
     "llm: run tests for vLLM API only",
     "openai: run tests for OpenAI API only",
-    "llava: run tests for LLaVA models only",
+    "vlm: run tests for vision language models only",
 ]
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index a1f0cff1c..b41c69f72 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -7,7 +7,7 @@ from vllm.config import VisionLanguageConfig
 
 from ..conftest import IMAGE_FILES
 
-pytestmark = pytest.mark.llava
+pytestmark = pytest.mark.vlm
 
 # The image token is placed before "user" on purpose so that the test can pass
 HF_IMAGE_PROMPTS = [
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index aa6ee268a..0eca5cb53 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -7,7 +7,7 @@ from vllm.config import VisionLanguageConfig
 
 from ..conftest import IMAGE_FILES
 
-pytestmark = pytest.mark.llava
+pytestmark = pytest.mark.vlm
 
 _PREFACE = (
     "A chat between a curious human and an artificial intelligence assistant. "
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 607ad95e8..1732e8f08 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -8,7 +8,7 @@ from vllm.utils import is_cpu
 
 from ..conftest import IMAGE_FILES
 
-pytestmark = pytest.mark.llava
+pytestmark = pytest.mark.vlm
 
 # The image token is placed before "user" on purpose so that the test can pass
 HF_IMAGE_PROMPTS = [
-- 
GitLab


From 13db4369d9ab3158a01192d60c744c6523961824 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Tue, 18 Jun 2024 07:26:20 -0700
Subject: [PATCH 084/376] [ci] Deprecate original CI template (#5624)

Signed-off-by: kevin <kevin@anyscale.com>
---
 .buildkite/test-pipeline.yaml |   2 +-
 .buildkite/test-template.j2   | 101 ----------------------------------
 2 files changed, 1 insertion(+), 102 deletions(-)
 delete mode 100644 .buildkite/test-template.j2

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index c1e433ec4..a81885b8a 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -1,6 +1,6 @@
 # In this file, you can add more tests to run either by adding a new step or
 # adding a new command to an existing step. See different options here for examples.
-# This script will be feed into Jinja template in `test-template.j2` to generate
+# This script will be feed into Jinja template in `test-template-aws.j2` to generate
 # the final pipeline yaml file.
 
 steps:
diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
deleted file mode 100644
index 3bd1e90c2..000000000
--- a/.buildkite/test-template.j2
+++ /dev/null
@@ -1,101 +0,0 @@
-{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
-{% set default_num_gpu = 1 %}
-{% set default_working_dir = "/vllm-workspace/tests" %}
-
-steps:
-  - label: ":docker: build image"
-    commands:
-      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
-      - "docker push {{ docker_image }}"
-    env:
-      DOCKER_BUILDKIT: "1"
-    retry:
-      automatic:
-        - exit_status: -1  # Agent was lost
-          limit: 5
-        - exit_status: -10  # Agent was lost
-          limit: 5
-  - wait
-
-  - group: "AMD Tests"
-    depends_on: ~
-    steps:
-    {% for step in steps %}
-    {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
-      - label: "AMD: {{ step.label }}"
-        agents:
-          queue: amd
-        command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe  }} ; {{ step.command  or (step.commands | join(" ; ")) | safe }}"
-        env:
-          DOCKER_BUILDKIT: "1"
-        soft_fail: true
-    {% endif %}
-    {% endfor %}
-
-  - label: "Neuron Test"
-    depends_on: ~
-    agents:
-      queue: neuron
-    command: bash .buildkite/run-neuron-test.sh
-    soft_fail: false
-
-  - label: "Intel Test"
-    depends_on: ~
-    agents:
-      queue: intel
-    command: bash .buildkite/run-cpu-test.sh
-
-  - label: "XPU Test"
-    agents:
-      queue: intel
-    command: bash .buildkite/run-xpu-test.sh
-
-  {% for step in steps %}
-  - label: "{{ step.label }}"
-    agents:
-      queue: kubernetes
-    soft_fail: {{ step.soft_fail or false }}
-    {% if step.parallelism %}
-    parallelism: {{ step.parallelism }}
-    {% endif %}
-    retry:
-      automatic:
-        - exit_status: -1  # Agent was lost
-          limit: 5
-        - exit_status: -10  # Agent was lost
-          limit: 5
-    plugins:
-      - kubernetes:
-          podSpec:
-            {% if step.num_gpus %}
-            priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
-            {% endif %}
-            volumes:
-              - name: dshm
-                emptyDir:
-                  medium: Memory
-            containers:
-              - image: "{{ docker_image }}"
-                command: ["bash"]
-                args:
-                - '-c'
-                - "'cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}'"
-                {% if not step.no_gpu %}
-                resources:
-                  requests:
-                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
-                  limits:
-                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
-                {% endif %}
-                env:
-                  - name: VLLM_USAGE_SOURCE
-                    value: ci-test
-                  - name: HF_TOKEN
-                    valueFrom:
-                      secretKeyRef:
-                        name: hf-token-secret
-                        key: token
-                volumeMounts:
-                  - mountPath: /dev/shm
-                    name: dshm
-  {% endfor %}
-- 
GitLab


From 7879f24dcce75665d83865ee8281f2ef1bbb7e74 Mon Sep 17 00:00:00 2001
From: Ronen Schaffer <ronen.schaffer@ibm.com>
Date: Tue, 18 Jun 2024 19:17:03 +0300
Subject: [PATCH 085/376] [Misc] Add OpenTelemetry support (#4687)

This PR adds basic support for OpenTelemetry distributed tracing.
It includes changes to enable tracing functionality and improve monitoring capabilities.

I've also added a markdown with print-screens to guide users how to use this feature. You can find it here
---
 .buildkite/test-pipeline.yaml                 |   9 ++
 benchmarks/benchmark_latency.py               |  48 +++++---
 examples/production_monitoring/Otel.md        |  82 +++++++++++++
 .../production_monitoring/dummy_client.py     |  35 ++++++
 tests/tracing/__init__.py                     |   0
 tests/tracing/test_tracing.py                 | 116 ++++++++++++++++++
 vllm/config.py                                |  13 ++
 vllm/engine/arg_utils.py                      |  40 ++++--
 vllm/engine/async_llm_engine.py               |  22 ++++
 vllm/engine/llm_engine.py                     | 102 +++++++++++++--
 vllm/entrypoints/openai/serving_chat.py       |  11 ++
 vllm/entrypoints/openai/serving_completion.py |  11 ++
 vllm/sequence.py                              |   3 +
 vllm/tracing.py                               | 104 ++++++++++++++++
 vllm/utils.py                                 |  12 ++
 15 files changed, 567 insertions(+), 41 deletions(-)
 create mode 100644 examples/production_monitoring/Otel.md
 create mode 100644 examples/production_monitoring/dummy_client.py
 create mode 100644 tests/tracing/__init__.py
 create mode 100644 tests/tracing/test_tracing.py
 create mode 100644 vllm/tracing.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index a81885b8a..5afe37302 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -159,6 +159,15 @@ steps:
   #mirror_hardwares: [amd]
   command: pytest -v -s quantization
 
+- label: Tracing Test
+  commands: 
+    - "pip install \
+        opentelemetry-sdk \
+        opentelemetry-api \
+        opentelemetry-exporter-otlp \
+        opentelemetry-semantic-conventions-ai"
+    - pytest -v -s tracing
+
 - label: Benchmarks
   working_dir: "/vllm-workspace/.buildkite"
   mirror_hardwares: [amd]
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 767afd21a..98e0be277 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -20,26 +20,29 @@ def main(args: argparse.Namespace):
 
     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
-    llm = LLM(model=args.model,
-              speculative_model=args.speculative_model,
-              num_speculative_tokens=args.num_speculative_tokens,
-              tokenizer=args.tokenizer,
-              quantization=args.quantization,
-              tensor_parallel_size=args.tensor_parallel_size,
-              trust_remote_code=args.trust_remote_code,
-              dtype=args.dtype,
-              enforce_eager=args.enforce_eager,
-              kv_cache_dtype=args.kv_cache_dtype,
-              quantization_param_path=args.quantization_param_path,
-              device=args.device,
-              ray_workers_use_nsight=args.ray_workers_use_nsight,
-              use_v2_block_manager=args.use_v2_block_manager,
-              enable_chunked_prefill=args.enable_chunked_prefill,
-              download_dir=args.download_dir,
-              block_size=args.block_size,
-              gpu_memory_utilization=args.gpu_memory_utilization,
-              load_format=args.load_format,
-              distributed_executor_backend=args.distributed_executor_backend)
+    llm = LLM(
+        model=args.model,
+        speculative_model=args.speculative_model,
+        num_speculative_tokens=args.num_speculative_tokens,
+        tokenizer=args.tokenizer,
+        quantization=args.quantization,
+        tensor_parallel_size=args.tensor_parallel_size,
+        trust_remote_code=args.trust_remote_code,
+        dtype=args.dtype,
+        enforce_eager=args.enforce_eager,
+        kv_cache_dtype=args.kv_cache_dtype,
+        quantization_param_path=args.quantization_param_path,
+        device=args.device,
+        ray_workers_use_nsight=args.ray_workers_use_nsight,
+        use_v2_block_manager=args.use_v2_block_manager,
+        enable_chunked_prefill=args.enable_chunked_prefill,
+        download_dir=args.download_dir,
+        block_size=args.block_size,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        load_format=args.load_format,
+        distributed_executor_backend=args.distributed_executor_backend,
+        otlp_traces_endpoint=args.otlp_traces_endpoint,
+    )
 
     sampling_params = SamplingParams(
         n=args.n,
@@ -254,5 +257,10 @@ if __name__ == '__main__':
         help='Backend to use for distributed serving. When more than 1 GPU '
         'is used, will be automatically set to "ray" if installed '
         'or "mp" (multiprocessing) otherwise.')
+    parser.add_argument(
+        '--otlp-traces-endpoint',
+        type=str,
+        default=None,
+        help='Target URL to which OpenTelemetry traces will be sent.')
     args = parser.parse_args()
     main(args)
diff --git a/examples/production_monitoring/Otel.md b/examples/production_monitoring/Otel.md
new file mode 100644
index 000000000..144944227
--- /dev/null
+++ b/examples/production_monitoring/Otel.md
@@ -0,0 +1,82 @@
+# Setup OpenTelemetry POC
+
+1. Install OpenTelemetry packages:
+    ```
+    pip install \
+        opentelemetry-sdk \
+        opentelemetry-api \
+        opentelemetry-exporter-otlp \
+        opentelemetry-semantic-conventions-ai
+    ```
+
+1. Start Jaeger in a docker container:
+    ```
+    # From: https://www.jaegertracing.io/docs/1.57/getting-started/
+    docker run --rm --name jaeger \
+        -e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \
+        -p 6831:6831/udp \
+        -p 6832:6832/udp \
+        -p 5778:5778 \
+        -p 16686:16686 \
+        -p 4317:4317 \
+        -p 4318:4318 \
+        -p 14250:14250 \
+        -p 14268:14268 \
+        -p 14269:14269 \
+        -p 9411:9411 \
+        jaegertracing/all-in-one:1.57
+    ```
+
+1. In a new shell, export Jaeger IP:
+    ```
+    export JAEGER_IP=$(docker inspect   --format '{{ .NetworkSettings.IPAddress }}' jaeger)
+    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
+    ```
+    Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger and run vLLM:
+    ```
+    export OTEL_SERVICE_NAME="vllm-server"
+    export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
+    python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
+    ```
+
+1. In a new shell, send requests with trace context from a dummy client
+    ```
+    export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
+    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
+    export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
+    export OTEL_SERVICE_NAME="client-service"
+    python dummy_client.py
+    ```
+
+1. Open Jaeger webui: http://localhost:16686/
+
+    In the search pane, select `vllm-server` service and hit `Find Traces`. You should get a list of traces, one for each request.
+    ![Traces](https://i.imgur.com/GYHhFjo.png)
+
+1. Clicking on a trace will show its spans and their tags. In this demo, each trace has 2 spans. One from the dummy client containing the prompt text and one from vLLM containing metadata about the request.
+![Spans details](https://i.imgur.com/OPf6CBL.png)
+
+## Exporter Protocol
+OpenTelemetry supports either `grpc` or `http/protobuf` as the transport protocol for trace data in the exporter.
+By default, `grpc` is used. To set `http/protobuf` as the protocol, configure the `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL` environment variable as follows:
+```
+export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf
+export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
+python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
+```
+
+## Instrumentation of FastAPI
+OpenTelemetry allows automatic instrumentation of FastAPI.
+1. Install the instrumentation library
+    ```
+    pip install opentelemetry-instrumentation-fastapi
+    ```
+
+1. Run vLLM with `opentelemetry-instrument`
+    ```
+    opentelemetry-instrument python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" 
+    ```
+
+1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI.
+
+![FastAPI Spans](https://i.imgur.com/hywvoOJ.png)
\ No newline at end of file
diff --git a/examples/production_monitoring/dummy_client.py b/examples/production_monitoring/dummy_client.py
new file mode 100644
index 000000000..b1a2b3c3c
--- /dev/null
+++ b/examples/production_monitoring/dummy_client.py
@@ -0,0 +1,35 @@
+import requests
+from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
+    OTLPSpanExporter)
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import (BatchSpanProcessor,
+                                            ConsoleSpanExporter)
+from opentelemetry.trace import SpanKind, set_tracer_provider
+from opentelemetry.trace.propagation.tracecontext import (
+    TraceContextTextMapPropagator)
+
+trace_provider = TracerProvider()
+set_tracer_provider(trace_provider)
+
+trace_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
+trace_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
+
+tracer = trace_provider.get_tracer("dummy-client")
+
+url = "http://localhost:8000/v1/completions"
+with tracer.start_as_current_span("client-span", kind=SpanKind.CLIENT) as span:
+    prompt = "San Francisco is a"
+    span.set_attribute("prompt", prompt)
+    headers = {}
+    TraceContextTextMapPropagator().inject(headers)
+    payload = {
+        "model": "facebook/opt-125m",
+        "prompt": prompt,
+        "max_tokens": 10,
+        "best_of": 20,
+        "n": 3,
+        "use_beam_search": "true",
+        "temperature": 0.0,
+        # "stream": True,
+    }
+    response = requests.post(url, headers=headers, json=payload)
diff --git a/tests/tracing/__init__.py b/tests/tracing/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py
new file mode 100644
index 000000000..2f8f62cf2
--- /dev/null
+++ b/tests/tracing/test_tracing.py
@@ -0,0 +1,116 @@
+import os
+import threading
+from concurrent import futures
+from typing import Callable, Dict, Iterable, Literal
+
+import grpc
+import pytest
+from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import (
+    ExportTraceServiceResponse)
+from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import (
+    TraceServiceServicer, add_TraceServiceServicer_to_server)
+from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue
+from opentelemetry.sdk.environment_variables import (
+    OTEL_EXPORTER_OTLP_TRACES_INSECURE)
+
+from vllm import LLM, SamplingParams
+from vllm.tracing import SpanAttributes
+
+FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
+
+FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value',
+                    'array_value']
+
+
+def decode_value(value: AnyValue):
+    field_decoders: Dict[FieldName, Callable] = {
+        "bool_value": (lambda v: v.bool_value),
+        "string_value": (lambda v: v.string_value),
+        "int_value": (lambda v: v.int_value),
+        "double_value": (lambda v: v.double_value),
+        "array_value":
+        (lambda v: [decode_value(item) for item in v.array_value.values]),
+    }
+    for field, decoder in field_decoders.items():
+        if value.HasField(field):
+            return decoder(value)
+    raise ValueError(f"Couldn't decode value: {value}")
+
+
+def decode_attributes(attributes: Iterable[KeyValue]):
+    return {kv.key: decode_value(kv.value) for kv in attributes}
+
+
+class FakeTraceService(TraceServiceServicer):
+
+    def __init__(self):
+        self.request = None
+        self.evt = threading.Event()
+
+    def Export(self, request, context):
+        self.request = request
+        self.evt.set()
+        return ExportTraceServiceResponse()
+
+
+@pytest.fixture
+def trace_service():
+    """Fixture to set up a fake gRPC trace service"""
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
+    service = FakeTraceService()
+    add_TraceServiceServicer_to_server(service, server)
+    server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS)
+    server.start()
+
+    yield service
+
+    server.stop(None)
+
+
+def test_traces(trace_service):
+    os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true"
+
+    sampling_params = SamplingParams(temperature=0.01,
+                                     top_p=0.1,
+                                     max_tokens=256)
+    model = "facebook/opt-125m"
+    llm = LLM(
+        model=model,
+        otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
+    )
+    prompts = ["This is a short prompt"]
+    outputs = llm.generate(prompts, sampling_params=sampling_params)
+
+    timeout = 5
+    if not trace_service.evt.wait(timeout):
+        raise TimeoutError(
+            f"The fake trace service didn't receive a trace within "
+            f"the {timeout} seconds timeout")
+
+    attributes = decode_attributes(trace_service.request.resource_spans[0].
+                                   scope_spans[0].spans[0].attributes)
+    assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
+    assert attributes.get(
+        SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id
+    assert attributes.get(
+        SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature
+    assert attributes.get(
+        SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
+    assert attributes.get(
+        SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
+    assert attributes.get(
+        SpanAttributes.LLM_REQUEST_BEST_OF) == sampling_params.best_of
+    assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
+    assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
+        outputs[0].prompt_token_ids)
+    completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
+    assert attributes.get(
+        SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens
+    metrics = outputs[0].metrics
+    assert attributes.get(
+        SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
+    ttft = metrics.first_token_time - metrics.arrival_time
+    assert attributes.get(
+        SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
+    e2e_time = metrics.finished_time - metrics.arrival_time
+    assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time
diff --git a/vllm/config.py b/vllm/config.py
index d95faf52d..5de00d7d3 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -10,6 +10,7 @@ from transformers import PretrainedConfig, PreTrainedTokenizerBase
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.model_executor.models import ModelRegistry
+from vllm.tracing import is_otel_installed
 from vllm.transformers_utils.config import get_config, get_hf_text_config
 from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu,
                         is_hip, is_neuron, is_tpu, is_xpu)
@@ -1371,6 +1372,17 @@ class DecodingConfig:
                              f"must be one of {valid_guided_backends}")
 
 
+@dataclass
+class ObservabilityConfig:
+    """Configuration for observability."""
+    otlp_traces_endpoint: Optional[str] = None
+
+    def __post_init__(self):
+        if not is_otel_installed() and self.otlp_traces_endpoint is not None:
+            raise ValueError("OpenTelemetry packages must be installed before "
+                             "configuring 'otlp_traces_endpoint'")
+
+
 @dataclass(frozen=True)
 class EngineConfig:
     """Dataclass which contains all engine-related configuration. This
@@ -1387,6 +1399,7 @@ class EngineConfig:
     vision_language_config: Optional[VisionLanguageConfig]
     speculative_config: Optional[SpeculativeConfig]
     decoding_config: Optional[DecodingConfig]
+    observability_config: Optional[ObservabilityConfig]
 
     def __post_init__(self):
         """Verify configs are valid & consistent with each other.
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 9d04f1dc5..647793a6d 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -7,8 +7,9 @@ from typing import List, Optional, Tuple, Union
 
 from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
                          EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, SpeculativeConfig,
-                         TokenizerPoolConfig, VisionLanguageConfig)
+                         ObservabilityConfig, ParallelConfig, SchedulerConfig,
+                         SpeculativeConfig, TokenizerPoolConfig,
+                         VisionLanguageConfig)
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import str_to_int_tuple
 
@@ -101,6 +102,8 @@ class EngineArgs:
 
     qlora_adapter_name_or_path: Optional[str] = None
 
+    otlp_traces_endpoint: Optional[str] = None
+
     def __post_init__(self):
         if self.tokenizer is None:
             self.tokenizer = self.model
@@ -599,6 +602,13 @@ class EngineArgs:
                             type=str,
                             default=None,
                             help='Name or path of the QLoRA adapter.')
+
+        parser.add_argument(
+            '--otlp-traces-endpoint',
+            type=str,
+            default=None,
+            help='Target URL to which OpenTelemetry traces will be sent.')
+
         return parser
 
     @classmethod
@@ -757,6 +767,9 @@ class EngineArgs:
         decoding_config = DecodingConfig(
             guided_decoding_backend=self.guided_decoding_backend)
 
+        observability_config = ObservabilityConfig(
+            otlp_traces_endpoint=self.otlp_traces_endpoint)
+
         if (model_config.get_sliding_window() is not None
                 and scheduler_config.chunked_prefill_enabled
                 and not scheduler_config.use_v2_block_manager):
@@ -764,16 +777,19 @@ class EngineArgs:
                 "Chunked prefill is not supported with sliding window. "
                 "Set --disable-sliding-window to disable sliding window.")
 
-        return EngineConfig(model_config=model_config,
-                            cache_config=cache_config,
-                            parallel_config=parallel_config,
-                            scheduler_config=scheduler_config,
-                            device_config=device_config,
-                            lora_config=lora_config,
-                            vision_language_config=vision_language_config,
-                            speculative_config=speculative_config,
-                            load_config=load_config,
-                            decoding_config=decoding_config)
+        return EngineConfig(
+            model_config=model_config,
+            cache_config=cache_config,
+            parallel_config=parallel_config,
+            scheduler_config=scheduler_config,
+            device_config=device_config,
+            lora_config=lora_config,
+            vision_language_config=vision_language_config,
+            speculative_config=speculative_config,
+            load_config=load_config,
+            decoding_config=decoding_config,
+            observability_config=observability_config,
+        )
 
 
 @dataclass
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index ab312850b..86720e4fb 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -244,6 +244,9 @@ class _AsyncLLMEngine(LLMEngine):
         # Log stats.
         self.do_log_stats(scheduler_outputs, output)
 
+        # Tracing
+        self.do_tracing(scheduler_outputs)
+
         if not request_outputs:
             # Stop the execute model loop in parallel workers until there are
             # more requests to process. This avoids waiting indefinitely in
@@ -285,6 +288,7 @@ class _AsyncLLMEngine(LLMEngine):
         params: Union[SamplingParams, PoolingParams],
         arrival_time: Optional[float] = None,
         lora_request: Optional[LoRARequest] = None,
+        trace_headers: Optional[Dict[str, str]] = None,
     ) -> None:
         if lora_request is not None and not self.lora_config:
             raise ValueError(f"Got lora_request {lora_request} but LoRA is "
@@ -301,6 +305,7 @@ class _AsyncLLMEngine(LLMEngine):
             params=params,
             arrival_time=arrival_time,
             lora_request=lora_request,
+            trace_headers=trace_headers,
         )
 
     async def check_health_async(self) -> None:
@@ -556,6 +561,7 @@ class AsyncLLMEngine:
         params: Union[SamplingParams, PoolingParams],
         arrival_time: Optional[float] = None,
         lora_request: Optional[LoRARequest] = None,
+        trace_headers: Optional[Dict[str, str]] = None,
     ) -> AsyncStream:
         if self.log_requests:
             if isinstance(inputs, str):
@@ -597,6 +603,7 @@ class AsyncLLMEngine:
             params=params,
             arrival_time=arrival_time,
             lora_request=lora_request,
+            trace_headers=trace_headers,
         )
 
         return stream
@@ -607,6 +614,7 @@ class AsyncLLMEngine:
         sampling_params: SamplingParams,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
+        trace_headers: Optional[Dict[str, str]] = None,
     ) -> AsyncIterator[RequestOutput]:
         """Generate outputs for a request.
 
@@ -621,6 +629,7 @@ class AsyncLLMEngine:
             sampling_params: The sampling parameters of the request.
             request_id: The unique id of the request.
             lora_request: LoRA request to use for generation, if any.
+            trace_headers: OpenTelemetry trace headers.
 
         Yields:
             The output `RequestOutput` objects from the LLMEngine
@@ -674,6 +683,7 @@ class AsyncLLMEngine:
                 inputs,
                 sampling_params,
                 lora_request=lora_request,
+                trace_headers=trace_headers,
         ):
             yield LLMEngine.validate_output(output, RequestOutput)
 
@@ -683,6 +693,7 @@ class AsyncLLMEngine:
         pooling_params: PoolingParams,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
+        trace_headers: Optional[Dict[str, str]] = None,
     ) -> AsyncIterator[EmbeddingRequestOutput]:
         """Generate outputs for a request from an embedding model.
 
@@ -697,6 +708,7 @@ class AsyncLLMEngine:
             pooling_params: The pooling parameters of the request.
             request_id: The unique id of the request.
             lora_request: LoRA request to use for generation, if any.
+            trace_headers: OpenTelemetry trace headers.
 
         Yields:
             The output `EmbeddingRequestOutput` objects from the LLMEngine
@@ -748,6 +760,7 @@ class AsyncLLMEngine:
                 inputs,
                 pooling_params,
                 lora_request=lora_request,
+                trace_headers=trace_headers,
         ):
             yield LLMEngine.validate_output(output, EmbeddingRequestOutput)
 
@@ -758,6 +771,7 @@ class AsyncLLMEngine:
         params: Union[SamplingParams, PoolingParams],
         *,
         lora_request: Optional[LoRARequest] = None,
+        trace_headers: Optional[Dict[str, str]] = None,
     ) -> AsyncIterator[Union[RequestOutput, EmbeddingRequestOutput]]:
         """Common logic to process requests with SamplingParams or
         PoolingParams."""
@@ -769,6 +783,7 @@ class AsyncLLMEngine:
             params,
             arrival_time=arrival_time,
             lora_request=lora_request,
+            trace_headers=trace_headers,
         )
 
         try:
@@ -848,3 +863,10 @@ class AsyncLLMEngine:
         else:
             await self.engine.check_health_async()
         logger.debug("Health check took %fs", time.perf_counter() - t)
+
+    async def is_tracing_enabled(self) -> bool:
+        if self.engine_use_ray:
+            return await self.engine.is_tracing_enabled.remote(  # type: ignore
+            )
+        else:
+            return self.engine.is_tracing_enabled()
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index eed9a17e4..75d417f52 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1,14 +1,14 @@
 import time
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, ClassVar, Iterable, List, Optional
+from typing import TYPE_CHECKING, ClassVar, Dict, Iterable, List, Optional
 from typing import Sequence as GenericSequence
 from typing import Set, Type, TypeVar, Union
 
 from transformers import GenerationConfig, PreTrainedTokenizer
 
 from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig,
-                         LoRAConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig, SpeculativeConfig,
+                         LoRAConfig, ModelConfig, ObservabilityConfig,
+                         ParallelConfig, SchedulerConfig, SpeculativeConfig,
                          VisionLanguageConfig)
 from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler,
                                  SchedulerOutputs)
@@ -31,6 +31,8 @@ from vllm.sequence import (EmbeddingSequenceGroupOutput, ExecuteModelRequest,
                            PoolerOutput, SamplerOutput, Sequence,
                            SequenceGroup, SequenceGroupMetadata,
                            SequenceStatus)
+from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context,
+                          init_tracer)
 from vllm.transformers_utils.detokenizer import Detokenizer
 from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup,
                                                      get_tokenizer_group)
@@ -154,6 +156,7 @@ class LLMEngine:
         vision_language_config: Optional[VisionLanguageConfig],
         speculative_config: Optional[SpeculativeConfig],
         decoding_config: Optional[DecodingConfig],
+        observability_config: Optional[ObservabilityConfig],
         executor_class: Type[ExecutorBase],
         log_stats: bool,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
@@ -168,7 +171,8 @@ class LLMEngine:
             "disable_custom_all_reduce=%s, quantization=%s, "
             "enforce_eager=%s, kv_cache_dtype=%s, "
             "quantization_param_path=%s, device_config=%s, "
-            "decoding_config=%r, seed=%d, served_model_name=%s)",
+            "decoding_config=%r, observability_config=%r, "
+            "seed=%d, served_model_name=%s)",
             VLLM_VERSION,
             model_config.model,
             speculative_config,
@@ -192,6 +196,7 @@ class LLMEngine:
             model_config.quantization_param_path,
             device_config.device,
             decoding_config,
+            observability_config,
             model_config.seed,
             model_config.served_model_name,
         )
@@ -207,6 +212,8 @@ class LLMEngine:
         self.speculative_config = speculative_config
         self.load_config = load_config
         self.decoding_config = decoding_config or DecodingConfig()
+        self.observability_config = observability_config or ObservabilityConfig(
+        )
         self.log_stats = log_stats
 
         if not self.model_config.skip_tokenizer_init:
@@ -288,6 +295,12 @@ class LLMEngine:
                 max_model_len=self.model_config.max_model_len)
             self.stat_logger.info("cache_config", self.cache_config)
 
+        self.tracer = None
+        if self.observability_config.otlp_traces_endpoint:
+            self.tracer = init_tracer(
+                "vllm.llm_engine",
+                self.observability_config.otlp_traces_endpoint)
+
         # Create sequence output processor, e.g. for beam search or
         # speculative decoding.
         self.output_processor = (
@@ -444,6 +457,7 @@ class LLMEngine:
         params: Union[SamplingParams, PoolingParams],
         arrival_time: float,
         lora_request: Optional[LoRARequest],
+        trace_headers: Optional[Dict[str, str]] = None,
     ) -> None:
         # Create the sequences.
         block_size = self.cache_config.block_size
@@ -461,6 +475,7 @@ class LLMEngine:
                 params,
                 arrival_time=arrival_time,
                 lora_request=lora_request,
+                trace_headers=trace_headers,
             )
         elif isinstance(params, PoolingParams):
             seq_group = self._create_sequence_group_with_pooling(
@@ -507,6 +522,7 @@ class LLMEngine:
         params: Union[SamplingParams, PoolingParams],
         arrival_time: Optional[float] = None,
         lora_request: Optional[LoRARequest] = None,
+        trace_headers: Optional[Dict[str, str]] = None,
     ) -> None:
         """Add a request to the engine's request pool.
 
@@ -524,6 +540,7 @@ class LLMEngine:
                 :class:`~vllm.PoolingParams` for pooling.
             arrival_time: The arrival time of the request. If None, we use
                 the current monotonic time.
+            trace_headers: OpenTelemetry trace headers.
 
         Details:
             - Set arrival_time to the current time if it is None.
@@ -565,6 +582,7 @@ class LLMEngine:
             params=params,
             arrival_time=arrival_time,
             lora_request=lora_request,
+            trace_headers=trace_headers,
         )
 
     def _create_sequence_group_with_sampling(
@@ -574,6 +592,7 @@ class LLMEngine:
         sampling_params: SamplingParams,
         arrival_time: float,
         lora_request: Optional[LoRARequest],
+        trace_headers: Optional[Dict[str, str]] = None,
     ) -> SequenceGroup:
         """Creates a SequenceGroup with SamplingParams."""
         max_logprobs = self.get_model_config().max_logprobs
@@ -595,11 +614,14 @@ class LLMEngine:
             self.generation_config_fields)
 
         # Create the sequence group.
-        seq_group = SequenceGroup(request_id=request_id,
-                                  seqs=[seq],
-                                  arrival_time=arrival_time,
-                                  sampling_params=sampling_params,
-                                  lora_request=lora_request)
+        seq_group = SequenceGroup(
+            request_id=request_id,
+            seqs=[seq],
+            arrival_time=arrival_time,
+            sampling_params=sampling_params,
+            lora_request=lora_request,
+            trace_headers=trace_headers,
+        )
 
         return seq_group
 
@@ -793,6 +815,9 @@ class LLMEngine:
         # Log stats.
         self.do_log_stats(scheduler_outputs, output)
 
+        # Tracing
+        self.do_tracing(scheduler_outputs)
+
         if not request_outputs:
             # Stop the execute model loop in parallel workers until there are
             # more requests to process. This avoids waiting indefinitely in
@@ -986,3 +1011,62 @@ class LLMEngine:
 
     def check_health(self) -> None:
         self.model_executor.check_health()
+
+    def is_tracing_enabled(self) -> bool:
+        return self.tracer is not None
+
+    def do_tracing(self, scheduler_outputs: SchedulerOutputs) -> None:
+        if self.tracer is None:
+            return
+
+        for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups:
+            seq_group = scheduled_seq_group.seq_group
+            if seq_group.is_finished():
+                self.create_trace_span(seq_group)
+
+    def create_trace_span(self, seq_group: SequenceGroup) -> None:
+        if self.tracer is None or seq_group.sampling_params is None:
+            return
+        arrival_time_nano_seconds = int(seq_group.metrics.arrival_time * 1e9)
+
+        trace_context = extract_trace_context(seq_group.trace_headers)
+
+        with self.tracer.start_as_current_span(
+                "llm_request",
+                kind=SpanKind.SERVER,
+                context=trace_context,
+                start_time=arrival_time_nano_seconds) as seq_span:
+            metrics = seq_group.metrics
+            ttft = metrics.first_token_time - metrics.arrival_time
+            e2e_time = metrics.finished_time - metrics.arrival_time
+            # attribute names are based on
+            # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/llm-spans.md
+            seq_span.set_attribute(SpanAttributes.LLM_RESPONSE_MODEL,
+                                   self.model_config.model)
+            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_ID,
+                                   seq_group.request_id)
+            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TEMPERATURE,
+                                   seq_group.sampling_params.temperature)
+            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TOP_P,
+                                   seq_group.sampling_params.top_p)
+            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_MAX_TOKENS,
+                                   seq_group.sampling_params.max_tokens)
+            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_BEST_OF,
+                                   seq_group.sampling_params.best_of)
+            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_N,
+                                   seq_group.sampling_params.n)
+            seq_span.set_attribute(SpanAttributes.LLM_USAGE_NUM_SEQUENCES,
+                                   seq_group.num_seqs())
+            seq_span.set_attribute(SpanAttributes.LLM_USAGE_PROMPT_TOKENS,
+                                   len(seq_group.prompt_token_ids))
+            seq_span.set_attribute(
+                SpanAttributes.LLM_USAGE_COMPLETION_TOKENS,
+                sum([
+                    seq.get_output_len()
+                    for seq in seq_group.get_finished_seqs()
+                ]))
+            seq_span.set_attribute(SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE,
+                                   metrics.time_in_queue)
+            seq_span.set_attribute(
+                SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN, ttft)
+            seq_span.set_attribute(SpanAttributes.LLM_LATENCY_E2E, e2e_time)
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 769406124..744e1d945 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -31,6 +31,8 @@ from vllm.multimodal.utils import (async_get_and_parse_image,
                                    get_full_image_text_prompt)
 from vllm.outputs import RequestOutput
 from vllm.sequence import Logprob
+from vllm.tracing import (contains_trace_headers, extract_trace_headers,
+                          log_tracing_disabled_warning)
 from vllm.utils import random_uuid
 
 logger = init_logger(__name__)
@@ -267,11 +269,20 @@ class OpenAIServingChat(OpenAIServing):
         if image_data is not None:
             inputs["multi_modal_data"] = image_data
 
+        is_tracing_enabled = await self.engine.is_tracing_enabled()
+        trace_headers = None
+        if is_tracing_enabled and raw_request:
+            trace_headers = extract_trace_headers(raw_request.headers)
+        if not is_tracing_enabled and raw_request and contains_trace_headers(
+                raw_request.headers):
+            log_tracing_disabled_warning()
+
         result_generator = self.engine.generate(
             inputs,
             sampling_params,
             request_id,
             lora_request,
+            trace_headers=trace_headers,
         )
         # Streaming response
         if request.stream:
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 64671e21a..c775fa6da 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -24,6 +24,8 @@ from vllm.model_executor.guided_decoding import (
     get_guided_decoding_logits_processor)
 from vllm.outputs import RequestOutput
 from vllm.sequence import Logprob
+from vllm.tracing import (contains_trace_headers, extract_trace_headers,
+                          log_tracing_disabled_warning)
 from vllm.utils import merge_async_iterators, random_uuid
 
 logger = init_logger(__name__)
@@ -125,6 +127,14 @@ class OpenAIServingCompletion(OpenAIServing):
                         truncate_prompt_tokens)
                 prompt_ids, prompt_text = prompt_formats
 
+                is_tracing_enabled = await self.engine.is_tracing_enabled()
+                trace_headers = None
+                if is_tracing_enabled:
+                    trace_headers = extract_trace_headers(raw_request.headers)
+                if not is_tracing_enabled and contains_trace_headers(
+                        raw_request.headers):
+                    log_tracing_disabled_warning()
+
                 generator = self.engine.generate(
                     {
                         "prompt": prompt_text,
@@ -133,6 +143,7 @@ class OpenAIServingCompletion(OpenAIServing):
                     sampling_params,
                     f"{request_id}-{i}",
                     lora_request=lora_request,
+                    trace_headers=trace_headers,
                 )
 
                 generators.append(generator)
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 54243bfb1..38d3349f2 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -414,6 +414,7 @@ class SequenceGroup:
             for an embedding model.
         encoder_seq: Optional, the single encoder sequence. Should be None
                      unless you are working with an encoder/decoder model.
+        trace_headers: OpenTelemetry trace headers.
     """
 
     def __init__(
@@ -426,6 +427,7 @@ class SequenceGroup:
         embeddings: Optional[List[float]] = None,
         pooling_params: Optional[PoolingParams] = None,
         encoder_seq: Optional[Sequence] = None,
+        trace_headers: Optional[Dict[str, str]] = None,
     ) -> None:
         self.request_id = request_id
         self.seqs_dict = {seq.seq_id: seq for seq in seqs}
@@ -441,6 +443,7 @@ class SequenceGroup:
         self.embeddings = embeddings
         self.pooling_params = pooling_params
         self.encoder_seq = encoder_seq
+        self.trace_headers = trace_headers
 
     @property
     def prompt(self) -> Optional[str]:
diff --git a/vllm/tracing.py b/vllm/tracing.py
new file mode 100644
index 000000000..ba6732cab
--- /dev/null
+++ b/vllm/tracing.py
@@ -0,0 +1,104 @@
+import os
+from typing import Mapping, Optional
+
+from vllm.logger import init_logger
+from vllm.utils import run_once
+
+TRACE_HEADERS = ["traceparent", "tracestate"]
+
+logger = init_logger(__name__)
+
+_is_otel_installed = False
+try:
+    from opentelemetry.context.context import Context
+    from opentelemetry.sdk.environment_variables import (
+        OTEL_EXPORTER_OTLP_TRACES_PROTOCOL)
+    from opentelemetry.sdk.trace import TracerProvider
+    from opentelemetry.sdk.trace.export import BatchSpanProcessor
+    from opentelemetry.semconv.ai import SpanAttributes as BaseSpanAttributes
+    from opentelemetry.trace import SpanKind, Tracer, set_tracer_provider
+    from opentelemetry.trace.propagation.tracecontext import (
+        TraceContextTextMapPropagator)
+    _is_otel_installed = True
+except ImportError:
+
+    class Context:  # type: ignore
+        pass
+
+    class BaseSpanAttributes:  # type: ignore
+        pass
+
+    class SpanKind:  # type: ignore
+        pass
+
+    class Tracer:  # type: ignore
+        pass
+
+
+def is_otel_installed() -> bool:
+    return _is_otel_installed
+
+
+def init_tracer(instrumenting_module_name: str,
+                otlp_traces_endpoint: str) -> Optional[Tracer]:
+    assert is_otel_installed(), ("OpenTelemetry packages must be installed "
+                                 "prior to initializing a tracer")
+    trace_provider = TracerProvider()
+
+    span_exporter = get_span_exporter(otlp_traces_endpoint)
+    trace_provider.add_span_processor(BatchSpanProcessor(span_exporter))
+    set_tracer_provider(trace_provider)
+
+    tracer = trace_provider.get_tracer(instrumenting_module_name)
+    return tracer
+
+
+def get_span_exporter(endpoint):
+    protocol = os.environ.get(OTEL_EXPORTER_OTLP_TRACES_PROTOCOL, "grpc")
+    if protocol == "grpc":
+        from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
+            OTLPSpanExporter)
+    elif protocol == "http/protobuf":
+        from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
+            OTLPSpanExporter)
+    else:
+        raise ValueError(
+            f"Unsupported OTLP protocol '{protocol}' is configured")
+
+    return OTLPSpanExporter(endpoint=endpoint)
+
+
+def extract_trace_context(
+        headers: Optional[Mapping[str, str]]) -> Optional[Context]:
+    if is_otel_installed():
+        headers = headers or {}
+        return TraceContextTextMapPropagator().extract(headers)
+    else:
+        return None
+
+
+def extract_trace_headers(headers: Mapping[str, str]) -> Mapping[str, str]:
+
+    return {h: headers[h] for h in TRACE_HEADERS if h in headers}
+
+
+class SpanAttributes(BaseSpanAttributes):
+    # The following span attribute names are added here because they are missing
+    # from the Semantic Conventions for LLM.
+    LLM_REQUEST_ID = "gen_ai.request.id"
+    LLM_REQUEST_BEST_OF = "gen_ai.request.best_of"
+    LLM_REQUEST_N = "gen_ai.request.n"
+    LLM_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences"
+    LLM_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue"
+    LLM_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token"
+    LLM_LATENCY_E2E = "gen_ai.latency.e2e"
+
+
+def contains_trace_headers(headers: Mapping[str, str]) -> bool:
+    return any(h in headers for h in TRACE_HEADERS)
+
+
+@run_once
+def log_tracing_disabled_warning() -> None:
+    logger.warning(
+        "Received a request with trace context but tracing is disabled")
diff --git a/vllm/utils.py b/vllm/utils.py
index 1adfa9218..ffe921e65 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -763,3 +763,15 @@ def cuda_device_count_stateless() -> int:
     # after https://github.com/pytorch/pytorch/pull/122815 is released.
 
     return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
+
+
+#From: https://stackoverflow.com/a/4104188/2749989
+def run_once(f):
+
+    def wrapper(*args, **kwargs) -> Any:
+        if not wrapper.has_run:  # type: ignore[attr-defined]
+            wrapper.has_run = True  # type: ignore[attr-defined]
+            return f(*args, **kwargs)
+
+    wrapper.has_run = False  # type: ignore[attr-defined]
+    return wrapper
-- 
GitLab


From 95db455e7f337e99ffafd0b14367a7cbc11dca43 Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Tue, 18 Jun 2024 12:45:05 -0400
Subject: [PATCH 086/376] [Misc] Add channel-wise quantization support for w8a8
 dynamic per token activation quantization (#5542)

---
 tests/quantization/test_compressed_tensors.py | 13 +++++--
 vllm/model_executor/layers/linear.py          | 13 -------
 .../compressed_tensors/compressed_tensors.py  | 14 ++++---
 .../compressed_tensors_w8a8_dynamictoken.py   | 37 ++++++++++++++-----
 4 files changed, 45 insertions(+), 32 deletions(-)

diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 611c6b8b7..b78081155 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -14,7 +14,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
 
 
 def test_compressed_tensors_w8a8_static_setup(vllm_runner):
-    model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2"
+    model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
     with vllm_runner(model_path, enforce_eager=True) as llm:
         model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
         layer = model.model.layers[0]
@@ -43,15 +43,19 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner):
 
 
 def test_compressed_tensors_no_enforce_eager(vllm_runner):
-    model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2"
+    model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
     with vllm_runner(model_path) as llm:
         sampling_params = SamplingParams()
         output = llm.generate("Hello world!", sampling_params=sampling_params)
         assert output
 
 
-def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
-    model_path = "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"
+@pytest.mark.parametrize("model_args", [
+    ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
+    ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", "channel"),
+])
+def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args):
+    model_path, strategy = model_args
     with vllm_runner(model_path, dtype=torch.float16) as llm:
         model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
         layer = model.model.layers[0]
@@ -60,6 +64,7 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
 
         assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
         assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken)
+        assert qkv_proj.scheme.strategy == strategy
         assert qkv_proj.weight.dtype is torch.int8
 
 
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 58c379bcd..45f805547 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -468,13 +468,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                     "MergedColumnParallelLinear, assume the weight is "
                     "the same for all partitions.")
 
-        if fp8_scales_shard_indexer is None:
-            if len(param_data.shape) == 0:
-                param_data = param_data.reshape(1)
-
-            if len(loaded_weight.shape) == 0:
-                loaded_weight = loaded_weight.reshape(1)
-
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
@@ -686,12 +679,6 @@ class QKVParallelLinear(ColumnParallelLinear):
                     "QKVParallelLinear, assume the weight is the same "
                     "for all partitions.")
 
-        if len(param_data.shape) == 0:
-            param_data = param_data.reshape(1)
-
-        if len(loaded_weight.shape) == 0:
-            loaded_weight = loaded_weight.reshape(1)
-
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 92a84b3c0..347a052a6 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -95,14 +95,15 @@ class CompressedTensorsConfig(QuantizationConfig):
     def _is_dynamic_token_w8a8(self, weight_quant: BaseModel,
                                input_quant: BaseModel) -> bool:
         is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
-        is_token_tensor = (weight_quant.strategy
-                           == QuantizationStrategy.TENSOR.value) and (
-                               input_quant.strategy
-                               == QuantizationStrategy.TOKEN.value)
+        weight_strategy = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR.value
+            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value)
+        is_token = (weight_strategy and input_quant.strategy
+                    == QuantizationStrategy.TOKEN.value)
         is_symmetric = weight_quant.symmetric and input_quant.symmetric
         is_dynamic = not weight_quant.dynamic and input_quant.dynamic
 
-        return is_8_bits and is_token_tensor and is_symmetric and is_dynamic
+        return is_8_bits and is_token and is_symmetric and is_dynamic
 
     def _is_w4a16(self, weight_quant: BaseModel,
                   input_quant: BaseModel) -> bool:
@@ -133,7 +134,8 @@ class CompressedTensorsConfig(QuantizationConfig):
                 return CompressedTensorsW8A8StaticTensor()
 
             if self._is_dynamic_token_w8a8(weight_quant, input_quant):
-                return CompressedTensorsW8A8DynamicToken()
+                return CompressedTensorsW8A8DynamicToken(
+                    strategy=weight_quant.strategy)
 
         raise NotImplementedError(
             "No compressed-tensors compatible scheme was found.")
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
index d514d7b28..37610c9c2 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
@@ -6,6 +6,8 @@ from torch.nn import Parameter
 from vllm import _custom_ops as custom_ops
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    QuantizationStrategy)
 from vllm.model_executor.utils import set_weight_attrs
 
 __all__ = ["CompressedTensorsW8A8DynamicToken"]
@@ -13,6 +15,9 @@ __all__ = ["CompressedTensorsW8A8DynamicToken"]
 
 class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme):
 
+    def __init__(self, strategy: str):
+        self.strategy = strategy
+
     def _shard_id_as_int(self, shard_id: Union[str, int]) -> int:
         if isinstance(shard_id, int):
             return shard_id
@@ -45,11 +50,17 @@ class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme):
         # CompressedTensorsW8A8StaticTensor::create_weights for further
         # information.
         is_tensor_partitioned = len(output_partition_sizes) != 1
-        weight_scale_dim = sum(
-            output_partition_sizes) if is_tensor_partitioned else 1
+        # when doing channel-wise quantization, number of scales
+        # is equal to output_dim
+        weight_scale_dim = sum(output_partition_sizes) if (
+            is_tensor_partitioned
+            or self.strategy == QuantizationStrategy.CHANNEL) else 1
+
+        shape: Union[Tuple[int], Tuple[int, int]] = (weight_scale_dim, )
+        if self.strategy == QuantizationStrategy.CHANNEL:
+            shape = (weight_scale_dim, 1)
 
-        weight_scale = Parameter(torch.empty(weight_scale_dim,
-                                             dtype=torch.float32),
+        weight_scale = Parameter(torch.empty(*shape, dtype=torch.float32),
                                  requires_grad=False)
 
         weight = Parameter(torch.empty(sum(output_partition_sizes),
@@ -67,12 +78,20 @@ class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme):
             })
 
         layer.register_parameter("weight_scale", weight_scale)
-        set_weight_attrs(
-            weight_scale, {
-                "weight_loader": weight_loader,
-                "shard_splitter": self.scales_shard_splitter,
-                "logical_widths": output_partition_sizes
+        set_weight_attrs(weight_scale, {"weight_loader": weight_loader})
+
+        # Don't need a shard_splitter for channel-wise quantization
+        # Use the default loading method
+        if self.strategy == QuantizationStrategy.CHANNEL:
+            set_weight_attrs(weight_scale, {
+                "output_dim": 0,
             })
+        else:
+            set_weight_attrs(
+                weight_scale, {
+                    "logical_widths": output_partition_sizes,
+                    "shard_splitter": self.scales_shard_splitter,
+                })
 
     def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
         weight = layer.weight
-- 
GitLab


From 19091efc44c6f9b1e008dc5469c63a1f01684745 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Tue, 18 Jun 2024 11:00:36 -0700
Subject: [PATCH 087/376] [ci] Setup Release pipeline and build release wheels
 with cache (#5610)

Signed-off-by: kevin <kevin@anyscale.com>
---
 .buildkite/release-pipeline.yaml | 21 ++++++++++++
 Dockerfile                       | 58 ++++++++++++++++++++++----------
 2 files changed, 62 insertions(+), 17 deletions(-)
 create mode 100644 .buildkite/release-pipeline.yaml

diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
new file mode 100644
index 000000000..1959f9752
--- /dev/null
+++ b/.buildkite/release-pipeline.yaml
@@ -0,0 +1,21 @@
+steps:
+  - block: "Build wheels"
+
+  - label: "Build wheel - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}" 
+    agents:
+      queue: cpu_queue
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host"
+      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
+    matrix:
+      setup:
+        cuda_version:
+          - "11.8.0"
+          - "12.1.0"
+        python_version:
+          - "3.8"
+          - "3.9"
+          - "3.10"
+          - "3.11"
diff --git a/Dockerfile b/Dockerfile
index 72894e7ca..5b3e682a8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,9 +5,26 @@
 # docs/source/dev/dockerfile/dockerfile.rst and
 # docs/source/assets/dev/dockerfile-stages-dependency.png
 
+ARG CUDA_VERSION=12.4.1
 #################### BASE BUILD IMAGE ####################
 # prepare basic build environment
-FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base
+
+ARG CUDA_VERSION=12.4.1
+ARG PYTHON_VERSION=3
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt-get update -y \
+    && apt-get install -y ccache software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update -y \
+    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv python3-pip \
+    && if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
+    && python3 --version \
+    && python3 -m pip --version
 
 RUN apt-get update -y \
     && apt-get install -y python3-pip git curl sudo
@@ -16,7 +33,7 @@ RUN apt-get update -y \
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
 # this won't be needed for future versions of this docker image
 # or future versions of triton.
-RUN ldconfig /usr/local/cuda-12.4/compat/
+RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 
 WORKDIR /workspace
 
@@ -24,14 +41,7 @@ WORKDIR /workspace
 COPY requirements-common.txt requirements-common.txt
 COPY requirements-cuda.txt requirements-cuda.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-cuda.txt
-
-# install development dependencies
-COPY requirements-lint.txt requirements-lint.txt
-COPY requirements-test.txt requirements-test.txt
-COPY requirements-dev.txt requirements-dev.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-dev.txt
+    python3 -m pip install -r requirements-cuda.txt
 
 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
@@ -41,14 +51,16 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 #################### BASE BUILD IMAGE ####################
 
-
 #################### WHEEL BUILD IMAGE ####################
-FROM dev AS build
+FROM base AS build
+
+ARG PYTHON_VERSION=3
 
 # install build dependencies
 COPY requirements-build.txt requirements-build.txt
+
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-build.txt
+    python3 -m pip install -r requirements-build.txt
 
 # install compiler cache to speed up compilation leveraging local or remote caching
 RUN apt-get update -y && apt-get install -y ccache
@@ -101,9 +113,21 @@ RUN python3 check-wheel-size.py dist
 
 #################### EXTENSION Build IMAGE ####################
 
+#################### DEV IMAGE ####################
+FROM base as dev
+
+COPY requirements-lint.txt requirements-lint.txt
+COPY requirements-test.txt requirements-test.txt
+COPY requirements-dev.txt requirements-dev.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -r requirements-dev.txt
+
+#################### DEV IMAGE ####################
+
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
+FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
+ARG CUDA_VERSION=12.4.1
 WORKDIR /vllm-workspace
 
 RUN apt-get update -y \
@@ -113,12 +137,12 @@ RUN apt-get update -y \
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
 # this won't be needed for future versions of this docker image
 # or future versions of triton.
-RUN ldconfig /usr/local/cuda-12.4/compat/
+RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 
 # install vllm wheel first, so that torch etc will be installed
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
-    pip install dist/*.whl --verbose
+    python3 -m pip install dist/*.whl --verbose
 #################### vLLM installation IMAGE ####################
 
 
@@ -131,7 +155,7 @@ ADD . /vllm-workspace/
 
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-dev.txt
+    python3 -m pip install -r requirements-dev.txt
 
 # doc requires source code
 # we hide them inside `test_docs/` , so that this source code
-- 
GitLab


From 07feecde1a69859d565786a7ad64c0f604f17b28 Mon Sep 17 00:00:00 2001
From: sergey-tinkoff <167607910+sergey-tinkoff@users.noreply.github.com>
Date: Tue, 18 Jun 2024 21:01:21 +0300
Subject: [PATCH 088/376] [Model] LoRA support added for command-r (#5178)

---
 csrc/punica/bgmv/bgmv_config.h         |  6 ++++
 tests/lora/test_punica.py              |  2 ++
 vllm/model_executor/models/commandr.py | 48 ++++++++++++++++++++++----
 3 files changed, 50 insertions(+), 6 deletions(-)
 mode change 100644 => 100755 csrc/punica/bgmv/bgmv_config.h

diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h
old mode 100644
new mode 100755
index 0456b4bc2..c38db2dcd
--- a/csrc/punica/bgmv/bgmv_config.h
+++ b/csrc/punica/bgmv/bgmv_config.h
@@ -69,6 +69,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 36864) \
     f(in_T, out_T, W_T, narrow, 43264) \
     f(in_T, out_T, W_T, narrow, 49152) \
+    f(in_T, out_T, W_T, narrow, 60544) \
+    f(in_T, out_T, W_T, narrow, 60672) \
     f(in_T, out_T, W_T, narrow, 64000) \
     f(in_T, out_T, W_T, narrow, 64256) \
     f(in_T, out_T, W_T, narrow, 64512) \
@@ -78,6 +80,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 128000) \
     f(in_T, out_T, W_T, narrow, 128256) \
     f(in_T, out_T, W_T, narrow, 128512) \
+    
+    
 // Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA
 // and vllm/tests/lora/test_punica.py
 
@@ -144,6 +148,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, 36864, narrow) \
     f(in_T, out_T, W_T, 43264, narrow) \
     f(in_T, out_T, W_T, 49152, narrow) \
+    f(in_T, out_T, W_T, 60544, narrow) \
+    f(in_T, out_T, W_T, 60672, narrow) \
     f(in_T, out_T, W_T, 64000, narrow) \
     f(in_T, out_T, W_T, 64256, narrow) \
     f(in_T, out_T, W_T, 64512, narrow) \
diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py
index d87658e5d..dae1d5687 100644
--- a/tests/lora/test_punica.py
+++ b/tests/lora/test_punica.py
@@ -94,6 +94,8 @@ H1 = H2 = [
     36864,
     43264,
     49152,
+    60544,
+    60672,
     64000,
     64256,
     102400,
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 11d88d45e..600c2990b 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -29,7 +29,7 @@ from torch.nn.parameter import Parameter
 from transformers import CohereConfig
 
 from vllm.attention import Attention, AttentionMetadata
-from vllm.config import CacheConfig
+from vllm.config import CacheConfig, LoRAConfig
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size)
 from vllm.model_executor.layers.activation import SiluAndMul
@@ -265,10 +265,14 @@ class CohereModel(nn.Module):
         config: CohereConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
+        lora_config: Optional[LoRAConfig] = None,
     ):
         super().__init__()
         self.config = config
-        self.vocab_size = config.vocab_size
+        lora_vocab = (lora_config.lora_extra_vocab_size *
+                      (lora_config.max_loras or 1)) if lora_config else 0
+        self.vocab_size = config.vocab_size + lora_vocab
+        self.org_vocab_size = config.vocab_size
         self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
                                                    config.hidden_size)
         self.layers = nn.ModuleList([
@@ -302,18 +306,44 @@ class CohereModel(nn.Module):
 
 class CohereForCausalLM(nn.Module):
 
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens"
+    ]
+    embedding_modules = {"embed_tokens": "input_embeddings"}
+    embedding_padding_modules = []
+
     def __init__(
         self,
         config: CohereConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
+        lora_config: Optional[LoRAConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
+        self.unpadded_vocab_size = config.vocab_size
+        if lora_config:
+            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
         self.quant_config = quant_config
-        self.logits_processor = LogitsProcessor(config.vocab_size,
+        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
+                                                config.vocab_size,
                                                 scale=config.logit_scale)
-        self.model = CohereModel(config, cache_config, quant_config)
+        self.model = CohereModel(config,
+                                 cache_config,
+                                 quant_config,
+                                 lora_config=lora_config)
         self.sampler = Sampler()
 
     @torch.no_grad()
@@ -330,8 +360,14 @@ class CohereForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.model.embed_tokens.weight,
-                                       hidden_states, sampling_metadata)
+        is_not_lora = hasattr(self.model.embed_tokens, 'weight')
+        if is_not_lora:
+            embedding_weights = self.model.embed_tokens.weight
+        else:
+            embedding_weights = self.model.embed_tokens.base_layer.weight
+
+        logits = self.logits_processor(embedding_weights, hidden_states,
+                                       sampling_metadata)
         return logits
 
     def sample(
-- 
GitLab


From 8a173382c80d6730e1bbc81f932ac3721ab2cd9d Mon Sep 17 00:00:00 2001
From: Thomas Parnell <tpa@zurich.ibm.com>
Date: Tue, 18 Jun 2024 23:18:37 +0200
Subject: [PATCH 089/376] [Bugfix] Fix for inconsistent behaviour related to
 sampling and repetition penalties  (#5639)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
---
 vllm/model_executor/sampling_metadata.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py
index 7ad84f51b..f95de56f3 100644
--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@@ -386,18 +386,10 @@ class SamplingTensors:
                 presence_penalties += [0] * prefill_len
                 frequency_penalties += [0] * prefill_len
                 repetition_penalties += [1] * prefill_len
-                if do_penalties:
-                    prompt_tokens.extend([] for _ in range(prefill_len))
-                    output_tokens.extend([] for _ in range(prefill_len))
 
             if seq_group.do_sample:
                 sample_lens = len(seq_group.sample_indices)
                 assert sample_lens == len(seq_ids)
-                for seq_id in seq_ids:
-                    seq_data = seq_group.seq_data[seq_id]
-                    if do_penalties:
-                        prompt_tokens.append(seq_data.prompt_token_ids)
-                        output_tokens.append(seq_data.output_token_ids)
                 temperatures += [temperature] * len(seq_ids)
                 top_ps += [top_p] * len(seq_ids)
                 top_ks += [top_k] * len(seq_ids)
@@ -424,6 +416,20 @@ class SamplingTensors:
                 sampling_seeds.append(seq_seeds)
             sample_indices.extend(seq_group.sample_indices)
 
+        if do_penalties:
+            for seq_group in sampling_metadata.seq_groups:
+                seq_ids = seq_group.seq_ids
+                if (seq_group.is_prompt
+                        and sampling_params.prompt_logprobs is not None):
+                    prefill_len = len(seq_group.prompt_logprob_indices)
+                    prompt_tokens.extend([] for _ in range(prefill_len))
+                    output_tokens.extend([] for _ in range(prefill_len))
+                if seq_group.do_sample:
+                    for seq_id in seq_ids:
+                        seq_data = seq_group.seq_data[seq_id]
+                        prompt_tokens.append(seq_data.prompt_token_ids)
+                        output_tokens.append(seq_data.output_token_ids)
+
         sampling_tensors = SamplingTensors.from_lists(
             temperatures, top_ps, top_ks, min_ps, presence_penalties,
             frequency_penalties, repetition_penalties, sampling_seeds,
-- 
GitLab


From 2bd231a7b7787407ccba36f966603578842d03f7 Mon Sep 17 00:00:00 2001
From: milo157 <43028253+milo157@users.noreply.github.com>
Date: Tue, 18 Jun 2024 18:56:59 -0400
Subject: [PATCH 090/376] [Doc] Added cerebrium as Integration option (#5553)

---
 .../serving/deploying_with_cerebrium.rst      | 109 ++++++++++++++++++
 docs/source/serving/integrations.rst          |   1 +
 2 files changed, 110 insertions(+)
 create mode 100644 docs/source/serving/deploying_with_cerebrium.rst

diff --git a/docs/source/serving/deploying_with_cerebrium.rst b/docs/source/serving/deploying_with_cerebrium.rst
new file mode 100644
index 000000000..ff0ac9111
--- /dev/null
+++ b/docs/source/serving/deploying_with_cerebrium.rst
@@ -0,0 +1,109 @@
+.. _deploying_with_cerebrium:
+
+Deploying with Cerebrium
+============================
+
+.. raw:: html
+
+    <p align="center">
+        <img src="https://i.ibb.co/hHcScTT/Screenshot-2024-06-13-at-10-14-54.png" alt="vLLM_plus_cerebrium"/>
+    </p>
+
+vLLM can be run on a cloud based GPU machine with `Cerebrium <https://www.cerebrium.ai/>`__, a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications.
+
+To install the Cerebrium client, run:
+
+.. code-block:: console
+
+    $ pip install cerebrium
+    $ cerebrium login
+
+Next, create your Cerebrium project, run:
+    
+.. code-block:: console
+
+    $ cerebrium init vllm-project
+
+Next, to install the required packages, add the following to your cerebrium.toml:
+
+.. code-block:: toml
+
+    [cerebrium.dependencies.pip]
+    vllm = "latest"
+
+Next, let us add our code to handle inference for the LLM of your choice(`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your main.py`:
+    
+.. code-block:: python
+
+    from vllm import LLM, SamplingParams
+
+    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1")
+
+    def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):
+    
+        sampling_params = SamplingParams(temperature=temperature, top_p=top_p)
+        outputs = llm.generate(prompts, sampling_params)
+
+        # Print the outputs.
+        results = []
+        for output in outputs:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            results.append({"prompt": prompt, "generated_text": generated_text})
+
+        return {"results": results}
+
+
+Then, run the following code to deploy it to the cloud
+
+.. code-block:: console
+
+    $ cerebrium deploy
+
+If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case /run)
+
+.. code-block:: python
+
+    curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
+     -H 'Content-Type: application/json' \
+     -H 'Authorization: <JWT TOKEN>' \
+     --data '{
+       "prompts": [
+         "Hello, my name is",
+         "The president of the United States is",
+         "The capital of France is",
+         "The future of AI is"
+       ]
+     }'
+
+You should get a response like:
+
+.. code-block:: python
+    
+    {
+        "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
+        "result": {
+            "result": [
+                {
+                    "prompt": "Hello, my name is",
+                    "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of"
+                },
+                {
+                    "prompt": "The president of the United States is",
+                    "generated_text": " elected every four years. This is a democratic system.\n\n5. What"
+                },
+                {
+                    "prompt": "The capital of France is",
+                    "generated_text": " Paris.\n"
+                },
+                {
+                    "prompt": "The future of AI is",
+                    "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective."
+                }
+            ]
+        },
+        "run_time_ms": 152.53663063049316
+    }
+
+You now have an autoscaling endpoint where you only pay for the compute you use!
+
diff --git a/docs/source/serving/integrations.rst b/docs/source/serving/integrations.rst
index 83a8b5a88..680ea523d 100644
--- a/docs/source/serving/integrations.rst
+++ b/docs/source/serving/integrations.rst
@@ -8,6 +8,7 @@ Integrations
    deploying_with_kserve
    deploying_with_triton
    deploying_with_bentoml
+   deploying_with_cerebrium
    deploying_with_lws
    deploying_with_dstack
    serving_with_langchain
-- 
GitLab


From b23ce9203235488e080434108d3504d54b24e867 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Tue, 18 Jun 2024 19:48:49 -0400
Subject: [PATCH 091/376] [Bugfix] Fix CUDA version check for mma warning
 suppression (#5642)

---
 csrc/quantization/marlin/sparse/common/mma.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h
index 8a6c65338..b26505f77 100644
--- a/csrc/quantization/marlin/sparse/common/mma.h
+++ b/csrc/quantization/marlin/sparse/common/mma.h
@@ -17,6 +17,7 @@
 
 #pragma once
 #include "base.h"
+#include <cudaTypedefs.h>
 
 namespace marlin_24 {
 
@@ -26,7 +27,7 @@ namespace marlin_24 {
 //  | Advisory: Modifier ‘.sp::ordered_metadata’ should be used on instruction
 //  | ‘mma’ instead of modifier ‘.sp’ as it is expected to have substantially
 //  | reduced performance on some future architectures
-#if defined CUDA_VERSION && CUDA_VERSION >= 12500
+#if defined CUDA_VERSION && CUDA_VERSION >= 12050
   #define MMA_SP_INST \
     "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
 #else
-- 
GitLab


From 6820724e51079120251c8522afd385ca64abc948 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Tue, 18 Jun 2024 20:33:25 -0400
Subject: [PATCH 092/376] [Bugfix] Fix w8a8 benchmarks for int8 case (#5643)

---
 benchmarks/cutlass_benchmarks/w8a8_benchmarks.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
index 523e970c2..5cc0fbbd4 100644
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -120,9 +120,8 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
 
     # cutlass impl
     timers.append(
-        bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
-                 torch.bfloat16, label, sub_label, cutlass_impl,
-                 "cutlass_i8_i8_bf16_scaled_mm"))
+        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
+                 cutlass_impl, "cutlass_i8_i8_bf16_scaled_mm"))
 
     return timers
 
-- 
GitLab


From 59a1eb59c9cb383e5ea36d7253f81ff2ea7766cc Mon Sep 17 00:00:00 2001
From: Shukant Pal <SukantK2002@outlook.com>
Date: Tue, 18 Jun 2024 18:46:38 -0700
Subject: [PATCH 093/376] [Bugfix] Fix Phi-3 Long RoPE scaling implementation
 (#5628)

---
 vllm/model_executor/layers/rotary_embedding.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 9c0a74cda..a0b19046b 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -507,8 +507,8 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
         dtype: torch.dtype,
         short_factor: List[float],
         long_factor: List[float],
-        short_mscale: float = 1.1,
-        long_mscale: float = 1.225,
+        short_mscale: float = 1.0,
+        long_mscale: float = 1.0,
     ):
         super().__init__()
 
@@ -530,6 +530,16 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
         self.short_mscale = short_mscale
         self.long_mscale = long_mscale
 
+        scale = (self.max_position_embeddings /
+                 self.original_max_position_embeddings)
+
+        if scale <= 1.0:
+            self.scaling_factor = 1.0
+        else:
+            self.scaling_factor = math.sqrt(
+                1 + math.log(scale) /
+                math.log(self.original_max_position_embeddings))
+
         short_cache = self._compute_cos_sin_cache(
             original_max_position_embeddings, short_factor, short_mscale)
         short_cache = short_cache.to(dtype)
@@ -565,8 +575,8 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
         inv_freq = self._compute_inv_freq(rescale_factors)
         t = torch.arange(max_position_embeddings, dtype=torch.float)
         freqs = torch.einsum("i,j -> ij", t, inv_freq)
-        cos = freqs.cos() * mscale
-        sin = freqs.sin() * mscale
+        cos = freqs.cos() * mscale * self.scaling_factor
+        sin = freqs.sin() * mscale * self.scaling_factor
         cache = torch.cat((cos, sin), dim=-1)
         return cache
 
-- 
GitLab


From e5150f2c281f052df42121ae60827156abe57173 Mon Sep 17 00:00:00 2001
From: Thomas Parnell <tpa@zurich.ibm.com>
Date: Wed, 19 Jun 2024 08:03:55 +0200
Subject: [PATCH 094/376] [Bugfix] Added test for sampling repetition penalty
 bug. (#5659)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
---
 tests/samplers/test_sampler.py | 69 ++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py
index c6ef4358e..0aabde6aa 100644
--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -631,3 +631,72 @@ def test_sampler_top_k_top_p(seed: int, device: str):
     hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float)
     assert torch.allclose(hf_probs, sample_probs, atol=1e-5)
     assert torch.equal(hf_probs.eq(0), sample_probs.eq(0))
+
+
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_repetition_penalty_mixed(device: str):
+
+    vocab_size = 8
+
+    def test_sampling_params(sampling_params: List[SamplingParams]):
+
+        seq_group_metadata_list: List[SequenceGroupMetadata] = []
+        seq_lens: List[int] = []
+        for i in range(2):
+            seq_group_metadata_list.append(
+                SequenceGroupMetadata(
+                    request_id=f"test_{i}",
+                    is_prompt=True,
+                    seq_data={0: SequenceData([1, 2, 3])},
+                    sampling_params=sampling_params[i],
+                    block_tables={0: [1]},
+                ))
+            seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
+
+        sampling_metadata = SamplingMetadata.prepare(
+            seq_group_metadata_list,
+            seq_lens,
+            query_lens=seq_lens,
+            device=device,
+            pin_memory=is_pin_memory_available())
+
+        fake_logits = torch.full((2, vocab_size),
+                                 1e-2,
+                                 device=device,
+                                 dtype=torch.float16)
+
+        fake_logits[:, 5] = 1.1e-2
+        fake_logits[:, 1] = 1.2e-2
+
+        sampler = MockLogitsSampler(fake_logits)
+
+        sampler_output = sampler(logits=fake_logits,
+                                 sampling_metadata=sampling_metadata)
+
+        generated_tokens = []
+        for output in sampler_output:
+            generated_tokens.append(output.samples[0].output_token)
+
+        return generated_tokens
+
+    # one configuration is greedy with repetition_penalty
+    sampling_params_rep = SamplingParams(
+        temperature=0.0,
+        repetition_penalty=2.0,
+    )
+
+    # other configuration is sampling w/o repetition_penalty
+    sampling_params_sample = SamplingParams(
+        temperature=1.0,
+        top_k=1,
+        seed=42,
+    )
+
+    tokens1 = test_sampling_params(
+        [sampling_params_rep, sampling_params_sample])
+
+    tokens2 = test_sampling_params(
+        [sampling_params_sample, sampling_params_rep])
+
+    assert tokens1[0] == tokens2[1]
+    assert tokens1[1] == tokens2[0]
-- 
GitLab


From f758aed0e851687e919a4ee09ab872ee2c8fe159 Mon Sep 17 00:00:00 2001
From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Date: Wed, 19 Jun 2024 02:21:29 -0400
Subject: [PATCH 095/376] [Bugfix][CI/Build][AMD][ROCm]Fixed the cmake build
 bug which generate garbage on certain devices (#5641)

---
 Dockerfile.rocm   | 17 ++++++++---------
 cmake/utils.cmake |  5 ++++-
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 724fa1673..6bda69685 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -7,9 +7,8 @@ ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
 
 RUN echo "Base image is $BASE_IMAGE"
 
-# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
-# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
-
+ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \
+    ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
 
 ARG FA_GFX_ARCHS="gfx90a;gfx942"
 RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
@@ -68,7 +67,7 @@ RUN if [ "$BUILD_FA" = "1" ]; then \
     && git checkout ${FA_BRANCH} \
     && git submodule update --init \
     && export GPU_ARCHS=${FA_GFX_ARCHS} \
-    && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \
+    && if [ "$BASE_IMAGE" = "$ROCm_5_7_BASE" ]; then \
         patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \
     && python3 setup.py install \
     && cd ..; \
@@ -76,7 +75,7 @@ RUN if [ "$BUILD_FA" = "1" ]; then \
 
 # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
 # Manually removed it so that later steps of numpy upgrade can continue
-RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
+RUN if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \
     rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi
 
 # build triton
@@ -107,11 +106,11 @@ ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
     pip install -U -r requirements-rocm.txt \
-    && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
+    && if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \
+       patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch; fi \
     && python3 setup.py install \
-    && cp build/lib.linux-x86_64-cpython-39/vllm/_C.abi3.so vllm/ \
-    && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.abi3.so vllm/ \
-    && cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.abi3.so vllm/ \
+    && export VLLM_PYTHON_VERSION=$(python -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))") \
+    && cp build/lib.linux-x86_64-cpython-${VLLM_PYTHON_VERSION}/vllm/*.so vllm/ \
     && cd ..
 
 
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index f3c1286dd..071e16336 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -155,8 +155,11 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
     # Find the intersection of the supported + detected architectures to
     # set the module architecture flags.
     #
+
+    set(VLLM_ROCM_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")
+
     set(${GPU_ARCHES})
-    foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES})
+    foreach (_ARCH ${VLLM_ROCM_SUPPORTED_ARCHS})
       if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
         list(APPEND ${GPU_ARCHES} ${_ARCH})
       endif()
-- 
GitLab


From 3eea74889fe29534808bae41fca251e0e74c0962 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 19 Jun 2024 01:05:00 -0700
Subject: [PATCH 096/376] [misc][distributed] use 127.0.0.1 for single-node
 (#5619)

---
 vllm/executor/multiproc_gpu_executor.py |  7 +++++--
 vllm/executor/ray_gpu_executor.py       | 10 ++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py
index 8385e56f8..e63e5a3a0 100644
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/multiproc_gpu_executor.py
@@ -10,7 +10,7 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.utils import (cuda_device_count_stateless,
-                        get_distributed_init_method, get_ip, get_open_port,
+                        get_distributed_init_method, get_open_port,
                         get_vllm_instance_id, make_async)
 
 logger = init_logger(__name__)
@@ -37,8 +37,11 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
         assert world_size <= cuda_device_count_stateless(), (
             "please set tensor_parallel_size to less than max local gpu count")
 
+        # Multiprocessing-based executor does not support multi-node setting.
+        # Since it only works for single node, we can use the loopback address
+        # 127.0.0.1 for communication.
         distributed_init_method = get_distributed_init_method(
-            get_ip(), get_open_port())
+            "127.0.0.1", get_open_port())
 
         if world_size == 1:
             self.workers = []
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 843332e5e..fc83c5528 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -161,6 +161,16 @@ class RayGPUExecutor(DistributedGPUExecutor):
         self._run_workers("update_environment_variables",
                           all_args=all_args_to_update_environment_variables)
 
+        if len(node_gpus) == 1:
+            # in single node case, we don't need to get the IP address.
+            # the loopback address is sufficient
+            # NOTE: a node may have several IP addresses, one for each
+            # network interface. `get_ip()` might return any of them,
+            # while they might not work for communication inside the node
+            # if the network setup is complicated. Using the loopback address
+            # solves this issue, as it always works for communication inside
+            # the node.
+            driver_ip = "127.0.0.1"
         distributed_init_method = get_distributed_init_method(
             driver_ip, get_open_port())
 
-- 
GitLab


From da971ec7a5b35f33981cff9ca50064d3166953f9 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Wed, 19 Jun 2024 05:38:26 -0400
Subject: [PATCH 097/376] [Model] Add FP8 kv cache for Qwen2 (#5656)

---
 vllm/model_executor/models/qwen2.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 9a4829a27..b5d13bb6b 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -46,6 +46,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import SamplerOutput
+from vllm.utils import print_warning_once
 
 
 class Qwen2MLP(nn.Module):
@@ -375,6 +376,19 @@ class Qwen2ForCausalLM(nn.Module):
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
+                # Remapping the name of FP8 kv-scale.
+                if name.endswith("kv_scale"):
+                    remapped_kv_scale_name = name.replace(
+                        ".kv_scale", ".attn.kv_scale")
+                    if remapped_kv_scale_name not in params_dict:
+                        print_warning_once(
+                            f"Found kv scale in the checkpoint (e.g. {name}), "
+                            "but not found the expected name in the model "
+                            f"(e.g. {remapped_kv_scale_name}). kv-scale is "
+                            "not loaded.")
+                        continue
+                    else:
+                        name = remapped_kv_scale_name
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
-- 
GitLab


From 7d46c8d37864993162bbeb61dc19b5ad6043646d Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Wed, 19 Jun 2024 17:58:32 +0800
Subject: [PATCH 098/376] [Bugfix] Fix sampling_params passed incorrectly in
 Phi3v example (#5684)

---
 examples/phi3v_example.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py
index d5e60ae1e..4f37c47dd 100644
--- a/examples/phi3v_example.py
+++ b/examples/phi3v_example.py
@@ -12,7 +12,6 @@ def run_phi3v():
     llm = LLM(
         model=model_path,
         trust_remote_code=True,
-        max_model_len=4096,
         image_input_type="pixel_values",
         image_token_id=32044,
         image_input_shape="1,3,1008,1344",
@@ -28,11 +27,12 @@ def run_phi3v():
 
     sampling_params = SamplingParams(temperature=0, max_tokens=64)
 
-    outputs = llm.generate({
-        "prompt": prompt,
-        "sampling_params": sampling_params,
-        "multi_modal_data": ImagePixelData(image),
-    })
+    outputs = llm.generate(
+        {
+            "prompt": prompt,
+            "multi_modal_data": ImagePixelData(image),
+        },
+        sampling_params=sampling_params)
     for o in outputs:
         generated_text = o.outputs[0].text
         print(generated_text)
-- 
GitLab


From d8714530d11603a159a46ea0dde299f95807cfde Mon Sep 17 00:00:00 2001
From: DearPlanet <junsong.zhang2021.work@outlook.com>
Date: Wed, 19 Jun 2024 18:19:08 +0800
Subject: [PATCH 099/376] [Misc]Add param max-model-len in benchmark_latency.py
 (#5629)

---
 benchmarks/benchmark_latency.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 98e0be277..e9d1048c8 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -29,6 +29,7 @@ def main(args: argparse.Namespace):
         tensor_parallel_size=args.tensor_parallel_size,
         trust_remote_code=args.trust_remote_code,
         dtype=args.dtype,
+        max_model_len=args.max_model_len,
         enforce_eager=args.enforce_eager,
         kv_cache_dtype=args.kv_cache_dtype,
         quantization_param_path=args.quantization_param_path,
@@ -150,6 +151,12 @@ if __name__ == '__main__':
     parser.add_argument('--trust-remote-code',
                         action='store_true',
                         help='trust remote code from huggingface')
+    parser.add_argument(
+        '--max-model-len',
+        type=int,
+        default=None,
+        help='Maximum length of a sequence (including prompt and output). '
+        'If None, will be derived from the model.')
     parser.add_argument(
         '--dtype',
         type=str,
-- 
GitLab


From e9c2732b976612b6362635be2984f03bfabc20ec Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 19 Jun 2024 22:37:33 +0800
Subject: [PATCH 100/376] [CI/Build] Add tqdm to dependencies (#5680)

---
 requirements-common.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements-common.txt b/requirements-common.txt
index 32e2ebe8c..05969cfa5 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -4,6 +4,7 @@ psutil
 sentencepiece  # Required for LLaMA tokenizer.
 numpy < 2.0.0
 requests
+tqdm
 py-cpuinfo
 transformers >= 4.40.0  # Required for StarCoder2 & Llava, Llama 3.
 tokenizers >= 0.19.1  # Required for Llama 3.
-- 
GitLab


From 3ee5c4bca514ee95592a018fae95e050fd6763c0 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Wed, 19 Jun 2024 07:42:13 -0700
Subject: [PATCH 101/376] [ci] Add A100 queue into AWS CI template (#5648)

Signed-off-by: kevin <kevin@anyscale.com>
---
 .../benchmark-pipeline.yaml                   |  1 +
 .buildkite/test-pipeline.yaml                 |  5 ++
 .buildkite/test-template-aws.j2               | 46 +++++++++++++++++++
 3 files changed, 52 insertions(+)

diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
index 8f12748b6..2b25c954b 100644
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -17,6 +17,7 @@ steps:
     plugins:
     - kubernetes:
         podSpec:
+          priorityClassName: perf-benchmark
           containers:
           - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
             command:
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 5afe37302..c2160fee3 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -181,3 +181,8 @@ steps:
   commands:
   - pip install -r requirements-docs.txt
   - SPHINXOPTS=\"-W\" make html
+
+- label: A100 status
+  gpu: a100
+  commands: 
+  - nvidia-smi
diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2
index 01f7ff1e0..08146bf44 100644
--- a/.buildkite/test-template-aws.j2
+++ b/.buildkite/test-template-aws.j2
@@ -49,6 +49,51 @@ steps:
     command: bash .buildkite/run-cpu-test.sh
 
   {% for step in steps %}
+  {% if step.gpu == "a100" %}
+  - label: "{{ step.label }}"
+    agents:
+      queue: a100-queue
+    soft_fail: {{ step.soft_fail or false }}
+    {% if step.parallelism %}
+    parallelism: {{ step.parallelism }}
+    {% endif %}
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 5
+        - exit_status: -10  # Agent was lost
+          limit: 5
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: ci
+          containers:
+          - image: {{ docker_image }}
+            command: ["bash"]
+            args:
+            - '-c'
+            - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+  {% else %}
   - label: "{{ step.label }}"
     agents:
       {% if step.label == "Documentation Build" %}
@@ -90,4 +135,5 @@ steps:
             {% endif %}
           volumes:
             - /dev/shm:/dev/shm
+  {% endif %}
   {% endfor %}
-- 
GitLab


From afed90a0344b1b0ce6aae46efc630adb489ec769 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Wed, 19 Jun 2024 14:41:42 -0400
Subject: [PATCH 102/376] [Frontend][Bugfix] Fix preemption_mode ->
 preemption-mode for CLI arg in arg_utils.py (#5688)

---
 vllm/engine/arg_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 647793a6d..7f760c277 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -577,7 +577,7 @@ class EngineArgs:
                             'This should be a JSON string that will be '
                             'parsed into a dictionary.')
         parser.add_argument(
-            '--preemption_mode',
+            '--preemption-mode',
             type=str,
             default=None,
             help='If \'recompute\', the engine performs preemption by block '
-- 
GitLab


From d571ca010813c5532c646ee74c1a2c9e1e78e12a Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 19 Jun 2024 13:16:04 -0700
Subject: [PATCH 103/376] [ci][distributed] add tests for custom allreduce
 (#5689)

---
 .buildkite/test-pipeline.yaml               | 8 ++++++--
 tests/distributed/test_custom_all_reduce.py | 7 ++++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index c2160fee3..b1602dd94 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -182,7 +182,11 @@ steps:
   - pip install -r requirements-docs.txt
   - SPHINXOPTS=\"-W\" make html
 
-- label: A100 status
+- label: Distributed Tests (A100)
   gpu: a100
   commands: 
-  - nvidia-smi
+  # NOTE: don't test llama model here, it seems hf implementation is buggy
+  # see https://github.com/vllm-project/vllm/pull/5689 for details
+  - pytest -v -s distributed/test_custom_all_reduce.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py
index 3776c1f91..9a39160b8 100644
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -11,7 +11,8 @@ from vllm.distributed.communication_op import (  # noqa
 from vllm.distributed.parallel_state import (get_tensor_model_parallel_group,
                                              get_tp_group, graph_capture)
 
-from ..utils import (init_test_distributed_environment,
+from ..utils import (ensure_model_parallel_initialized,
+                     init_test_distributed_environment,
                      multi_process_tensor_parallel)
 
 random.seed(42)
@@ -27,8 +28,8 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
                                       distributed_init_port)
-
-    group = get_tensor_model_parallel_group()
+    ensure_model_parallel_initialized(tp_size, pp_size)
+    group = get_tensor_model_parallel_group().device_group
 
     # A small all_reduce for warmup.
     # this is needed because device communicators might be created lazily
-- 
GitLab


From 78687504f7eb6d7523bff15b1bca8c9cbb74656a Mon Sep 17 00:00:00 2001
From: zifeitong <zifeitong@gmail.com>
Date: Wed, 19 Jun 2024 13:57:12 -0700
Subject: [PATCH 104/376] [Bugfix] AsyncLLMEngine hangs with asyncio.run
 (#5654)

---
 tests/async_engine/test_async_llm_engine.py |  38 +++-
 tests/spec_decode/e2e/conftest.py           |  43 +----
 tests/utils.py                              |  43 ++++-
 vllm/engine/async_llm_engine.py             |   5 +-
 vllm/engine/async_timeout.py                | 189 ++++++++++++++++++++
 5 files changed, 271 insertions(+), 47 deletions(-)
 create mode 100644 vllm/engine/async_timeout.py

diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py
index 10a464228..52d3394a9 100644
--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -2,8 +2,12 @@ import asyncio
 from dataclasses import dataclass
 
 import pytest
+import torch
 
-from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm import SamplingParams
+from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
+
+from ..utils import wait_for_gpu_memory_to_clear
 
 
 @dataclass
@@ -94,3 +98,35 @@ async def test_new_requests_event():
     assert engine.get_model_config() is not None
     assert engine.get_tokenizer() is not None
     assert engine.get_decoding_config() is not None
+
+
+def test_asyncio_run():
+    wait_for_gpu_memory_to_clear(
+        devices=list(range(torch.cuda.device_count())),
+        threshold_bytes=2 * 2**30,
+        timeout_s=60,
+    )
+
+    engine = AsyncLLMEngine.from_engine_args(
+        AsyncEngineArgs(model="facebook/opt-125m"))
+
+    async def run(prompt: str):
+        sampling_params = SamplingParams(
+            temperature=0,
+            max_tokens=32,
+        )
+
+        async for output in engine.generate(prompt,
+                                            sampling_params,
+                                            request_id=prompt):
+            final_output = output
+        return final_output
+
+    async def generate():
+        return await asyncio.gather(
+            run("test0"),
+            run("test1"),
+        )
+
+    results = asyncio.run(generate())
+    assert len(results) == 2
diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index 86103cf85..60dfe33f2 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -1,5 +1,4 @@
 import asyncio
-import time
 from itertools import cycle
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -7,12 +6,6 @@ import pytest
 import ray
 import torch
 
-from vllm.utils import is_hip
-
-if (not is_hip()):
-    from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
-                        nvmlInit)
-
 from vllm import LLM
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -26,6 +19,7 @@ from vllm.usage.usage_lib import UsageContext
 from vllm.utils import Counter, random_uuid
 
 from ...conftest import cleanup
+from ...utils import wait_for_gpu_memory_to_clear
 
 
 class AsyncLLM:
@@ -291,38 +285,3 @@ def run_greedy_equality_correctness_test(baseline_llm_generator,
         print(f'{i=} {baseline_token_ids=}')
         print(f'{i=}     {spec_token_ids=}')
         assert baseline_token_ids == spec_token_ids
-
-
-def wait_for_gpu_memory_to_clear(devices: List[int],
-                                 threshold_bytes: int,
-                                 timeout_s: float = 120) -> None:
-    # Use nvml instead of pytorch to reduce measurement error from torch cuda
-    # context.
-    nvmlInit()
-    start_time = time.time()
-    while True:
-        output: Dict[int, str] = {}
-        output_raw: Dict[int, float] = {}
-        for device in devices:
-            dev_handle = nvmlDeviceGetHandleByIndex(device)
-            mem_info = nvmlDeviceGetMemoryInfo(dev_handle)
-            gb_used = mem_info.used / 2**30
-            output_raw[device] = gb_used
-            output[device] = f'{gb_used:.02f}'
-
-        print('gpu memory used (GB): ', end='')
-        for k, v in output.items():
-            print(f'{k}={v}; ', end='')
-        print('')
-
-        dur_s = time.time() - start_time
-        if all(v <= (threshold_bytes / 2**30) for v in output_raw.values()):
-            print(f'Done waiting for free GPU memory on devices {devices=} '
-                  f'({threshold_bytes/2**30=}) {dur_s=:.02f}')
-            break
-
-        if dur_s >= timeout_s:
-            raise ValueError(f'Memory of devices {devices=} not free after '
-                             f'{dur_s=:.02f} ({threshold_bytes/2**30=})')
-
-        time.sleep(5)
diff --git a/tests/utils.py b/tests/utils.py
index f2b2d22b1..bc30515c8 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -4,7 +4,7 @@ import sys
 import time
 import warnings
 from contextlib import contextmanager
-from typing import List
+from typing import Dict, List
 
 import openai
 import ray
@@ -13,7 +13,11 @@ import requests
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.entrypoints.openai.cli_args import make_arg_parser
-from vllm.utils import get_open_port
+from vllm.utils import get_open_port, is_hip
+
+if (not is_hip()):
+    from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
+                        nvmlInit)
 
 # Path to root of repository so that utilities can be imported by ray workers
 VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))
@@ -154,3 +158,38 @@ def error_on_warning():
         warnings.simplefilter("error")
 
         yield
+
+
+def wait_for_gpu_memory_to_clear(devices: List[int],
+                                 threshold_bytes: int,
+                                 timeout_s: float = 120) -> None:
+    # Use nvml instead of pytorch to reduce measurement error from torch cuda
+    # context.
+    nvmlInit()
+    start_time = time.time()
+    while True:
+        output: Dict[int, str] = {}
+        output_raw: Dict[int, float] = {}
+        for device in devices:
+            dev_handle = nvmlDeviceGetHandleByIndex(device)
+            mem_info = nvmlDeviceGetMemoryInfo(dev_handle)
+            gb_used = mem_info.used / 2**30
+            output_raw[device] = gb_used
+            output[device] = f'{gb_used:.02f}'
+
+        print('gpu memory used (GB): ', end='')
+        for k, v in output.items():
+            print(f'{k}={v}; ', end='')
+        print('')
+
+        dur_s = time.time() - start_time
+        if all(v <= (threshold_bytes / 2**30) for v in output_raw.values()):
+            print(f'Done waiting for free GPU memory on devices {devices=} '
+                  f'({threshold_bytes/2**30=}) {dur_s=:.02f}')
+            break
+
+        if dur_s >= timeout_s:
+            raise ValueError(f'Memory of devices {devices=} not free after '
+                             f'{dur_s=:.02f} ({threshold_bytes/2**30=})')
+
+        time.sleep(5)
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 86720e4fb..df25eb111 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -10,6 +10,7 @@ import vllm.envs as envs
 from vllm.config import DecodingConfig, ModelConfig
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_timeout import asyncio_timeout
 from vllm.engine.llm_engine import LLMEngine
 from vllm.executor.ray_utils import initialize_ray_cluster, ray
 from vllm.inputs import LLMInputs, PromptInputs
@@ -545,8 +546,8 @@ class AsyncLLMEngine:
             # Abort if iteration takes too long due to unrecoverable errors
             # (eg. NCCL timeouts).
             try:
-                has_requests_in_progress = await asyncio.wait_for(
-                    self.engine_step(), ENGINE_ITERATION_TIMEOUT_S)
+                async with asyncio_timeout(ENGINE_ITERATION_TIMEOUT_S):
+                    has_requests_in_progress = await self.engine_step()
             except asyncio.TimeoutError as exc:
                 logger.error(
                     "Engine iteration timed out. This should never happen!")
diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py
new file mode 100644
index 000000000..4b1842625
--- /dev/null
+++ b/vllm/engine/async_timeout.py
@@ -0,0 +1,189 @@
+# Workaround for https://github.com/python/cpython/issues/86296
+#
+# From https://github.com/aio-libs/async-timeout/blob/master/async_timeout/__init__.py
+# Licensed under the Apache License (Apache-2.0)
+
+import asyncio
+import enum
+import sys
+import warnings
+from types import TracebackType
+from typing import Any, Optional, Type
+
+if sys.version_info[:2] >= (3, 11):
+    from asyncio import timeout as asyncio_timeout
+else:
+
+    def asyncio_timeout(delay: Optional[float]) -> "Timeout":
+        """timeout context manager.
+        Useful in cases when you want to apply timeout logic around block
+        of code or in cases when asyncio.wait_for is not suitable. For example:
+        >>> async with timeout(0.001):
+        ...     async with aiohttp.get('https://github.com') as r:
+        ...         await r.text()
+        delay - value in seconds or None to disable timeout logic
+        """
+        loop = asyncio.get_running_loop()
+        deadline = loop.time() + delay if delay is not None else None
+        return Timeout(deadline, loop)
+
+    class _State(enum.Enum):
+        INIT = "INIT"
+        ENTER = "ENTER"
+        TIMEOUT = "TIMEOUT"
+        EXIT = "EXIT"
+
+    class Timeout:
+        # Internal class, please don't instantiate it directly
+        # Use timeout() and timeout_at() public factories instead.
+        #
+        # Implementation note: `async with timeout()` is preferred
+        # over `with timeout()`.
+        # While technically the Timeout class implementation
+        # doesn't need to be async at all,
+        # the `async with` statement explicitly points that
+        # the context manager should be used from async function context.
+        #
+        # This design allows to avoid many silly misusages.
+        #
+        # TimeoutError is raised immediately when scheduled
+        # if the deadline is passed.
+        # The purpose is to time out as soon as possible
+        # without waiting for the next await expression.
+
+        __slots__ = ("_deadline", "_loop", "_state", "_timeout_handler")
+
+        def __init__(self, deadline: Optional[float],
+                     loop: asyncio.AbstractEventLoop) -> None:
+            self._loop = loop
+            self._state = _State.INIT
+
+            self._timeout_handler = None  # type: Optional[asyncio.Handle]
+            if deadline is None:
+                self._deadline = None  # type: Optional[float]
+            else:
+                self.update(deadline)
+
+        def __enter__(self) -> "Timeout":
+            warnings.warn(
+                "with timeout() is deprecated, use async with timeout()",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            self._do_enter()
+            return self
+
+        def __exit__(
+            self,
+            exc_type: Optional[Type[BaseException]],
+            exc_val: Optional[BaseException],
+            exc_tb: Optional[TracebackType],
+        ) -> Optional[bool]:
+            self._do_exit(exc_type)
+            return None
+
+        async def __aenter__(self) -> "Timeout":
+            self._do_enter()
+            return self
+
+        async def __aexit__(
+            self,
+            exc_type: Optional[Type[BaseException]],
+            exc_val: Optional[BaseException],
+            exc_tb: Optional[TracebackType],
+        ) -> Optional[bool]:
+            self._do_exit(exc_type)
+            return None
+
+        @property
+        def expired(self) -> bool:
+            """Is timeout expired during execution?"""
+            return self._state == _State.TIMEOUT
+
+        @property
+        def deadline(self) -> Optional[float]:
+            return self._deadline
+
+        def reject(self) -> None:
+            """Reject scheduled timeout if any."""
+            # cancel is maybe better name but
+            # task.cancel() raises CancelledError in asyncio world.
+            if self._state not in (_State.INIT, _State.ENTER):
+                raise RuntimeError(f"invalid state {self._state.value}")
+            self._reject()
+
+        def _reject(self) -> None:
+            if self._timeout_handler is not None:
+                self._timeout_handler.cancel()
+                self._timeout_handler = None
+
+        def shift(self, delay: float) -> None:
+            """Advance timeout on delay seconds.
+            The delay can be negative.
+            Raise RuntimeError if shift is called when deadline is not scheduled
+            """
+            deadline = self._deadline
+            if deadline is None:
+                raise RuntimeError(
+                    "cannot shift timeout if deadline is not scheduled")
+            self.update(deadline + delay)
+
+        def update(self, deadline: float) -> None:
+            """Set deadline to absolute value.
+            deadline argument points on the time in the same clock system
+            as loop.time().
+            If new deadline is in the past the timeout is raised immediately.
+            Please note: it is not POSIX time but a time with
+            undefined starting base, e.g. the time of the system power on.
+            """
+            if self._state == _State.EXIT:
+                raise RuntimeError(
+                    "cannot reschedule after exit from context manager")
+            if self._state == _State.TIMEOUT:
+                raise RuntimeError("cannot reschedule expired timeout")
+            if self._timeout_handler is not None:
+                self._timeout_handler.cancel()
+            self._deadline = deadline
+            if self._state != _State.INIT:
+                self._reschedule()
+
+        def _reschedule(self) -> None:
+            assert self._state == _State.ENTER
+            deadline = self._deadline
+            if deadline is None:
+                return
+
+            now = self._loop.time()
+            if self._timeout_handler is not None:
+                self._timeout_handler.cancel()
+
+            task = asyncio.current_task()
+            if deadline <= now:
+                self._timeout_handler = self._loop.call_soon(
+                    self._on_timeout, task)
+            else:
+                self._timeout_handler = self._loop.call_at(
+                    deadline, self._on_timeout, task)
+
+        def _do_enter(self) -> None:
+            if self._state != _State.INIT:
+                raise RuntimeError(f"invalid state {self._state.value}")
+            self._state = _State.ENTER
+            self._reschedule()
+
+        def _do_exit(self, exc_type: Optional[Type[BaseException]]) -> None:
+            if exc_type is asyncio.CancelledError and \
+                    self._state == _State.TIMEOUT:
+                self._timeout_handler = None
+                raise asyncio.TimeoutError
+            # timeout has not expired
+            self._state = _State.EXIT
+            self._reject()
+            return None
+
+        def _on_timeout(self, task: "Optional[asyncio.Task[Any]]") -> None:
+            if task:
+                task.cancel()
+            self._state = _State.TIMEOUT
+            # drop the reference early
+            self._timeout_handler = None
-- 
GitLab


From e83db9e7e3d776cd9b059a49024f3950ef579b41 Mon Sep 17 00:00:00 2001
From: Rafael Vasquez <rafvasq21@gmail.com>
Date: Wed, 19 Jun 2024 18:01:45 -0400
Subject: [PATCH 105/376] [Doc] Update docker references (#5614)

Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com>
---
 docs/source/dev/dockerfile/dockerfile.rst     | 20 +++++++++----------
 docs/source/serving/deploying_with_docker.rst |  7 +++----
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/docs/source/dev/dockerfile/dockerfile.rst b/docs/source/dev/dockerfile/dockerfile.rst
index a07463392..40ba818eb 100644
--- a/docs/source/dev/dockerfile/dockerfile.rst
+++ b/docs/source/dev/dockerfile/dockerfile.rst
@@ -2,19 +2,19 @@ Dockerfile
 ====================
 
 See `here <https://github.com/vllm-project/vllm/blob/main/Dockerfile>`_ for the main Dockerfile to construct 
-the image for running an OpenAI compatible server with vLLM.
+the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found `here <https://docs.vllm.ai/en/stable/serving/deploying_with_docker.html>`_.
 
--  Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
+Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
 
-   - All build stages
-   - The default build target (highlighted in grey)
-   - External images (with dashed borders)
+- All build stages
+- The default build target (highlighted in grey)
+- External images (with dashed borders)
    
-   The edges of the build graph represent:
-   
-   - FROM ... dependencies (with a solid line and a full arrow head)
-   - COPY --from=... dependencies (with a dashed line and an empty arrow head)
-   - RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head)
+The edges of the build graph represent:
+
+- FROM ... dependencies (with a solid line and a full arrow head)
+- COPY --from=... dependencies (with a dashed line and an empty arrow head)
+- RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head)
 
    .. figure:: ../../assets/dev/dockerfile-stages-dependency.png
       :alt: query
diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
index fa82bc8e3..14d94b09e 100644
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@@ -3,9 +3,8 @@
 Deploying with Docker
 ============================
 
-vLLM offers official docker image for deployment.
-The image can be used to run OpenAI compatible server.
-The image is available on Docker Hub as `vllm/vllm-openai <https://hub.docker.com/r/vllm/vllm-openai/tags>`_.
+vLLM offers an official Docker image for deployment.
+The image can be used to run OpenAI compatible server and is available on Docker Hub as `vllm/vllm-openai <https://hub.docker.com/r/vllm/vllm-openai/tags>`_.
 
 .. code-block:: console
 
@@ -25,7 +24,7 @@ The image is available on Docker Hub as `vllm/vllm-openai <https://hub.docker.co
         memory to share data between processes under the hood, particularly for tensor parallel inference.
 
 
-You can build and run vLLM from source via the provided dockerfile. To build vLLM:
+You can build and run vLLM from source via the provided `Dockerfile <https://github.com/vllm-project/vllm/blob/main/Dockerfile>`_. To build vLLM:
 
 .. code-block:: console
 
-- 
GitLab


From 4a30d7e3ccae6e977d728e2157aaa11ac0fed549 Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Wed, 19 Jun 2024 18:06:44 -0400
Subject: [PATCH 106/376] [Misc] Add per channel support for static activation
 quantization; update w8a8 schemes to share base classes (#5650)

---
 tests/quantization/test_compressed_tensors.py | 14 ++-
 .../compressed_tensors/compressed_tensors.py  | 10 ++-
 .../schemes/compressed_tensors_w8a8.py        | 84 +++++++++++++++++
 .../compressed_tensors_w8a8_dynamictoken.py   | 89 +++----------------
 .../compressed_tensors_w8a8_statictensor.py   | 60 +++----------
 5 files changed, 121 insertions(+), 136 deletions(-)
 create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py

diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index b78081155..aaa366335 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -13,8 +13,12 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
     CompressedTensorsW8A8StaticTensor)
 
 
-def test_compressed_tensors_w8a8_static_setup(vllm_runner):
-    model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
+@pytest.mark.parametrize("model_args", [
+    ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", "tensor"),
+    ("nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", "channel"),
+])
+def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
+    model_path, strategy = model_args
     with vllm_runner(model_path, enforce_eager=True) as llm:
         model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
         layer = model.model.layers[0]
@@ -33,12 +37,14 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner):
 
         assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor)
 
+        assert qkv_proj.scheme.strategy == strategy
         assert qkv_proj.weight.dtype is torch.int8
         assert o_proj.weight.dtype is torch.int8
         assert gate_up_proj.weight.dtype is torch.int8
 
-        assert qkv_proj.weight_scale.shard_splitter is not None
-        assert qkv_proj.weight_scale.logical_widths is not None
+        if qkv_proj.scheme.strategy == "tensor":
+            assert qkv_proj.weight_scale.shard_splitter is not None
+            assert qkv_proj.weight_scale.logical_widths is not None
         assert qkv_proj.input_scale.dtype is torch.float32
 
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 347a052a6..44dd024af 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -85,8 +85,11 @@ class CompressedTensorsConfig(QuantizationConfig):
     def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
                                input_quant: BaseModel) -> bool:
         is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
-        is_tensor = (weight_quant.strategy == input_quant.strategy ==
-                     QuantizationStrategy.TENSOR.value)
+        weight_strategy = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR.value
+            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value)
+        is_tensor = (weight_strategy and input_quant.strategy
+                     == QuantizationStrategy.TENSOR.value)
         is_symmetric = weight_quant.symmetric and input_quant.symmetric
         is_static = not weight_quant.dynamic and not input_quant.dynamic
 
@@ -131,7 +134,8 @@ class CompressedTensorsConfig(QuantizationConfig):
 
         if self.quant_format == CompressionFormat.int_quantized.value:
             if self._is_static_tensor_w8a8(weight_quant, input_quant):
-                return CompressedTensorsW8A8StaticTensor()
+                return CompressedTensorsW8A8StaticTensor(
+                    strategy=weight_quant.strategy)
 
             if self._is_dynamic_token_w8a8(weight_quant, input_quant):
                 return CompressedTensorsW8A8DynamicToken(
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py
new file mode 100644
index 000000000..efed79ec7
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py
@@ -0,0 +1,84 @@
+from typing import Callable, List, Tuple, Union
+
+import torch
+from torch.nn import Parameter
+
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    QuantizationStrategy)
+from vllm.model_executor.utils import set_weight_attrs
+
+
+class CompressedTensorsW8A8(CompressedTensorsScheme):
+
+    def __init__(self, strategy: str):
+        self.strategy = strategy
+
+    def _shard_id_as_int(self, shard_id: Union[str, int]) -> int:
+        if isinstance(shard_id, int):
+            return shard_id
+
+        assert isinstance(shard_id, str)
+        qkv_idxs = {"q": 0, "k": 1, "v": 2}
+        assert shard_id in qkv_idxs
+        return qkv_idxs[shard_id]
+
+    def scales_shard_splitter(
+            self, param: torch.Tensor, loaded_weight: torch.Tensor,
+            shard_id: Union[str, int],
+            logical_widths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        shard_id = self._shard_id_as_int(shard_id)
+        offset = sum(logical_widths[:shard_id])
+        size = logical_widths[shard_id]
+        # update loaded weight with copies for broadcast.
+        loaded_weight = loaded_weight.repeat(size)
+        return param[offset:offset + size], loaded_weight
+
+    def create_weights(self, layer: torch.nn.Module,
+                       output_partition_sizes: List[int],
+                       input_size_per_partition: int,
+                       params_dtype: torch.dtype, weight_loader: Callable,
+                       **kwargs):
+
+        is_tensor_partitioned = len(output_partition_sizes) != 1
+        weight_scale_dim = sum(output_partition_sizes) if (
+            is_tensor_partitioned
+            or self.strategy == QuantizationStrategy.CHANNEL) else 1
+
+        shape: Union[Tuple[int], Tuple[int, int]] = (weight_scale_dim, )
+        if self.strategy == QuantizationStrategy.CHANNEL:
+            shape = (weight_scale_dim, 1)
+
+        weight_scale = Parameter(torch.empty(*shape, dtype=torch.float32),
+                                 requires_grad=False)
+
+        layer.register_parameter("weight_scale", weight_scale)
+        set_weight_attrs(weight_scale, {"weight_loader": weight_loader})
+
+        weight = Parameter(torch.empty(sum(output_partition_sizes),
+                                       input_size_per_partition,
+                                       dtype=torch.int8),
+                           requires_grad=False)
+
+        layer.register_parameter("weight", weight)
+        set_weight_attrs(
+            weight, {
+                "input_dim": 1,
+                "output_dim": 0,
+                "weight_loader": weight_loader,
+                "logical_widths": output_partition_sizes
+            })
+
+        # Don't need a shard_splitter for channel-wise quantization
+        # Use the default loading method
+        if self.strategy == QuantizationStrategy.CHANNEL:
+            set_weight_attrs(weight_scale, {
+                "output_dim": 0,
+            })
+        else:
+            set_weight_attrs(
+                weight_scale, {
+                    "logical_widths": output_partition_sizes,
+                    "shard_splitter": self.scales_shard_splitter,
+                })
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
index 37610c9c2..5fc05b8e6 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
@@ -1,42 +1,15 @@
-from typing import Callable, List, Tuple, Union
+from typing import Callable, List
 
 import torch
-from torch.nn import Parameter
 
 from vllm import _custom_ops as custom_ops
-from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    QuantizationStrategy)
-from vllm.model_executor.utils import set_weight_attrs
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_w8a8 import (  # noqa: E501
+    CompressedTensorsW8A8)
 
 __all__ = ["CompressedTensorsW8A8DynamicToken"]
 
 
-class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme):
-
-    def __init__(self, strategy: str):
-        self.strategy = strategy
-
-    def _shard_id_as_int(self, shard_id: Union[str, int]) -> int:
-        if isinstance(shard_id, int):
-            return shard_id
-
-        assert isinstance(shard_id, str)
-        qkv_idxs = {"q": 0, "k": 1, "v": 2}
-        assert shard_id in qkv_idxs
-        return qkv_idxs[shard_id]
-
-    def scales_shard_splitter(
-            self, param: torch.Tensor, loaded_weight: torch.Tensor,
-            shard_id: Union[str, int],
-            logical_widths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        shard_id = self._shard_id_as_int(shard_id)
-        offset = sum(logical_widths[:shard_id])
-        size = logical_widths[shard_id]
-        # update loaded weight with copies for broadcast.
-        loaded_weight = loaded_weight.repeat(size)
-        return param[offset:offset + size], loaded_weight
+class CompressedTensorsW8A8DynamicToken(CompressedTensorsW8A8):
 
     def create_weights(self, layer: torch.nn.Module,
                        output_partition_sizes: List[int],
@@ -44,54 +17,12 @@ class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme):
                        params_dtype: torch.dtype, weight_loader: Callable,
                        **kwargs):
 
-        # When the scales have a single value, it is required that they be
-        # on the CPU for performance and CUDA Graphs compatibility. Please
-        # refer to the comment in
-        # CompressedTensorsW8A8StaticTensor::create_weights for further
-        # information.
-        is_tensor_partitioned = len(output_partition_sizes) != 1
-        # when doing channel-wise quantization, number of scales
-        # is equal to output_dim
-        weight_scale_dim = sum(output_partition_sizes) if (
-            is_tensor_partitioned
-            or self.strategy == QuantizationStrategy.CHANNEL) else 1
-
-        shape: Union[Tuple[int], Tuple[int, int]] = (weight_scale_dim, )
-        if self.strategy == QuantizationStrategy.CHANNEL:
-            shape = (weight_scale_dim, 1)
-
-        weight_scale = Parameter(torch.empty(*shape, dtype=torch.float32),
-                                 requires_grad=False)
-
-        weight = Parameter(torch.empty(sum(output_partition_sizes),
-                                       input_size_per_partition,
-                                       dtype=torch.int8),
-                           requires_grad=False)
-
-        layer.register_parameter("weight", weight)
-        set_weight_attrs(
-            weight, {
-                "input_dim": 1,
-                "output_dim": 0,
-                "weight_loader": weight_loader,
-                "logical_widths": output_partition_sizes
-            })
-
-        layer.register_parameter("weight_scale", weight_scale)
-        set_weight_attrs(weight_scale, {"weight_loader": weight_loader})
-
-        # Don't need a shard_splitter for channel-wise quantization
-        # Use the default loading method
-        if self.strategy == QuantizationStrategy.CHANNEL:
-            set_weight_attrs(weight_scale, {
-                "output_dim": 0,
-            })
-        else:
-            set_weight_attrs(
-                weight_scale, {
-                    "logical_widths": output_partition_sizes,
-                    "shard_splitter": self.scales_shard_splitter,
-                })
+        super().create_weights(
+            layer=layer,
+            output_partition_sizes=output_partition_sizes,
+            input_size_per_partition=input_size_per_partition,
+            params_dtype=params_dtype,
+            weight_loader=weight_loader)
 
     def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
         weight = layer.weight
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
index 414e17a06..79f5358a3 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
@@ -1,37 +1,17 @@
-from typing import Callable, List, Tuple, Union
+from typing import Callable, List
 
 import torch
 from torch.nn import Parameter
 
 from vllm import _custom_ops as custom_ops
-from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_w8a8 import (  # noqa: E501
+    CompressedTensorsW8A8)
 from vllm.model_executor.utils import set_weight_attrs
 
 __all__ = ["CompressedTensorsW8A8StaticTensor"]
 
 
-class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme):
-
-    def _shard_id_as_int(self, shard_id: Union[str, int]) -> int:
-        if isinstance(shard_id, int):
-            return shard_id
-
-        assert isinstance(shard_id, str)
-        qkv_idxs = {"q": 0, "k": 1, "v": 2}
-        assert shard_id in qkv_idxs
-        return qkv_idxs[shard_id]
-
-    def scales_shard_splitter(
-            self, param: torch.Tensor, loaded_weight: torch.Tensor,
-            shard_id: Union[str, int],
-            logical_widths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        shard_id = self._shard_id_as_int(shard_id)
-        offset = sum(logical_widths[:shard_id])
-        size = logical_widths[shard_id]
-        # update loaded weight with copies for broadcast.
-        loaded_weight = loaded_weight.repeat(size)
-        return param[offset:offset + size], loaded_weight
+class CompressedTensorsW8A8StaticTensor(CompressedTensorsW8A8):
 
     def create_weights(self, layer: torch.nn.Module,
                        output_partition_sizes: List[int],
@@ -39,41 +19,21 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme):
                        params_dtype: torch.dtype, weight_loader: Callable,
                        **kwargs):
 
-        is_tensor_partitioned = len(output_partition_sizes) != 1
-        weight_scale_dim = sum(
-            output_partition_sizes) if is_tensor_partitioned else 1
+        super().create_weights(
+            layer=layer,
+            output_partition_sizes=output_partition_sizes,
+            input_size_per_partition=input_size_per_partition,
+            params_dtype=params_dtype,
+            weight_loader=weight_loader)
 
         input_scale = Parameter(torch.empty(1, dtype=torch.float32),
                                 requires_grad=False)
 
-        weight_scale = Parameter(torch.empty(weight_scale_dim,
-                                             dtype=torch.float32),
-                                 requires_grad=False)
-
-        weight = Parameter(torch.empty(sum(output_partition_sizes),
-                                       input_size_per_partition,
-                                       dtype=torch.int8),
-                           requires_grad=False)
-
-        layer.register_parameter("weight", weight)
-        set_weight_attrs(weight, {
-            "weight_loader": weight_loader,
-            "input_dim": 1,
-            "output_dim": 0,
-        })
         layer.register_parameter("input_scale", input_scale)
         set_weight_attrs(input_scale, {
             "weight_loader": weight_loader,
             "ignore_warning": True,
         })
-        layer.register_parameter("weight_scale", weight_scale)
-        set_weight_attrs(
-            weight_scale, {
-                "weight_loader": weight_loader,
-                "shard_splitter": self.scales_shard_splitter,
-                "logical_widths": output_partition_sizes,
-                "ignore_warning": True,
-            })
 
     def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
         weight = layer.weight
-- 
GitLab


From 949e49a6857080e36ecd62f6e193754290c7c43c Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Wed, 19 Jun 2024 16:30:03 -0700
Subject: [PATCH 107/376] [ci] Limit num gpus if specified for A100 (#5694)

Signed-off-by: kevin <kevin@anyscale.com>
---
 .buildkite/test-pipeline.yaml   | 1 +
 .buildkite/test-template-aws.j2 | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index b1602dd94..95cd5b198 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -184,6 +184,7 @@ steps:
 
 - label: Distributed Tests (A100)
   gpu: a100
+  num_gpus: 4
   commands: 
   # NOTE: don't test llama model here, it seems hf implementation is buggy
   # see https://github.com/vllm-project/vllm/pull/5689 for details
diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2
index 08146bf44..fb34b787e 100644
--- a/.buildkite/test-template-aws.j2
+++ b/.buildkite/test-template-aws.j2
@@ -75,7 +75,7 @@ steps:
             - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
             resources:
               limits:
-                nvidia.com/gpu: 8
+                nvidia.com/gpu: {{ step.num_gpus or 1 }}
             volumeMounts:
             - name: devshm
               mountPath: /dev/shm
-- 
GitLab


From 3730a1c832bca5ca8128aec3c7659304895edf2e Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 20 Jun 2024 10:09:21 +0800
Subject: [PATCH 108/376] [Misc] Improve conftest (#5681)

---
 tests/conftest.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index f37c9883f..5bbfd87f0 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -365,7 +365,7 @@ class HfRunner:
         cleanup()
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def hf_runner():
     return HfRunner
 
@@ -385,6 +385,7 @@ class VllmRunner:
         block_size: int = 16,
         enable_chunked_prefill: bool = False,
         swap_space: int = 4,
+        enforce_eager: bool = False,
         **kwargs,
     ) -> None:
         self.model = LLM(
@@ -393,6 +394,7 @@ class VllmRunner:
             trust_remote_code=True,
             dtype=dtype,
             swap_space=swap_space,
+            enforce_eager=enforce_eager,
             disable_log_stats=disable_log_stats,
             tensor_parallel_size=tensor_parallel_size,
             max_model_len=max_model_len,
-- 
GitLab


From 1b2eaac3165dc78d4ef51231722735ca9cf37304 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Wed, 19 Jun 2024 23:10:47 -0700
Subject: [PATCH 109/376] [Bugfix][Doc] FIx Duplicate Explicit Target Name
 Errors (#5703)

---
 docs/source/dev/dockerfile/dockerfile.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/dev/dockerfile/dockerfile.rst b/docs/source/dev/dockerfile/dockerfile.rst
index 40ba818eb..9c17c27aa 100644
--- a/docs/source/dev/dockerfile/dockerfile.rst
+++ b/docs/source/dev/dockerfile/dockerfile.rst
@@ -1,8 +1,8 @@
 Dockerfile
 ====================
 
-See `here <https://github.com/vllm-project/vllm/blob/main/Dockerfile>`_ for the main Dockerfile to construct 
-the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found `here <https://docs.vllm.ai/en/stable/serving/deploying_with_docker.html>`_.
+See `here <https://github.com/vllm-project/vllm/blob/main/Dockerfile>`__ for the main Dockerfile to construct 
+the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found `here <https://docs.vllm.ai/en/stable/serving/deploying_with_docker.html>`__.
 
 Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
 
-- 
GitLab


From 111af1fa2c4fdb2d83b466935a327b1a5009874a Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Date: Thu, 20 Jun 2024 12:07:08 +0530
Subject: [PATCH 110/376] [Kernel] Update Cutlass int8 kernel configs for SM90
 (#5514)

Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
---
 .../cutlass_w8a8/scaled_mm_c3x.cu             | 165 +++++++++++++++---
 1 file changed, 143 insertions(+), 22 deletions(-)

diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
index f1a2b73ff..8f2aa9425 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
@@ -234,15 +234,15 @@ void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
 }
 
 template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue, int32_t M>
-struct sm90_fp8_config {
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_default {
+  // M in (128, inf)
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule =
       cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
   using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
   using TileShape = Shape<_128, _128, _128>;
   using ClusterShape = Shape<_2, _1, _1>;
-
   using Cutlass3xGemm =
       cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
                       KernelSchedule, EpilogueSchedule>;
@@ -250,14 +250,14 @@ struct sm90_fp8_config {
 
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue>
-struct sm90_fp8_config<InType, OutType, Epilogue, 128> {
+struct sm90_fp8_config_M128 {
+  // M in (64, 128]
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule =
       cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
   using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
   using TileShape = Shape<_64, _128, _128>;
   using ClusterShape = Shape<_2, _1, _1>;
-
   using Cutlass3xGemm =
       cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
                       KernelSchedule, EpilogueSchedule>;
@@ -265,7 +265,8 @@ struct sm90_fp8_config<InType, OutType, Epilogue, 128> {
 
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue>
-struct sm90_fp8_config<InType, OutType, Epilogue, 64> {
+struct sm90_fp8_config_M64 {
+  // M in [1, 64]
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule =
       cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
@@ -278,6 +279,78 @@ struct sm90_fp8_config<InType, OutType, Epilogue, 64> {
                       KernelSchedule, EpilogueSchedule>;
 };
 
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_int8_config_default {
+  // For M > 128 and any N
+  static_assert(std::is_same<InType, int8_t>());
+  using KernelSchedule =
+      typename cutlass::gemm::KernelTmaWarpSpecializedPingpong;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_128, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_int8_config_M128 {
+  // For M in (64, 128] and any N
+  static_assert(std::is_same<InType, int8_t>());
+  using KernelSchedule =
+      typename cutlass::gemm::KernelTmaWarpSpecializedPingpong;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_int8_config_M64 {
+  // For M in (32, 64] and any N
+  static_assert(std::is_same<InType, int8_t>());
+  using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_1, _1, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_int8_config_M32_NBig {
+  // For M in [1, 32] and N >= 8192
+  static_assert(std::is_same<InType, int8_t>());
+  using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _128, _256>;
+  using ClusterShape = Shape<_1, _4, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_int8_config_M32_NSmall {
+  // For M in [1, 32] and N < 8192
+  static_assert(std::is_same<InType, int8_t>());
+  using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_1, _8, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+
 }  // namespace
 
 template <typename InType, typename OutType,
@@ -291,11 +364,12 @@ void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a,
   TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
 
   using Cutlass3xGemmDefault =
-      typename sm90_fp8_config<InType, OutType, Epilogue, 0>::Cutlass3xGemm;
+      typename sm90_fp8_config_default<InType, OutType,
+                                       Epilogue>::Cutlass3xGemm;
   using Cutlass3xGemmM64 =
-      typename sm90_fp8_config<InType, OutType, Epilogue, 64>::Cutlass3xGemm;
+      typename sm90_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
   using Cutlass3xGemmM128 =
-      typename sm90_fp8_config<InType, OutType, Epilogue, 128>::Cutlass3xGemm;
+      typename sm90_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
 
   uint32_t const m = a.size(0);
   uint32_t const mp2 =
@@ -316,6 +390,61 @@ void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a,
   }
 }
 
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out, torch::Tensor const& a,
+                                     torch::Tensor const& b,
+                                     EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, int8_t>());
+  TORCH_CHECK(a.dtype() == torch::kInt8);
+  TORCH_CHECK(b.dtype() == torch::kInt8);
+
+  using Cutlass3xGemmDefault =
+      typename sm90_int8_config_default<InType, OutType,
+                                        Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM128 =
+      typename sm90_int8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM64 =
+      typename sm90_int8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM32NBig =
+      typename sm90_int8_config_M32_NBig<InType, OutType,
+                                         Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM32NSmall =
+      typename sm90_int8_config_M32_NSmall<InType, OutType,
+                                           Epilogue>::Cutlass3xGemm;
+
+  uint32_t const n = out.size(1);
+  bool const is_small_n = n < 8192;
+
+  uint32_t const m = a.size(0);
+  uint32_t const mp2 =
+      std::max(static_cast<uint32_t>(32), next_pow_2(m));  // next power of 2
+
+  if (mp2 <= 32) {
+    // m in [1, 32]
+    if (is_small_n) {
+      return cutlass_gemm_caller<Cutlass3xGemmM32NSmall>(
+          out, a, b, std::forward<EpilogueArgs>(args)...);
+    } else {
+      return cutlass_gemm_caller<Cutlass3xGemmM32NBig>(
+          out, a, b, std::forward<EpilogueArgs>(args)...);
+    }
+  } else if (mp2 <= 64) {
+    // m in (32, 64]
+    return cutlass_gemm_caller<Cutlass3xGemmM64>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 128) {
+    // m in (64, 128]
+    return cutlass_gemm_caller<Cutlass3xGemmM128>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else {
+    // m in (128, inf)
+    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  }
+}
+
 void cutlass_scaled_mm_sm90(torch::Tensor& out, torch::Tensor const& a,
                             torch::Tensor const& b,
                             torch::Tensor const& a_scales,
@@ -326,22 +455,14 @@ void cutlass_scaled_mm_sm90(torch::Tensor& out, torch::Tensor const& a,
   if (a.dtype() == torch::kInt8) {
     TORCH_CHECK(b.dtype() == torch::kInt8);
 
-    using TileShape = Shape<_128, _128, _128>;
-    using ClusterShape = Shape<_1, _2, _1>;
-    using KernelSchedule =
-        typename cutlass::gemm::KernelTmaWarpSpecializedPingpong;
-    using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-
     if (out.dtype() == torch::kBFloat16) {
-      return cutlass_gemm_caller<cutlass_3x_gemm<
-          int8_t, cutlass::bfloat16_t, ScaledEpilogue, TileShape, ClusterShape,
-          KernelSchedule, EpilogueSchedule>>(out, a, b, a_scales, b_scales);
+      return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::bfloat16_t,
+                                             ScaledEpilogue>(
+          out, a, b, a_scales, b_scales);
     } else {
       TORCH_CHECK(out.dtype() == torch::kFloat16);
-
-      return cutlass_gemm_caller<
-          cutlass_3x_gemm<int8_t, cutlass::half_t, ScaledEpilogue, TileShape,
-                          ClusterShape, KernelSchedule, EpilogueSchedule>>(
+      return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::half_t,
+                                             ScaledEpilogue>(
           out, a, b, a_scales, b_scales);
     }
   } else {
-- 
GitLab


From ad137cd1112ab9b17ac36fc123fc7806a1d7473d Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Thu, 20 Jun 2024 04:52:09 -0700
Subject: [PATCH 111/376] [Model] Port over CLIPVisionModel for VLMs (#5591)

---
 csrc/activation_kernels.cu               |  12 ++
 csrc/ops.h                               |   2 +
 csrc/torch_bindings.cpp                  |   4 +
 vllm/_custom_ops.py                      |   4 +
 vllm/model_executor/layers/activation.py |  16 ++
 vllm/model_executor/models/clip.py       | 203 +++++++++++++++++++++++
 vllm/model_executor/models/llava.py      |  17 +-
 vllm/model_executor/models/llava_next.py |  19 ++-
 vllm/model_executor/models/phi3v.py      |  13 +-
 9 files changed, 269 insertions(+), 21 deletions(-)
 create mode 100644 vllm/model_executor/models/clip.py

diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu
index 86ac2e75e..5ed1dc3b8 100644
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -135,6 +135,12 @@ __device__ __forceinline__ T gelu_fast_kernel(const T& x) {
   return ((T)0.5) * x * (((T)1.0) + t);
 }
 
+template <typename T>
+__device__ __forceinline__ T gelu_quick_kernel(const T& x) {
+  // x * sigmoid(1.702 * x)
+  return (T)(((float)x) / (1.0f + expf(-1.702f * (float)x)));
+}
+
 }  // namespace vllm
 
 void gelu_new(torch::Tensor& out,    // [..., d]
@@ -148,3 +154,9 @@ void gelu_fast(torch::Tensor& out,    // [..., d]
 {
   LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
 }
+
+void gelu_quick(torch::Tensor& out,    // [..., d]
+                torch::Tensor& input)  // [..., d]
+{
+  LAUNCH_ACTIVATION_KERNEL(vllm::gelu_quick_kernel);
+}
diff --git a/csrc/ops.h b/csrc/ops.h
index 9e2e977fa..ba92cc537 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -49,6 +49,8 @@ void gelu_new(torch::Tensor& out, torch::Tensor& input);
 
 void gelu_fast(torch::Tensor& out, torch::Tensor& input);
 
+void gelu_quick(torch::Tensor& out, torch::Tensor& input);
+
 #ifndef USE_ROCM
 torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
                         const torch::Tensor& codebooks,
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 867bf4389..953f2eb4d 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -68,6 +68,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def("gelu_fast(Tensor! out, Tensor input) -> ()");
   ops.impl("gelu_fast", torch::kCUDA, &gelu_fast);
 
+  // Quick GELU implementation.
+  ops.def("gelu_quick(Tensor! out, Tensor input) -> ()");
+  ops.impl("gelu_quick", torch::kCUDA, &gelu_quick);
+
   // Layernorm
   // Apply Root Mean Square (RMS) Normalization to the input tensor.
   ops.def(
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index ab2a67950..a053a3aa2 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -66,6 +66,10 @@ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
     torch.ops._C.gelu_new(out, x)
 
 
+def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
+    torch.ops._C.gelu_quick(out, x)
+
+
 # page attention ops
 def paged_attention_v1(
     out: torch.Tensor,
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index eb0606948..80cad15b4 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -141,6 +141,21 @@ class FastGELU(CustomOp):
         return out
 
 
+class QuickGELU(CustomOp):
+
+    # https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L90
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        """PyTorch-native implementation equivalent to forward()."""
+        return x * torch.sigmoid(1.702 * x)
+
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm import _custom_ops as ops
+
+        out = torch.empty_like(x)
+        ops.gelu_quick(out, x)
+        return out
+
+
 class ScaledActivation(nn.Module):
     """An activation function with post-scale parameters.
 
@@ -189,6 +204,7 @@ _ACTIVATION_REGISTRY = {
     "gelu_new": NewGELU(),
     "gelu_pytorch_tanh": nn.GELU(approximate="tanh"),
     "relu": nn.ReLU(),
+    "quick_gelu": QuickGELU(),
 }
 
 
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
new file mode 100644
index 000000000..aa4e87228
--- /dev/null
+++ b/vllm/model_executor/models/clip.py
@@ -0,0 +1,203 @@
+"""Minimal implementation of CLIPVisionModel intended to be only used 
+within a vision language model."""
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+from transformers import CLIPVisionConfig
+from transformers.models.clip.modeling_clip import CLIPAttention
+
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+
+
+def get_clip_num_patches(image_size: int, patch_size: int) -> int:
+    assert image_size % patch_size == 0
+    return (image_size // patch_size)**2
+
+
+# Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa
+class CLIPVisionEmbeddings(nn.Module):
+
+    def __init__(self, config: CLIPVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            bias=False,
+        )
+
+        self.num_patches = get_clip_num_patches(self.image_size,
+                                                self.patch_size)
+        self.num_positions = self.num_patches + 1
+        self.position_embedding = nn.Embedding(self.num_positions,
+                                               self.embed_dim)
+        self.register_buffer("position_ids",
+                             torch.arange(self.num_positions).expand((1, -1)),
+                             persistent=False)
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        batch_size = pixel_values.shape[0]
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(pixel_values.to(
+            dtype=target_dtype))  # shape = [*, width, grid, grid]
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        embeddings = embeddings + self.position_embedding(self.position_ids)
+
+        return embeddings
+
+
+class CLIPMLP(nn.Module):
+
+    def __init__(self,
+                 config: CLIPVisionConfig,
+                 quant_config: Optional[QuantizationConfig] = None):
+        super().__init__()
+        self.config = config
+        self.activation_fn = get_act_fn(config.hidden_act)
+        self.fc1 = ColumnParallelLinear(config.hidden_size,
+                                        config.intermediate_size,
+                                        bias=True,
+                                        quant_config=quant_config)
+        self.fc2 = RowParallelLinear(config.intermediate_size,
+                                     config.hidden_size,
+                                     bias=True,
+                                     quant_config=quant_config)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+
+        return hidden_states
+
+
+class CLIPEncoderLayer(nn.Module):
+
+    def __init__(self,
+                 config: CLIPVisionConfig,
+                 quant_config: Optional[QuantizationConfig] = None):
+        super().__init__()
+
+        self.self_attn = CLIPAttention(config)
+        self.layer_norm1 = nn.LayerNorm(config.hidden_size,
+                                        eps=config.layer_norm_eps)
+        self.mlp = CLIPMLP(config, quant_config=quant_config)
+        self.layer_norm2 = nn.LayerNorm(config.hidden_size,
+                                        eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor]:
+
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states, _ = self.self_attn(hidden_states=hidden_states)
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+class CLIPEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self 
+    attention layers. Each layer is a [`CLIPEncoderLayer`].
+
+    Args:
+        config: CLIPConfig
+    """
+
+    def __init__(self,
+                 config: CLIPVisionConfig,
+                 quant_config: Optional[QuantizationConfig] = None):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([
+            CLIPEncoderLayer(config=config, quant_config=quant_config)
+            for _ in range(config.num_hidden_layers)
+        ])
+
+    def forward(self,
+                inputs_embeds: torch.Tensor,
+                vision_feature_layer: int = -1):
+
+        # Encoder forward pass only up to the required layer
+        num_layer = len(self.layers) + vision_feature_layer + 1
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers[:num_layer]:
+            hidden_states = encoder_layer(hidden_states)
+
+        return hidden_states
+
+
+class CLIPVisionTransformer(nn.Module):
+
+    def __init__(self,
+                 config: CLIPVisionConfig,
+                 quant_config: Optional[QuantizationConfig] = None):
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = CLIPVisionEmbeddings(config)
+
+        # NOTE: This typo of "layrnorm" is not fixed on purpose to match
+        # the original transformers code and name of the model weights.
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self.encoder = CLIPEncoder(config=config, quant_config=quant_config)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        vision_feature_layer: int = -1,
+    ) -> torch.Tensor:
+
+        hidden_states = self.embeddings(pixel_values)
+        hidden_states = self.pre_layrnorm(hidden_states)
+        hidden_states = self.encoder(inputs_embeds=hidden_states,
+                                     vision_feature_layer=vision_feature_layer)
+
+        return hidden_states
+
+
+class CLIPVisionModel(nn.Module):
+
+    config_class = CLIPVisionConfig
+    main_input_name = "pixel_values"
+
+    def __init__(self,
+                 config: CLIPVisionConfig,
+                 quant_config: Optional[QuantizationConfig] = None):
+        super().__init__()
+        self.vision_model = CLIPVisionTransformer(config=config,
+                                                  quant_config=quant_config)
+
+    def forward(self,
+                pixel_values: Optional[torch.Tensor] = None,
+                vision_feature_layer: int = -1):
+
+        return self.vision_model(pixel_values=pixel_values,
+                                 vision_feature_layer=vision_feature_layer)
+
+    @property
+    def device(self):
+        return next(self.parameters()).device
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 39355b9d3..8e36c54b1 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -2,9 +2,7 @@ from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union
 
 import torch
 import torch.nn as nn
-# TODO(xwjiang): We should port CLIPVisionModel's code over to not depend on
-# transformers' impl.
-from transformers import CLIPVisionModel, LlavaConfig
+from transformers import LlavaConfig
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, VisionLanguageConfig
@@ -15,6 +13,7 @@ from vllm.model_executor.layers.quantization.base_config import (
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -189,12 +188,11 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
 
     def _image_pixels_to_features(self, vision_tower: CLIPVisionModel,
                                   pixel_values: torch.Tensor) -> torch.Tensor:
-        # TODO(xwjiang): Maybe port minimal CLIPVisionModel over.
-        image_outputs = vision_tower(pixel_values.to(vision_tower.device),
-                                     output_hidden_states=True)
 
-        image_features = image_outputs.hidden_states[
-            self.config.vision_feature_layer]
+        # NOTE: we skip the step to select the vision feature layer since
+        # this is already done inside the vision tower
+        image_features = vision_tower(pixel_values.to(vision_tower.device),
+                                      self.config.vision_feature_layer)
 
         return self._select_image_features(
             image_features,
@@ -317,6 +315,9 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
+            # post_layernorm is not needed in CLIPVisionModel
+            if "vision_model.post_layernorm" in name:
+                continue
             for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
                 if key_to_modify in name:
                     name = name.replace(key_to_modify, new_key)
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 0ab9afea9..c1158c933 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -4,9 +4,7 @@ from typing import (Dict, Iterable, List, Literal, Optional, Tuple, TypedDict,
 import torch
 import torch.nn as nn
 from PIL import Image
-# TODO(xwjiang): We should port CLIPVisionModel's code over to not depend on
-# transformers' impl.
-from transformers import CLIPVisionModel, LlavaNextConfig
+from transformers import LlavaNextConfig
 from transformers.models.llava_next.modeling_llava_next import (
     get_anyres_image_grid_shape, unpad_image)
 from typing_extensions import NotRequired
@@ -20,6 +18,7 @@ from vllm.model_executor.layers.quantization.base_config import (
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
@@ -121,7 +120,7 @@ class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
 
         if self.vision_language_config.image_input_type == (
                 VisionLanguageConfig.ImageInputType.PIXEL_VALUES):
-            self.vision_tower = CLIPVisionModel(config.vision_config)
+            self.vision_tower = CLIPVisionModel(config=config.vision_config)
         else:
             raise TypeError("Image features are not supported by LLaVA-NeXT")
 
@@ -219,12 +218,11 @@ class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
 
     def _image_pixels_to_features(self, vision_tower: CLIPVisionModel,
                                   pixel_values: torch.Tensor) -> torch.Tensor:
-        # TODO(xwjiang): Maybe port minimal CLIPVisionModel over.
-        image_outputs = vision_tower(pixel_values.to(vision_tower.device),
-                                     output_hidden_states=True)
 
-        image_features = image_outputs.hidden_states[
-            self.config.vision_feature_layer]
+        # NOTE: we skip the step to select the vision feature layer since
+        # this is already done inside the vision tower
+        image_features = vision_tower(pixel_values.to(vision_tower.device),
+                                      self.config.vision_feature_layer)
 
         return self._select_image_features(
             image_features,
@@ -430,6 +428,9 @@ class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
+            # post_layernorm is not needed in CLIPVisionModel
+            if "vision_model.post_layernorm" in name:
+                continue
             for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
                 if key_to_modify in name:
                     name = name.replace(key_to_modify, new_key)
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 35f3b894f..fa20a7c59 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -17,7 +17,7 @@ from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
 
 import torch
 import torch.nn as nn
-from transformers import CLIPVisionConfig, CLIPVisionModel, PretrainedConfig
+from transformers import CLIPVisionConfig, PretrainedConfig
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, VisionLanguageConfig
@@ -27,6 +27,7 @@ from vllm.model_executor.layers.quantization.base_config import (
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
 from vllm.model_executor.sampling_metadata import SamplingMetadata
@@ -70,9 +71,10 @@ class Phi3ImageEmbeddingBase(nn.Module):
         LAYER_IDX = self.layer_idx
         TYPE_FEATURE = self.type_feature
 
-        img_processor_output = self.img_processor(img_embeds,
-                                                  output_hidden_states=True)
-        img_feature = img_processor_output.hidden_states[LAYER_IDX]
+        # NOTE: we skip the step to select the vision feature layer since
+        # this is already done inside the img_processor
+        img_feature = self.img_processor(img_embeds,
+                                         vision_feature_layer=LAYER_IDX)
 
         if TYPE_FEATURE == "patch":
             patch_feature = img_feature[:, 1:]
@@ -352,6 +354,9 @@ class Phi3VForCausalLM(VisionLanguageModelBase):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
+            # post_layernorm is not needed in CLIPVisionModel
+            if "vision_model.post_layernorm" in name:
+                continue
             for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
                 if key_to_modify in name:
                     name = name.replace(key_to_modify, new_key)
-- 
GitLab


From a7dcc62086ea751b46b4821c2811cf8ac83711bf Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Date: Thu, 20 Jun 2024 19:03:21 +0530
Subject: [PATCH 112/376] [Kernel] Update Cutlass int8 kernel configs for SM80
 (#5275)

Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
---
 csrc/quantization/cutlass_w8a8/common.hpp     |   7 +
 .../cutlass_w8a8/scaled_mm_c2x.cu             | 127 ++++++++++++++++--
 .../cutlass_w8a8/scaled_mm_c3x.cu             |   5 -
 3 files changed, 123 insertions(+), 16 deletions(-)

diff --git a/csrc/quantization/cutlass_w8a8/common.hpp b/csrc/quantization/cutlass_w8a8/common.hpp
index 999b7b251..23d0587bb 100644
--- a/csrc/quantization/cutlass_w8a8/common.hpp
+++ b/csrc/quantization/cutlass_w8a8/common.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "cutlass/cutlass.h"
+#include <climits>
 
 /**
  * Helper function for checking CUTLASS errors
@@ -10,3 +11,9 @@
     TORCH_CHECK(status == cutlass::Status::kSuccess, \
                 cutlassGetStatusString(status))      \
   }
+
+inline uint32_t next_pow_2(uint32_t const num) {
+  if (num <= 1) return num;
+  return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
+}
+
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
index 7651268dc..740b9fb64 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
@@ -250,8 +250,120 @@ void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
   CUTLASS_CHECK(status);
 }
 
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue>
+struct sm80_config_default {
+  // This config is used in 2 cases,
+  //  - M in (128, inf)
+  //  - M in (64, 128] and N >= 8192
+  static_assert(std::is_same<InType, int8_t>());
+  using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
+  using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
+  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
+  using Cutlass2xGemm =
+      cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
+                      Epilogue, TileShape, WarpShape, InstructionShape, 5>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue>
+struct sm80_config_M64 {
+  // This config is used in 2 cases,
+  // - M in (32, 64]
+  // - M in (64, 128] and N < 8192
+  static_assert(std::is_same<InType, int8_t>());
+  using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>;
+  using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
+  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
+  using Cutlass2xGemm =
+      cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
+                      Epilogue, TileShape, WarpShape, InstructionShape, 5>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue>
+struct sm80_config_M32 {
+  // M in (16, 32]
+  static_assert(std::is_same<InType, int8_t>());
+  using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>;
+  using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>;
+  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
+  using Cutlass2xGemm =
+      cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
+                      Epilogue, TileShape, WarpShape, InstructionShape, 5>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue>
+struct sm80_config_M16 {
+  // M in [1, 16]
+  static_assert(std::is_same<InType, int8_t>());
+  using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>;
+  using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>;
+  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
+  using Cutlass2xGemm =
+      cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
+                      Epilogue, TileShape, WarpShape, InstructionShape, 5>;
+};
+
 }  // namespace
 
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a,
+                                torch::Tensor const& b,
+                                EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, int8_t>());
+  TORCH_CHECK(a.dtype() == torch::kInt8);
+  TORCH_CHECK(b.dtype() == torch::kInt8);
+
+  using Cutlass2xGemmDefault =
+      typename sm80_config_default<InType, OutType, Epilogue>::Cutlass2xGemm;
+  using Cutlass2xGemmM128BigN =
+      typename sm80_config_default<InType, OutType, Epilogue>::Cutlass2xGemm;
+  using Cutlass2xGemmM128SmallN =
+      typename sm80_config_M64<InType, OutType, Epilogue>::Cutlass2xGemm;
+  using Cutlass2xGemmM64 =
+      typename sm80_config_M64<InType, OutType, Epilogue>::Cutlass2xGemm;
+  using Cutlass2xGemmM32 =
+      typename sm80_config_M32<InType, OutType, Epilogue>::Cutlass2xGemm;
+  using Cutlass2xGemmM16 =
+      typename sm80_config_M16<InType, OutType, Epilogue>::Cutlass2xGemm;
+
+  uint32_t const m = a.size(0);
+  uint32_t const mp2 =
+      std::max(static_cast<uint32_t>(16), next_pow_2(m));  // next power of 2
+  if (mp2 <= 16) {
+    // M in [1, 16]
+    return cutlass_gemm_caller<Cutlass2xGemmM16>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 32) {
+    // M in (16, 32]
+    return cutlass_gemm_caller<Cutlass2xGemmM32>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 64) {
+    // M in (32, 64]
+    return cutlass_gemm_caller<Cutlass2xGemmM64>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 128) {
+    // M in (64, 128]
+    uint32_t const n = out.size(1);
+    bool const small_n = n < 8192;
+    if (small_n) {
+      return cutlass_gemm_caller<Cutlass2xGemmM128SmallN>(
+          out, a, b, std::forward<EpilogueArgs>(args)...);
+    } else {
+      return cutlass_gemm_caller<Cutlass2xGemmM128BigN>(
+          out, a, b, std::forward<EpilogueArgs>(args)...);
+    }
+  } else {
+    // M in (128, inf)
+    return cutlass_gemm_caller<Cutlass2xGemmDefault>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  }
+}
+
 void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a,
                             torch::Tensor const& b,
                             torch::Tensor const& a_scales,
@@ -288,20 +400,13 @@ void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a,
   TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
   TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
 
-  using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
-  using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
-  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
-
   if (out.dtype() == torch::kBFloat16) {
-    return cutlass_gemm_caller<cutlass_2x_gemm<
-        cutlass::arch::Sm80, enable_sm80_to_sm89, int8_t, cutlass::bfloat16_t,
-        ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>(
-        out, a, b, a_scales, b_scales);
+    return cutlass_gemm_sm80_dispatch<int8_t, cutlass::bfloat16_t,
+                                      ScaledEpilogue>(out, a, b, a_scales,
+                                                      b_scales);
   } else {
     TORCH_CHECK(out.dtype() == torch::kFloat16);
-    return cutlass_gemm_caller<cutlass_2x_gemm<
-        cutlass::arch::Sm80, enable_sm80_to_sm89, int8_t, cutlass::half_t,
-        ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>(
+    return cutlass_gemm_sm80_dispatch<int8_t, cutlass::half_t, ScaledEpilogue>(
         out, a, b, a_scales, b_scales);
   }
 }
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
index 8f2aa9425..cfa8f80f7 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
@@ -44,11 +44,6 @@ using namespace cute;
 
 namespace {
 
-uint32_t next_pow_2(uint32_t const num) {
-  if (num <= 1) return num;
-  return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
-}
-
 // A wrapper for the GEMM kernel that is used to guard against compilation on
 // architectures that will never use the kernel. The purpose of this is to
 // reduce the size of the compiled binary.
-- 
GitLab


From 3f3b6b21500bce2061cae33706bd47c8b6663771 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Thu, 20 Jun 2024 14:36:10 -0400
Subject: [PATCH 113/376] [Bugfix] Fix the CUDA version check for FP8 support
 in the CUTLASS kernels (#5715)

---
 csrc/ops.h                                       |  2 ++
 .../quantization/cutlass_w8a8/scaled_mm_entry.cu | 16 ++++++++++++++++
 csrc/torch_bindings.cpp                          |  6 ++++++
 vllm/_custom_ops.py                              |  4 ++++
 vllm/model_executor/layers/quantization/fp8.py   | 15 ++-------------
 5 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/csrc/ops.h b/csrc/ops.h
index ba92cc537..6f0a7143c 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -92,6 +92,8 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
                                  int64_t size_k, int64_t size_n,
                                  int64_t num_bits);
 
+bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
+
 void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
                        torch::Tensor const& b, torch::Tensor const& a_scales,
                        torch::Tensor const& b_scales);
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
index 687f8efd8..f4e582d78 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -25,6 +25,22 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
                             torch::Tensor const& b_scales);
 #endif
 
+bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
+  // CUTLASS FP8 kernels need at least
+  //   CUDA 12.0 on SM90 systems (Hopper)
+  //   CUDA 12.4 on SM89 systems (Lovelace)
+
+#if defined CUDA_VERSION
+  if (cuda_device_capability >= 90) {
+    return CUDA_VERSION >= 12000;
+  } else if (cuda_device_capability >= 89) {
+    return CUDA_VERSION >= 12040;
+  }
+#endif
+
+  return false;
+}
+
 void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
                        torch::Tensor const& b, torch::Tensor const& a_scales,
                        torch::Tensor const& b_scales) {
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 953f2eb4d..227b69d79 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -144,6 +144,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "                  Tensor b, Tensor a_scales,"
       "                  Tensor b_scales) -> ()");
   ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm);
+
+  // Check if cutlass scaled_mm is supported for CUDA devices of the given
+  // capability
+  ops.def("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
+  ops.impl("cutlass_scaled_mm_supports_fp8", torch::kCUDA,
+           &cutlass_scaled_mm_supports_fp8);
 #endif
 
   // Quantized GEMM for GPTQ.
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index a053a3aa2..e050c1172 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -216,6 +216,10 @@ def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
 
 
 # cutlass
+def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
+    return torch.ops._C.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
+
+
 def cutlass_scaled_mm(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
                       scale_b: torch.Tensor,
                       out_dtype: Type[torch.dtype]) -> torch.Tensor:
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index e89fd6581..bbf3cde54 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -20,19 +20,8 @@ logger = init_logger(__name__)
 def cutlass_fp8_supported() -> bool:
     capability = torch.cuda.get_device_capability()
     capability = capability[0] * 10 + capability[1]
-    major, minor = torch.version.cuda.split(".")
-    version = int(major) * 10 + int(minor)
-
-    # CUTLASS FP8 kernels need at least
-    #   CUDA 12.0 on SM90 systems (Hopper)
-    #   CUDA 12.4 on SM89 systems (Lovelace)
-    gpu_is_supported = False
-    if capability >= 90:
-        gpu_is_supported = version > 120
-    elif capability >= 89:
-        gpu_is_supported = version > 124
-
-    return gpu_is_supported
+
+    return ops.cutlass_scaled_mm_supports_fp8(capability)
 
 
 class Fp8Config(QuantizationConfig):
-- 
GitLab


From 8065a7e220cca1dd53107da85b6f3932ac9e25e8 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Thu, 20 Jun 2024 19:00:13 -0400
Subject: [PATCH 114/376] [Frontend] Add FlexibleArgumentParser to support both
 underscore and dash in names (#5718)

---
 benchmarks/benchmark_latency.py               |  3 ++-
 benchmarks/benchmark_prefix_caching.py        |  4 ++--
 benchmarks/benchmark_serving.py               |  7 ++++++-
 benchmarks/benchmark_throughput.py            |  3 ++-
 .../cutlass_benchmarks/w8a8_benchmarks.py     |  3 ++-
 benchmarks/kernels/benchmark_aqlm.py          |  4 ++--
 benchmarks/kernels/benchmark_marlin.py        |  4 ++--
 benchmarks/kernels/benchmark_moe.py           |  3 ++-
 .../kernels/benchmark_paged_attention.py      |  6 +++---
 benchmarks/kernels/benchmark_rope.py          |  4 ++--
 benchmarks/overheads/benchmark_hashing.py     |  4 ++--
 examples/aqlm_example.py                      |  5 ++---
 examples/llm_engine_example.py                |  3 ++-
 examples/save_sharded_state.py                |  4 ++--
 examples/tensorize_vllm_model.py              |  3 ++-
 tests/async_engine/api_server_async_engine.py |  4 ++--
 vllm/engine/arg_utils.py                      | 17 ++++++++---------
 vllm/entrypoints/api_server.py                |  5 ++---
 vllm/entrypoints/openai/cli_args.py           |  3 ++-
 vllm/entrypoints/openai/run_batch.py          |  5 ++---
 .../model_executor/model_loader/tensorizer.py |  4 ++--
 vllm/utils.py                                 | 19 +++++++++++++++++++
 22 files changed, 72 insertions(+), 45 deletions(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index e9d1048c8..a4cf0632b 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -13,6 +13,7 @@ from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptStrictInputs
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.utils import FlexibleArgumentParser
 
 
 def main(args: argparse.Namespace):
@@ -120,7 +121,7 @@ def main(args: argparse.Namespace):
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description='Benchmark the latency of processing a single batch of '
         'requests till completion.')
     parser.add_argument('--model', type=str, default='facebook/opt-125m')
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 089966986..395107a5e 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -1,7 +1,7 @@
-import argparse
 import time
 
 from vllm import LLM, SamplingParams
+from vllm.utils import FlexibleArgumentParser
 
 PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n"  # noqa: E501
 
@@ -44,7 +44,7 @@ def main(args):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description='Benchmark the performance with or without automatic '
         'prefix caching.')
     parser.add_argument('--model',
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index eef03e7d8..42867fc40 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -44,6 +44,11 @@ try:
 except ImportError:
     from backend_request_func import get_tokenizer
 
+try:
+    from vllm.utils import FlexibleArgumentParser
+except ImportError:
+    from argparse import ArgumentParser as FlexibleArgumentParser
+
 
 @dataclass
 class BenchmarkMetrics:
@@ -511,7 +516,7 @@ def main(args: argparse.Namespace):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description="Benchmark the online serving throughput.")
     parser.add_argument(
         "--backend",
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index ed65002bc..2c6beb4e8 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -12,6 +12,7 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer,
 
 from vllm.engine.arg_utils import EngineArgs
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.utils import FlexibleArgumentParser
 
 
 def sample_requests(
@@ -261,7 +262,7 @@ def main(args: argparse.Namespace):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Benchmark the throughput.")
+    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
     parser.add_argument("--backend",
                         type=str,
                         choices=["vllm", "hf", "mii"],
diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
index 5cc0fbbd4..377f8683c 100644
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -11,6 +11,7 @@ from torch.utils.benchmark import Measurement as TMeasurement
 from weight_shapes import WEIGHT_SHAPES
 
 from vllm import _custom_ops as ops
+from vllm.utils import FlexibleArgumentParser
 
 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
@@ -293,7 +294,7 @@ if __name__ == '__main__':
             return torch.float8_e4m3fn
         raise ValueError("unsupported dtype")
 
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description="""
 Benchmark Cutlass GEMM.
 
diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py
index ac6a9f297..601c4ea43 100644
--- a/benchmarks/kernels/benchmark_aqlm.py
+++ b/benchmarks/kernels/benchmark_aqlm.py
@@ -1,4 +1,3 @@
-import argparse
 import os
 import sys
 from typing import Optional
@@ -10,6 +9,7 @@ from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.aqlm import (
     dequantize_weight, generic_dequantize_gemm, get_int_dtype,
     optimized_dequantize_gemm)
+from vllm.utils import FlexibleArgumentParser
 
 os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 
@@ -137,7 +137,7 @@ def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
 
 def main():
 
-    parser = argparse.ArgumentParser(description="Benchmark aqlm performance.")
+    parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")
 
     # Add arguments
     parser.add_argument("--nbooks",
diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
index 96f01967b..261f58296 100644
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -1,4 +1,3 @@
-import argparse
 from typing import List
 
 import torch
@@ -16,6 +15,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     MarlinWorkspace, marlin_24_quantize, marlin_quantize)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     gptq_pack, quantize_weights, sort_weights)
+from vllm.utils import FlexibleArgumentParser
 
 DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
@@ -211,7 +211,7 @@ def main(args):
 #   python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
 #
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description="Benchmark Marlin across specified models/shapes/batches")
     parser.add_argument(
         "--models",
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 62347aaf8..e00696d6d 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -10,6 +10,7 @@ from ray.experimental.tqdm_ray import tqdm
 from transformers import AutoConfig
 
 from vllm.model_executor.layers.fused_moe.fused_moe import *
+from vllm.utils import FlexibleArgumentParser
 
 
 class BenchmarkConfig(TypedDict):
@@ -315,7 +316,7 @@ def main(args: argparse.Namespace):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+    parser = FlexibleArgumentParser()
     parser.add_argument("--model",
                         type=str,
                         default="mistralai/Mixtral-8x7B-Instruct-v0.1")
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 687e2369b..16de60477 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -1,4 +1,3 @@
-import argparse
 import random
 import time
 from typing import List, Optional
@@ -6,7 +5,8 @@ from typing import List, Optional
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
+                        create_kv_caches_with_random)
 
 NUM_BLOCKS = 1024
 PARTITION_SIZE = 512
@@ -161,7 +161,7 @@ def main(
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description="Benchmark the paged attention kernel.")
     parser.add_argument("--version",
                         type=str,
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index a53c6c77a..78736c7a7 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -1,4 +1,3 @@
-import argparse
 from itertools import accumulate
 from typing import List, Optional
 
@@ -7,6 +6,7 @@ import torch
 
 from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
                                                          get_rope)
+from vllm.utils import FlexibleArgumentParser
 
 
 def benchmark_rope_kernels_multi_lora(
@@ -86,7 +86,7 @@ def benchmark_rope_kernels_multi_lora(
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description="Benchmark the rotary embedding kernels.")
     parser.add_argument("--is-neox-style", type=bool, default=True)
     parser.add_argument("--batch-size", type=int, default=16)
diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py
index c846e47de..203699e9a 100644
--- a/benchmarks/overheads/benchmark_hashing.py
+++ b/benchmarks/overheads/benchmark_hashing.py
@@ -1,8 +1,8 @@
-import argparse
 import cProfile
 import pstats
 
 from vllm import LLM, SamplingParams
+from vllm.utils import FlexibleArgumentParser
 
 # A very long prompt, total number of tokens is about 15k.
 LONG_PROMPT = ["You are an expert in large language models, aren't you?"
@@ -47,7 +47,7 @@ def main(args):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description='Benchmark the performance of hashing function in'
         'automatic prefix caching.')
     parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')
diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py
index 3a63003ab..40f9a21ec 100644
--- a/examples/aqlm_example.py
+++ b/examples/aqlm_example.py
@@ -1,11 +1,10 @@
-import argparse
-
 from vllm import LLM, SamplingParams
+from vllm.utils import FlexibleArgumentParser
 
 
 def main():
 
-    parser = argparse.ArgumentParser(description='AQLM examples')
+    parser = FlexibleArgumentParser(description='AQLM examples')
 
     parser.add_argument('--model',
                         '-m',
diff --git a/examples/llm_engine_example.py b/examples/llm_engine_example.py
index a81c4b3e3..ca41f32b1 100644
--- a/examples/llm_engine_example.py
+++ b/examples/llm_engine_example.py
@@ -2,6 +2,7 @@ import argparse
 from typing import List, Tuple
 
 from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
+from vllm.utils import FlexibleArgumentParser
 
 
 def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
@@ -55,7 +56,7 @@ def main(args: argparse.Namespace):
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description='Demo on using the LLMEngine class directly')
     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
diff --git a/examples/save_sharded_state.py b/examples/save_sharded_state.py
index c595d98ba..4207f8922 100644
--- a/examples/save_sharded_state.py
+++ b/examples/save_sharded_state.py
@@ -20,15 +20,15 @@ llm = LLM(
     tensor_parallel_size=8,
 )
 """
-import argparse
 import dataclasses
 import os
 import shutil
 from pathlib import Path
 
 from vllm import LLM, EngineArgs
+from vllm.utils import FlexibleArgumentParser
 
-parser = argparse.ArgumentParser()
+parser = FlexibleArgumentParser()
 EngineArgs.add_cli_args(parser)
 parser.add_argument("--output",
                     "-o",
diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py
index f9ed5fe08..dd77a4ad0 100644
--- a/examples/tensorize_vllm_model.py
+++ b/examples/tensorize_vllm_model.py
@@ -9,6 +9,7 @@ from vllm.engine.arg_utils import EngineArgs
 from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
                                                          TensorizerConfig,
                                                          tensorize_vllm_model)
+from vllm.utils import FlexibleArgumentParser
 
 # yapf conflicts with isort for this docstring
 # yapf: disable
@@ -96,7 +97,7 @@ deserialization in this example script, although `--tensorizer-uri` and
 
 
 def parse_args():
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description="An example script that can be used to serialize and "
         "deserialize vLLM models. These models "
         "can be loaded using tensorizer directly to the GPU "
diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py
index 1be76fdc8..495a123c3 100644
--- a/tests/async_engine/api_server_async_engine.py
+++ b/tests/async_engine/api_server_async_engine.py
@@ -1,5 +1,4 @@
 """vllm.entrypoints.api_server with some extra logging for testing."""
-import argparse
 from typing import Any, Dict
 
 import uvicorn
@@ -8,6 +7,7 @@ from fastapi.responses import JSONResponse, Response
 import vllm.entrypoints.api_server
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.utils import FlexibleArgumentParser
 
 app = vllm.entrypoints.api_server.app
 
@@ -33,7 +33,7 @@ def stats() -> Response:
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+    parser = FlexibleArgumentParser()
     parser.add_argument("--host", type=str, default="localhost")
     parser.add_argument("--port", type=int, default=8000)
     parser = AsyncEngineArgs.add_cli_args(parser)
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 7f760c277..ef3161242 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -11,7 +11,7 @@ from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
                          SpeculativeConfig, TokenizerPoolConfig,
                          VisionLanguageConfig)
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-from vllm.utils import str_to_int_tuple
+from vllm.utils import FlexibleArgumentParser, str_to_int_tuple
 
 
 def nullable_str(val: str):
@@ -110,7 +110,7 @@ class EngineArgs:
 
     @staticmethod
     def add_cli_args_for_vlm(
-            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+            parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         parser.add_argument('--image-input-type',
                             type=nullable_str,
                             default=None,
@@ -156,8 +156,7 @@ class EngineArgs:
         return parser
 
     @staticmethod
-    def add_cli_args(
-            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+    def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         """Shared CLI arguments for vLLM engine."""
 
         # Model arguments
@@ -800,8 +799,8 @@ class AsyncEngineArgs(EngineArgs):
     max_log_len: Optional[int] = None
 
     @staticmethod
-    def add_cli_args(parser: argparse.ArgumentParser,
-                     async_args_only: bool = False) -> argparse.ArgumentParser:
+    def add_cli_args(parser: FlexibleArgumentParser,
+                     async_args_only: bool = False) -> FlexibleArgumentParser:
         if not async_args_only:
             parser = EngineArgs.add_cli_args(parser)
         parser.add_argument('--engine-use-ray',
@@ -822,13 +821,13 @@ class AsyncEngineArgs(EngineArgs):
 
 # These functions are used by sphinx to build the documentation
 def _engine_args_parser():
-    return EngineArgs.add_cli_args(argparse.ArgumentParser())
+    return EngineArgs.add_cli_args(FlexibleArgumentParser())
 
 
 def _async_engine_args_parser():
-    return AsyncEngineArgs.add_cli_args(argparse.ArgumentParser(),
+    return AsyncEngineArgs.add_cli_args(FlexibleArgumentParser(),
                                         async_args_only=True)
 
 
 def _vlm_engine_args_parser():
-    return EngineArgs.add_cli_args_for_vlm(argparse.ArgumentParser())
+    return EngineArgs.add_cli_args_for_vlm(FlexibleArgumentParser())
diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py
index 075de0b4e..feb904c5a 100644
--- a/vllm/entrypoints/api_server.py
+++ b/vllm/entrypoints/api_server.py
@@ -6,7 +6,6 @@ We are also not going to accept PRs modifying this file, please
 change `vllm/entrypoints/openai/api_server.py` instead.
 """
 
-import argparse
 import json
 import ssl
 from typing import AsyncGenerator
@@ -19,7 +18,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import random_uuid
+from vllm.utils import FlexibleArgumentParser, random_uuid
 
 TIMEOUT_KEEP_ALIVE = 5  # seconds.
 app = FastAPI()
@@ -80,7 +79,7 @@ async def generate(request: Request) -> Response:
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+    parser = FlexibleArgumentParser()
     parser.add_argument("--host", type=str, default=None)
     parser.add_argument("--port", type=int, default=8000)
     parser.add_argument("--ssl-keyfile", type=str, default=None)
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 4c0cb1e4f..59ad73bf0 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -10,6 +10,7 @@ import ssl
 
 from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
 from vllm.entrypoints.openai.serving_engine import LoRAModulePath
+from vllm.utils import FlexibleArgumentParser
 
 
 class LoRAParserAction(argparse.Action):
@@ -23,7 +24,7 @@ class LoRAParserAction(argparse.Action):
 
 
 def make_arg_parser():
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description="vLLM OpenAI-Compatible RESTful API server.")
     parser.add_argument("--host",
                         type=nullable_str,
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 2f1870187..488ac8971 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -1,4 +1,3 @@
-import argparse
 import asyncio
 import sys
 from io import StringIO
@@ -16,14 +15,14 @@ from vllm.entrypoints.openai.protocol import (BatchRequestInput,
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import random_uuid
+from vllm.utils import FlexibleArgumentParser, random_uuid
 from vllm.version import __version__ as VLLM_VERSION
 
 logger = init_logger(__name__)
 
 
 def parse_args():
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description="vLLM OpenAI-Compatible batch runner.")
     parser.add_argument(
         "-i",
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index d79fedaea..b009ad8c8 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -21,6 +21,7 @@ from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
+from vllm.utils import FlexibleArgumentParser
 
 tensorizer_error_msg = None
 
@@ -177,8 +178,7 @@ class TensorizerArgs:
                 self.deserializer_params['encryption'] = decryption_params
 
     @staticmethod
-    def add_cli_args(
-            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+    def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         """Tensorizer CLI arguments"""
 
         # Tensorizer options arg group
diff --git a/vllm/utils.py b/vllm/utils.py
index ffe921e65..27a7b1042 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1,3 +1,4 @@
+import argparse
 import asyncio
 import datetime
 import enum
@@ -775,3 +776,21 @@ def run_once(f):
 
     wrapper.has_run = False  # type: ignore[attr-defined]
     return wrapper
+
+
+class FlexibleArgumentParser(argparse.ArgumentParser):
+    """ArgumentParser that allows both underscore and dash in names."""
+
+    def parse_args(self, args=None, namespace=None):
+        if args is None:
+            args = sys.argv[1:]
+
+        # Convert underscores to dashes and vice versa in argument names
+        processed_args = []
+        for arg in args:
+            if arg.startswith('--'):
+                processed_args.append('--' + arg[len('--'):].replace('_', '-'))
+            else:
+                processed_args.append(arg)
+
+        return super().parse_args(processed_args, namespace)
-- 
GitLab


From 6c5b7af1525a2013d7b1806dd6c0c9a53404be6d Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 20 Jun 2024 17:06:34 -0700
Subject: [PATCH 115/376] [distributed][misc] use fork by default for mp
 (#5669)

---
 .buildkite/test-pipeline.yaml                 |  9 ++++++
 .../custom_all_reduce_utils.py                | 28 ++++++++++++++++++-
 vllm/envs.py                                  |  4 +--
 3 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 95cd5b198..5e92ba3c2 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -37,6 +37,9 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   commands:
+  # FIXIT: find out which code initialize cuda before running the test
+  # before the fix, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
@@ -55,6 +58,9 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   commands:
+  # FIXIT: find out which code initialize cuda before running the test
+  # before the fix, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s distributed/test_pynccl.py
   # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
   # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
@@ -145,6 +151,9 @@ steps:
   num_gpus: 4
   # This test runs llama 13B, so it is required to run on 4 GPUs.
   commands:
+    # FIXIT: find out which code initialize cuda before running the test
+    # before the fix, we need to use spawn to test it
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
     - pytest -v -s -x lora/test_long_context.py
 
 - label: Tensorizer Test
diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index e0641a54c..d3e41fa71 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -1,6 +1,9 @@
 import ctypes
 import json
 import os
+import pickle
+import subprocess
+import sys
 from itertools import product
 from typing import Dict, List, Optional, Sequence
 
@@ -198,7 +201,25 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
         ids = list(range(num_dev))
         # batch of all pairs of GPUs
         batch_src, batch_tgt = zip(*list(product(ids, ids)))
-        result = can_actually_p2p(batch_src, batch_tgt)
+        # NOTE: we use `subprocess` rather than `multiprocessing` here
+        # because the caller might not have `if __name__ == "__main__":`,
+        # in that case we cannot use spawn method in multiprocessing.
+        # However, `can_actually_p2p` requires spawn method.
+        # The fix is, we use `subprocess` to call the function,
+        # where we have `if __name__ == "__main__":` in this file.
+        input_bytes = pickle.dumps((batch_src, batch_tgt))
+        returned = subprocess.run([sys.executable, __file__],
+                                  input=input_bytes,
+                                  capture_output=True)
+        # check if the subprocess is successful
+        try:
+            returned.check_returncode()
+        except Exception as e:
+            # wrap raised exception to provide more information
+            raise RuntimeError(
+                f"Error happened when batch testing "
+                f"peer-to-peer access from {batch_src} to {batch_tgt}") from e
+        result = pickle.loads(returned.stdout)
         for _i, _j, r in zip(batch_src, batch_tgt, result):
             cache[f"{_i}->{_j}"] = r
         with open(path, "w") as f:
@@ -213,3 +234,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
 
 
 __all__ = ["gpu_p2p_access_check"]
+
+if __name__ == "__main__":
+    batch_src, batch_tgt = pickle.loads(sys.stdin.buffer.read())
+    result = can_actually_p2p(batch_src, batch_tgt)
+    sys.stdout.buffer.write(pickle.dumps(result))
diff --git a/vllm/envs.py b/vllm/envs.py
index f03b69f4b..ae2fcd082 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -29,7 +29,7 @@ if TYPE_CHECKING:
     VLLM_CPU_KVCACHE_SPACE: int = 0
     VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/"
     VLLM_USE_RAY_COMPILED_DAG: bool = False
-    VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
+    VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_TARGET_DEVICE: str = "cuda"
     MAX_JOBS: Optional[str] = None
@@ -212,7 +212,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     # Use dedicated multiprocess context for workers.
     # Both spawn and fork work
     "VLLM_WORKER_MULTIPROC_METHOD":
-    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),
+    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),
 
     # Timeout for fetching images when serving multimodal models
     # Default is 5 seconds
-- 
GitLab


From b12518d3cf4326dfcd10a09780913b86c19fcf1a Mon Sep 17 00:00:00 2001
From: Joshua Rosenkranz <joshua.rosenkranz@gmail.com>
Date: Thu, 20 Jun 2024 20:23:12 -0400
Subject: [PATCH 116/376] [Model] MLPSpeculator speculative decoding support
 (#4947)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>

Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
Co-authored-by: Davis Wertheimer <Davis.Wertheimer@ibm.com>
---
 examples/offline_inference_mlpspeculator.py   |  59 ++++++++
 tests/spec_decode/test_spec_decode_worker.py  |   8 +-
 tests/spec_decode/test_utils.py               |   4 +-
 vllm/config.py                                |  54 +++++--
 vllm/model_executor/models/__init__.py        |   1 +
 vllm/model_executor/models/mlp_speculator.py  | 143 ++++++++++++++++++
 vllm/sequence.py                              |  46 ++++++
 vllm/spec_decode/batch_expansion.py           |   6 +-
 vllm/spec_decode/interfaces.py                |   4 +
 vllm/spec_decode/mlp_speculator_worker.py     |  87 +++++++++++
 vllm/spec_decode/spec_decode_worker.py        |  42 ++++-
 vllm/spec_decode/top1_proposer.py             |   4 +
 vllm/spec_decode/util.py                      |   8 -
 vllm/transformers_utils/config.py             |  18 ++-
 vllm/transformers_utils/configs/__init__.py   |   2 +
 .../configs/mlp_speculator.py                 |  50 ++++++
 vllm/worker/model_runner.py                   |  18 ++-
 vllm/worker/worker.py                         |   9 ++
 18 files changed, 523 insertions(+), 40 deletions(-)
 create mode 100644 examples/offline_inference_mlpspeculator.py
 create mode 100644 vllm/model_executor/models/mlp_speculator.py
 create mode 100644 vllm/spec_decode/mlp_speculator_worker.py
 create mode 100644 vllm/transformers_utils/configs/mlp_speculator.py

diff --git a/examples/offline_inference_mlpspeculator.py b/examples/offline_inference_mlpspeculator.py
new file mode 100644
index 000000000..5448ec1f6
--- /dev/null
+++ b/examples/offline_inference_mlpspeculator.py
@@ -0,0 +1,59 @@
+import gc
+import time
+from typing import List
+
+from vllm import LLM, SamplingParams
+
+
+def time_generation(llm: LLM, prompts: List[str],
+                    sampling_params: SamplingParams):
+    # Generate texts from the prompts. The output is a list of RequestOutput
+    # objects that contain the prompt, generated text, and other information.
+    # Warmup first
+    llm.generate(prompts, sampling_params)
+    llm.generate(prompts, sampling_params)
+    start = time.time()
+    outputs = llm.generate(prompts, sampling_params)
+    end = time.time()
+    print((end - start) / sum([len(o.outputs[0].token_ids) for o in outputs]))
+    # Print the outputs.
+    for output in outputs:
+        generated_text = output.outputs[0].text
+        print(f"text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+
+    template = (
+        "Below is an instruction that describes a task. Write a response "
+        "that appropriately completes the request.\n\n### Instruction:\n{}"
+        "\n\n### Response:\n")
+
+    # Sample prompts.
+    prompts = [
+        "Write about the president of the United States.",
+    ]
+    prompts = [template.format(prompt) for prompt in prompts]
+    # Create a sampling params object.
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=200)
+
+    # Create an LLM without spec decoding
+    llm = LLM(model="meta-llama/Llama-2-13b-chat-hf")
+
+    print("Without speculation")
+    time_generation(llm, prompts, sampling_params)
+
+    del llm
+    gc.collect()
+
+    # Create an LLM with spec decoding
+    llm = LLM(
+        model="meta-llama/Llama-2-13b-chat-hf",
+        speculative_model="ibm-fms/llama-13b-accelerator",
+        # These are currently required for MLPSpeculator decoding
+        use_v2_block_manager=True,
+        enforce_eager=True,
+    )
+
+    print("With speculation")
+    time_generation(llm, prompts, sampling_params)
diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py
index afaeffc96..a20c793c9 100644
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -456,7 +456,9 @@ def test_k_equals_zero(k: int, batch_size: int):
     rejection_sampler.token_id_dtype = torch.int64
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
 
-    target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)]
+    sampler_output = MagicMock(spec=SamplerOutput)
+    sampler_output.hidden_states = None
+    target_worker.execute_model.return_value = [sampler_output]
 
     draft_worker.device = 'cuda'
     target_worker.device = 'cuda'
@@ -497,7 +499,9 @@ def test_empty_input_batch(k: int, batch_size: int):
     rejection_sampler.token_id_dtype = torch.int64
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
 
-    target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)]
+    sampler_output = MagicMock(spec=SamplerOutput)
+    sampler_output.hidden_states = None
+    target_worker.execute_model.return_value = [sampler_output]
 
     draft_worker.device = 'cuda'
     target_worker.device = 'cuda'
diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py
index 6b6f35a1a..bccbf9a6a 100644
--- a/tests/spec_decode/test_utils.py
+++ b/tests/spec_decode/test_utils.py
@@ -2,8 +2,8 @@ from unittest.mock import MagicMock
 
 import pytest
 
-from vllm.sequence import SequenceGroupMetadata
-from vllm.spec_decode.util import get_all_seq_ids, split_batch_by_proposal_len
+from vllm.sequence import SequenceGroupMetadata, get_all_seq_ids
+from vllm.spec_decode.util import split_batch_by_proposal_len
 
 
 def test_get_all_seq_ids():
diff --git a/vllm/config.py b/vllm/config.py
index 5de00d7d3..8d004902f 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -230,7 +230,8 @@ class ModelConfig:
         self,
         parallel_config: "ParallelConfig",
     ) -> None:
-        total_num_attention_heads = self.hf_text_config.num_attention_heads
+        total_num_attention_heads = getattr(self.hf_text_config,
+                                            "num_attention_heads", 0)
         tensor_parallel_size = parallel_config.tensor_parallel_size
         if total_num_attention_heads % tensor_parallel_size != 0:
             raise ValueError(
@@ -238,7 +239,8 @@ class ModelConfig:
                 " must be divisible by tensor parallel size "
                 f"({tensor_parallel_size}).")
 
-        total_num_hidden_layers = self.hf_text_config.num_hidden_layers
+        total_num_hidden_layers = getattr(self.hf_text_config,
+                                          "num_hidden_layers", 0)
         pipeline_parallel_size = parallel_config.pipeline_parallel_size
         if total_num_hidden_layers % pipeline_parallel_size != 0:
             raise ValueError(
@@ -341,8 +343,8 @@ class ModelConfig:
 
     def get_num_attention_heads(self,
                                 parallel_config: "ParallelConfig") -> int:
-        return self.hf_text_config.num_attention_heads // \
-            parallel_config.tensor_parallel_size
+        num_heads = getattr(self.hf_text_config, "num_attention_heads", 0)
+        return num_heads // parallel_config.tensor_parallel_size
 
     def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
         total_num_hidden_layers = self.hf_text_config.num_hidden_layers
@@ -818,7 +820,8 @@ class SpeculativeConfig:
             speculative_model (Optional[str]): The name of the speculative
                 model, if provided.
             num_speculative_tokens (Optional[int]): The number of speculative
-                tokens, if provided.
+                tokens, if provided. Will default to the number in the draft
+                model config if present, otherwise is required.
             speculative_max_model_len (Optional[int]): The maximum model len of
                 the speculative model. Used when testing the ability to skip
                 speculation for some sequences.
@@ -841,24 +844,18 @@ class SpeculativeConfig:
                 the necessary conditions are met, else None.
         """
 
-        if speculative_model is None and num_speculative_tokens is None:
+        if speculative_model is None:
+            if num_speculative_tokens is not None:
+                raise ValueError("num_speculative_tokens was provided without "
+                                 "speculative_model.")
             return None
 
-        if speculative_model is not None and num_speculative_tokens is None:
-            raise ValueError(
-                "Expected both speculative_model and "
-                "num_speculative_tokens to be provided, but found "
-                f"{speculative_model=} and {num_speculative_tokens=}.")
-
         if (speculative_disable_by_batch_size is not None
                 and speculative_disable_by_batch_size < 2):
             raise ValueError("Expect the batch size threshold of disabling "
                              "speculative decoding is > 1, but got "
                              f"{speculative_disable_by_batch_size=}")
 
-        assert (speculative_model is not None
-                and num_speculative_tokens is not None)
-
         if enable_chunked_prefill:
             raise ValueError(
                 "Speculative decoding and chunked prefill are "
@@ -912,6 +909,27 @@ class SpeculativeConfig:
                 max_logprobs=target_model_config.max_logprobs,
             )
 
+            if (draft_model_config.hf_config.model_type == "mlp_speculator"
+                    and target_parallel_config.world_size != 1):
+                # MLPSpeculator TP support will be added very soon
+                raise ValueError(
+                    "Speculative decoding with mlp_speculator models does not "
+                    "yet support distributed inferencing (TP > 1).")
+
+            n_predict = getattr(draft_model_config.hf_config, "n_predict",
+                                None)
+            if n_predict is not None:
+                if num_speculative_tokens is None:
+                    # Default to max value defined in draft model config.
+                    num_speculative_tokens = n_predict
+                elif num_speculative_tokens > n_predict:
+                    # Verify provided value doesn't exceed the maximum
+                    # supported by the draft model.
+                    raise ValueError(
+                        "Expected both speculative_model and "
+                        "num_speculative_tokens to be provided, but found "
+                        f"{speculative_model=} and {num_speculative_tokens=}.")
+
             draft_model_config.max_model_len = (
                 SpeculativeConfig._maybe_override_draft_max_model_len(
                     speculative_max_model_len,
@@ -923,6 +941,12 @@ class SpeculativeConfig:
                 SpeculativeConfig.create_draft_parallel_config(
                     target_parallel_config))
 
+        if num_speculative_tokens is None:
+            raise ValueError(
+                "num_speculative_tokens must be provided with "
+                "speculative_model unless the draft model config contains an "
+                "n_predict parameter.")
+
         return SpeculativeConfig(
             draft_model_config,
             draft_parallel_config,
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index f9ec72096..5afb2e1d4 100755
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -60,6 +60,7 @@ _GENERATION_MODELS = {
     "ArcticForCausalLM": ("arctic", "ArcticForCausalLM"),
     "XverseForCausalLM": ("xverse", "XverseForCausalLM"),
     "Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"),
+    "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
 }
 
 _EMBEDDING_MODELS = {
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py
new file mode 100644
index 000000000..b18269777
--- /dev/null
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -0,0 +1,143 @@
+import math
+from typing import Iterable, List, Tuple
+
+import torch
+import torch.nn as nn
+
+from vllm.model_executor import SamplingMetadata
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.sequence import SamplerOutput
+
+
+class MLPSpeculatorLayerNorm(nn.Module):
+    """
+    A L2 normalization implementation
+    ...
+    Args
+    ----
+    normalized_shape : int
+        Dimensionality of input data (size of final tensor axis)
+    eps : float
+        Safety term to prevent division by zero. Make sure the chosen value
+         fits in the range of your encoding scheme
+         (i.e. fp16 requires eps >= 6e-8).
+    """
+
+    def __init__(
+        self,
+        normalized_shape,
+        eps=1e-06,
+    ):
+        super(MLPSpeculatorLayerNorm, self).__init__()
+        self.weight = nn.Parameter(torch.empty(normalized_shape))
+        self.bias = nn.Parameter(torch.empty(normalized_shape))
+        self.eps = eps
+
+    def forward(self, x):
+        xf = x
+        xf = xf * torch.rsqrt(xf.pow(2).mean(-1, keepdim=True) + self.eps)
+        x = xf.type_as(x)
+        x = self.weight * x
+        x = x + self.bias
+        return x
+
+
+class MLPSpeculator(nn.Module):
+
+    def __init__(self, config, **kwargs) -> None:
+        super().__init__()
+        self.n_predict = config.n_predict
+        self.vocab_size = config.vocab_size
+        self.emb_dim = config.emb_dim
+        self.inner_dim = config.inner_dim if config.inner_dim != 0 \
+            else config.emb_dim
+
+        self.max_speculative_tokens = getattr(config, "max_speculative_tokens",
+                                              self.n_predict)
+
+        self.emb = nn.ModuleList([
+            VocabParallelEmbedding(config.vocab_size,
+                                   self.inner_dim,
+                                   org_num_embeddings=config.vocab_size)
+            for _ in range(self.max_speculative_tokens)
+        ])
+
+        self.proj = nn.ModuleList([
+            nn.Linear((self.emb_dim if i == 0 else self.inner_dim),
+                      self.inner_dim,
+                      bias=False) for i in range(self.max_speculative_tokens)
+        ])
+
+        self.head = nn.ModuleList([
+            nn.Linear(self.inner_dim, self.vocab_size, bias=False)
+            for _ in range(self.max_speculative_tokens)
+        ])
+        self.ln = nn.ModuleList([
+            MLPSpeculatorLayerNorm(self.inner_dim)
+            for _ in range(self.max_speculative_tokens)
+        ])
+
+        self.state_weight = 0.5**(0.5 / config.n_predict)
+        self.emb_weight = math.sqrt(
+            (1 - self.state_weight**2) * (self.inner_dim / 2))
+        self.activation = nn.GELU()
+        self.config = config
+        self.logits_processor = LogitsProcessor(config.vocab_size,
+                                                config.vocab_size, 1.0)
+        self.sampler = Sampler()
+
+    def generate_proposals(
+        self,
+        input_ids: torch.Tensor,
+        previous_hidden_states: torch.Tensor,
+        num_predict_tokens: int,
+        sampling_metadata: SamplingMetadata,
+    ) -> List[SamplerOutput]:
+        if num_predict_tokens > self.max_speculative_tokens:
+            raise ValueError(f"Max speculative tokens for model is "
+                             f"{self.max_speculative_tokens}, but "
+                             f"{num_predict_tokens} were requested")
+
+        # b x 1 x d
+        previous_hidden_states = previous_hidden_states.unsqueeze(1)
+
+        # b x 1
+        last_tokens = input_ids.unsqueeze(1)
+
+        next_tokens = []
+
+        for head_index in range(num_predict_tokens):
+
+            # Project and predict
+            z = self.emb[head_index](last_tokens)  # b k d
+            states = self.proj[head_index](previous_hidden_states)
+
+            # Weighted add of state_weight*state and emb_weight*z
+            # Let subsequent LN take care of denominator
+            # state_weight is close to 1, so shouldn't be any precision issues
+            states.add_(z, alpha=self.emb_weight / self.state_weight)
+
+            states = self.activation(self.ln[head_index](states))  # b k d
+            # TODO: not yet supporting top_k_tokens_per_head
+            previous_hidden_states = states
+
+            logits = self.logits_processor(self.head[head_index].weight,
+                                           states, sampling_metadata)
+
+            output = self.sampler(logits.flatten(0, 1), sampling_metadata)
+            last_tokens = output.sampled_token_ids
+            next_tokens.append(output)
+
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            param = params_dict[name.replace("speculator.", "")]
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 38d3349f2..287e1b9df 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -794,6 +794,9 @@ class SamplerOutput:
     # Spec decode metrics populated by workers.
     spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None
 
+    # Optional last hidden states from the model.
+    hidden_states: Optional[torch.Tensor] = None
+
     def __getitem__(self, idx: int):
         return self.outputs[idx]
 
@@ -842,6 +845,46 @@ class PoolerOutput:
                           self.__class__) and self.outputs == other.outputs
 
 
+def get_all_seq_ids(
+        seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[int]:
+    """Given a list of SequenceGroupMetadata, create a list of all
+    sequence ids.
+    """
+    return [seq_id for sg in seq_group_metadata_list for seq_id in sg.seq_data]
+
+
+class HiddenStates:
+    """Hidden states corresponding to in-progress sequences.
+    Used in speculative decoding to pass hidden states from
+    the target model to the proposer model in the subsequent step.
+
+    seq_ids are the sequence ids of each entry of the batch
+    dimension of the hidden_states tensor"""
+
+    def __init__(self, seq_group_metadata_list: List[SequenceGroupMetadata],
+                 hidden_states: torch.Tensor):
+        assert len(seq_group_metadata_list) == len(hidden_states)
+        self.seq_ids: List[int] = get_all_seq_ids(seq_group_metadata_list)
+        self.hidden_states: torch.Tensor = hidden_states
+
+    def update(self, seq_group_metadata_list: List[SequenceGroupMetadata],
+               hidden_states: torch.Tensor) -> None:
+        """Update hidden states from target model invocation."""
+        assert len(seq_group_metadata_list) == len(hidden_states)
+        self.seq_ids.extend(get_all_seq_ids(seq_group_metadata_list))
+        self.hidden_states = torch.cat([self.hidden_states, hidden_states])
+
+    def prune(self,
+              seq_group_metadata_list: List[SequenceGroupMetadata]) -> None:
+        """Prune to provided list of sequence ids."""
+        seq_ids = get_all_seq_ids(seq_group_metadata_list)
+        if seq_ids != self.seq_ids:
+            # Batch contents changed - prune removed sequences.
+            index = [self.seq_ids.index(seq_id) for seq_id in seq_ids]
+            self.hidden_states = self.hidden_states[index]
+            self.seq_ids = seq_ids
+
+
 @dataclass
 class ExecuteModelRequest:
     """The model execution request."""
@@ -857,6 +900,8 @@ class ExecuteModelRequest:
     num_lookahead_slots: int = 0
     # The number of requests in the running queue.
     running_queue_size: int = 0
+    # Optional hidden states from prior step.
+    previous_hidden_states: Optional[HiddenStates] = None
 
     def clone(
         self, seq_group_metadata_list: List[SequenceGroupMetadata]
@@ -869,4 +914,5 @@ class ExecuteModelRequest:
             blocks_to_copy=self.blocks_to_copy.copy(),
             num_lookahead_slots=self.num_lookahead_slots,
             running_queue_size=self.running_queue_size,
+            previous_hidden_states=self.previous_hidden_states,
         )
diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py
index 1bde04208..405165563 100644
--- a/vllm/spec_decode/batch_expansion.py
+++ b/vllm/spec_decode/batch_expansion.py
@@ -4,11 +4,10 @@ from typing import Iterator, List, Tuple
 import torch
 
 from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData,
-                           SequenceGroupMetadata)
+                           SequenceGroupMetadata, get_all_seq_ids)
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                          SpeculativeScorer, SpeculativeScores)
-from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range,
-                                   sampler_output_to_torch,
+from vllm.spec_decode.util import (nvtx_range, sampler_output_to_torch,
                                    split_batch_by_proposal_len)
 from vllm.worker.worker_base import WorkerBase
 
@@ -98,6 +97,7 @@ class BatchExpansionTop1Scorer(SpeculativeScorer):
             probs=all_probs,
             token_ids=all_tokens,
             logprobs=spec_logprobs,
+            hidden_states=target_sampler_output.hidden_states,
         )
 
     def _expand_batch(
diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py
index 72d7818eb..d236fc0f2 100644
--- a/vllm/spec_decode/interfaces.py
+++ b/vllm/spec_decode/interfaces.py
@@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
+from typing import Optional
 
 import torch
 
@@ -46,6 +47,9 @@ class SpeculativeScores:
     # tokens and also non-speculative normal decoding.
     token_ids: torch.Tensor
 
+    # Optional last hidden states from the scoring model.
+    hidden_states: Optional[torch.Tensor] = None
+
     def __repr__(self):
         return (f"SpeculativeScores("
                 f"probs={self.probs.shape}, "
diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py
new file mode 100644
index 000000000..0926e13be
--- /dev/null
+++ b/vllm/spec_decode/mlp_speculator_worker.py
@@ -0,0 +1,87 @@
+from typing import List, Optional, Tuple
+
+import torch
+
+from vllm.model_executor import SamplingMetadata
+from vllm.sequence import (ExecuteModelRequest, SamplerOutput,
+                           SequenceGroupMetadata)
+from vllm.spec_decode.multi_step_worker import MultiStepWorker
+from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase
+from vllm.worker.model_runner import ModelInput
+
+
+class MLPSpeculatorWorker(NonLLMProposerWorkerBase, MultiStepWorker):
+    """Worker for MLPSpeculator models.
+
+    Not currently compatible with LoRA or chunked prefill.
+    """
+
+    @torch.inference_mode()
+    def sampler_output(
+        self,
+        execute_model_req: ExecuteModelRequest,
+        sample_len: int,
+    ) -> Tuple[List[SamplerOutput], bool]:
+        """Run the model forward pass to generate sample_len future tokens.
+        Returns the list of sampler output, one per layer, along with indicator
+        of whether torch tensor in sampler output need to be transposed in
+        latter sampler_output_to_torch logic.
+
+        For mlp spec worker, this indicator shall be True.
+        """
+        self._raise_if_unsupported(execute_model_req)
+
+        seq_group_metadata_list = execute_model_req.seq_group_metadata_list
+
+        (input_tokens, seq_lens,
+         query_lens) = self._prepare_input_tensors(seq_group_metadata_list)
+
+        sampling_metadata = SamplingMetadata.prepare(
+            seq_group_metadata_list, seq_lens, query_lens, self.device,
+            self.model_runner.pin_memory)
+
+        model_outputs = self.model_runner.model.generate_proposals(
+            input_ids=input_tokens,
+            previous_hidden_states=execute_model_req.previous_hidden_states.
+            hidden_states,
+            num_predict_tokens=sample_len,
+            sampling_metadata=sampling_metadata)
+
+        assert len(model_outputs) == sample_len
+
+        return model_outputs, True
+
+    def _prepare_input_tensors(
+        self,
+        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+    ) -> Tuple[torch.Tensor, List[int], List[int]]:
+        if not seq_group_metadata_list:
+            return ModelInput.empty(self.device)
+
+        input_tokens: List[int] = []
+        seq_lens: List[int] = []
+        query_lens: List[int] = []
+
+        for seq_group_metadata in seq_group_metadata_list:
+            is_prompt = seq_group_metadata.is_prompt
+
+            for seq_data in seq_group_metadata.seq_data.values():
+                seq_data_len = seq_data.get_len()
+                if is_prompt:
+                    context_len = seq_data.get_num_computed_tokens()
+                    seq_len = min(
+                        seq_data_len,
+                        context_len + seq_group_metadata.token_chunk_size)
+                    tokens = seq_data.get_token_ids()[context_len:seq_len]
+                    seq_lens.append(seq_len)
+                    input_tokens.extend(tokens)
+                    query_lens.append(seq_len - context_len)
+                else:
+                    seq_lens.append(seq_data_len)
+                    input_tokens.append(seq_data.get_last_token_id())
+                    query_lens.append(1)
+
+        input_tokens_tensor = torch.tensor(input_tokens,
+                                           dtype=torch.long,
+                                           device=self.device)
+        return input_tokens_tensor, seq_lens, query_lens
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 03fad5663..58d3461a2 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -8,16 +8,18 @@ from vllm.distributed.communication_op import broadcast_tensor_dict
 from vllm.logger import init_logger
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest,
-                           SamplerOutput, SequenceGroupMetadata)
+                           HiddenStates, SamplerOutput, SequenceGroupMetadata,
+                           get_all_seq_ids)
 from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                          SpeculativeScorer, SpeculativeScores)
 from vllm.spec_decode.metrics import AsyncMetricsCollector
+from vllm.spec_decode.mlp_speculator_worker import MLPSpeculatorWorker
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.spec_decode.ngram_worker import NGramWorker
 from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
 from vllm.spec_decode.util import (create_sequence_group_output,
-                                   get_all_num_logprobs, get_all_seq_ids,
+                                   get_all_num_logprobs,
                                    get_sampled_token_logprobs, nvtx_range,
                                    split_batch_by_proposal_len)
 from vllm.worker.worker import Worker
@@ -104,6 +106,10 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
             proposer_worker = NGramWorker(**draft_worker_kwargs)
             proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min,
                                                   ngram_prompt_lookup_max)
+        elif draft_worker_kwargs[
+                "model_config"].hf_config.model_type == "mlp_speculator":
+            proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs)
+            disable_bonus_tokens = False
         else:
             proposer_worker = MultiStepWorker(**draft_worker_kwargs)
 
@@ -155,6 +161,10 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
         # Lazy initiazliation.
         self.scorer: SpeculativeScorer
 
+        # Hidden states from target model to pass to proposer
+        # in the subsequent step.
+        self.previous_hidden_states: Optional[HiddenStates] = None
+
     def init_device(self) -> None:
         """Initialize both scorer and proposer models.
         """
@@ -337,6 +347,16 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
         assert len(sampler_output) == 1
         sampler_output = sampler_output[0]
 
+        # Store hidden states from target model execution.
+        hidden_states = sampler_output.hidden_states
+        if hidden_states is not None:
+            if self.previous_hidden_states is None:
+                self.previous_hidden_states = HiddenStates(
+                    execute_model_req.seq_group_metadata_list, hidden_states)
+            else:
+                self.previous_hidden_states.update(
+                    execute_model_req.seq_group_metadata_list, hidden_states)
+
         # Clear device tensors from sampler output. This reduces communication
         # overhead when the engine runs in a different process than the workers.
         sampler_output.probs = None
@@ -383,6 +403,10 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
         """
         assert num_lookahead_slots == execute_model_req.num_lookahead_slots
 
+        # Pass last hidden states from target model to proposer
+        execute_model_req.previous_hidden_states = self.previous_hidden_states
+        self.previous_hidden_states = None
+
         # Generate proposals using draft worker.
         proposals = self.proposer_worker.get_spec_proposals(execute_model_req)
 
@@ -466,6 +490,20 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
         # metadata.
         accepted_token_ids[original_indices] = accepted_token_ids.clone()
 
+        hidden_states = proposal_scores.hidden_states
+        if hidden_states is not None:
+            # Contract hidden states based on accepted tokens
+            hs_size = hidden_states.shape[1]
+            hidden_states = hidden_states.reshape(-1, max_proposal_len + 1,
+                                                  hs_size)
+            accepted_index = accepted_token_ids + 1  # Convert -1 to 0
+            accepted_index = accepted_index.count_nonzero(dim=1).add_(-1)
+            index = accepted_index[:, None, None].expand(-1, 1, hs_size)
+            hidden_states = hidden_states.gather(1, index).squeeze(1)  # b x d
+            # Store hidden states from target model for subsequent decode step
+            self.previous_hidden_states = HiddenStates(seq_group_metadata_list,
+                                                       hidden_states)
+
         return accepted_token_ids, logprobs
 
     def _create_output_sampler_list(
diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py
index 278db94bf..d3e280e68 100644
--- a/vllm/spec_decode/top1_proposer.py
+++ b/vllm/spec_decode/top1_proposer.py
@@ -65,9 +65,13 @@ class Top1Proposer(SpeculativeProposer):
             # token_ids is like [batch] format in proposal_len size list,
             # while if it is false, the format would be [proposal_len]
             # in batch size list
+            hidden_states = execute_model_req.previous_hidden_states
+            if hidden_states is not None:
+                hidden_states.prune(nonzero_proposal_len_seqs)
             nonzero_execute_model_req = ExecuteModelRequest(
                 seq_group_metadata_list=nonzero_proposal_len_seqs,
                 num_lookahead_slots=proposal_len,
+                previous_hidden_states=hidden_states,
             )
             maybe_sampler_output, transposed = self._worker.sampler_output(
                 execute_model_req=nonzero_execute_model_req,
diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py
index 9bbe3f8d1..80710419e 100644
--- a/vllm/spec_decode/util.py
+++ b/vllm/spec_decode/util.py
@@ -10,14 +10,6 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
 SeqId = int
 
 
-def get_all_seq_ids(
-        seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[SeqId]:
-    """Given a list of SequenceGroupMetadata, create a list of all
-    sequence ids.
-    """
-    return [seq_id for sg in seq_group_metadata_list for seq_id in sg.seq_data]
-
-
 def get_all_num_logprobs(
         seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[int]:
     """Given a list of SequenceGroupMetadata, create a list of all num_logprobs.
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index ada840182..60fc756a1 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -1,3 +1,4 @@
+import contextlib
 from typing import Dict, Optional, Type
 
 from transformers import PretrainedConfig
@@ -5,7 +6,13 @@ from transformers import PretrainedConfig
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
 from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
-                                             JAISConfig, MPTConfig, RWConfig)
+                                             JAISConfig, MLPSpeculatorConfig,
+                                             MPTConfig, RWConfig)
+
+if VLLM_USE_MODELSCOPE:
+    from modelscope import AutoConfig
+else:
+    from transformers import AutoConfig
 
 logger = init_logger(__name__)
 
@@ -16,8 +23,13 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
     "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
     "jais": JAISConfig,
+    "mlp_speculator": MLPSpeculatorConfig,
 }
 
+for name, cls in _CONFIG_REGISTRY.items():
+    with contextlib.suppress(ValueError):
+        AutoConfig.register(name, cls)
+
 
 def get_config(model: str,
                trust_remote_code: bool,
@@ -26,10 +38,6 @@ def get_config(model: str,
                rope_scaling: Optional[dict] = None,
                rope_theta: Optional[float] = None) -> PretrainedConfig:
     try:
-        if VLLM_USE_MODELSCOPE:
-            from modelscope import AutoConfig
-        else:
-            from transformers import AutoConfig
         config = AutoConfig.from_pretrained(
             model,
             trust_remote_code=trust_remote_code,
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index 0e4869288..d8170858c 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -5,6 +5,7 @@ from vllm.transformers_utils.configs.dbrx import DbrxConfig
 # `FalconConfig` class from the official HuggingFace transformers library.
 from vllm.transformers_utils.configs.falcon import RWConfig
 from vllm.transformers_utils.configs.jais import JAISConfig
+from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
 from vllm.transformers_utils.configs.mpt import MPTConfig
 
 __all__ = [
@@ -13,4 +14,5 @@ __all__ = [
     "MPTConfig",
     "RWConfig",
     "JAISConfig",
+    "MLPSpeculatorConfig",
 ]
diff --git a/vllm/transformers_utils/configs/mlp_speculator.py b/vllm/transformers_utils/configs/mlp_speculator.py
new file mode 100644
index 000000000..dd1d92b86
--- /dev/null
+++ b/vllm/transformers_utils/configs/mlp_speculator.py
@@ -0,0 +1,50 @@
+from typing import List, Optional
+
+from transformers import PretrainedConfig
+
+
+class MLPSpeculatorConfig(PretrainedConfig):
+    model_type = "mlp_speculator"
+
+    attribute_map = {
+        "hidden_size": "emb_dim",
+    }
+
+    def __init__(self,
+                 vocab_size: int = 32000,
+                 emb_dim: int = 4096,
+                 inner_dim: int = 0,
+                 n_predict: int = 3,
+                 top_k_tokens_per_head: Optional[List[int]] = None,
+                 n_candidates: int = 5,
+                 **kwargs):
+        """
+        Initialize an MLPSpeculatorConfig
+
+        Args:
+            vocab_size: int
+                the model vocab size
+            emb_dim: int
+                the model embedding dimension
+            inner_dim: int
+                the inner dimension of the model. If 0, will be the emb_dim.
+            n_predict: int
+                the number of lookaheads for the speculator
+            top_k_tokens_per_head: List[int]
+                Number of tokens to consider from each head when forming the
+                candidate tree.
+                For each candidate branch in the tree, head n produces topk[n]
+                additional sub-branches.
+            n_candidates: int
+                number of child candidates to create per sequence
+        """
+        if top_k_tokens_per_head is None:
+            top_k_tokens_per_head = [5, 4, 3]
+        assert len(top_k_tokens_per_head) == n_predict
+        self.vocab_size = vocab_size
+        self.emb_dim = emb_dim
+        self.inner_dim = inner_dim
+        self.n_predict = n_predict
+        self.top_k_tokens_per_head = top_k_tokens_per_head
+        self.n_candidates = n_candidates
+        super().__init__(**kwargs)
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index d0baa4337..e24835a1e 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -86,6 +86,7 @@ class ModelRunner:
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
         vision_language_config: Optional[VisionLanguageConfig] = None,
+        return_hidden_states: bool = False,
     ):
         self.model_config = model_config
         self.parallel_config = parallel_config
@@ -96,6 +97,7 @@ class ModelRunner:
         self.load_config = load_config
         self.is_driver_worker = is_driver_worker
         self.vision_language_config = vision_language_config
+        self.return_hidden_states = return_hidden_states
 
         self.device = self.device_config.device
         self.pin_memory = is_pin_memory_available()
@@ -116,15 +118,17 @@ class ModelRunner:
         self.graph_block_tables = np.zeros(
             (max(_BATCH_SIZES_TO_CAPTURE), self.get_max_block_per_batch()),
             dtype=np.int32)
+        num_attn_heads = self.model_config.get_num_attention_heads(
+            self.parallel_config)
         self.attn_backend = get_attn_backend(
-            self.model_config.get_num_attention_heads(self.parallel_config),
+            num_attn_heads,
             self.model_config.get_head_size(),
             self.model_config.get_num_kv_heads(self.parallel_config),
             self.model_config.get_sliding_window(),
             self.model_config.dtype,
             self.kv_cache_dtype,
             self.block_size,
-        )
+        ) if num_attn_heads else None
 
         # Create processor for multi-modal data
         if self.vision_language_config is not None:
@@ -762,11 +766,19 @@ class ModelRunner:
             return None
 
         # Sample the next token.
-        output = self.model.sample(
+        output: SamplerOutput = self.model.sample(
             logits=logits,
             sampling_metadata=sampling_metadata,
         )
 
+        if self.return_hidden_states:
+            # we only need to pass hidden states of most recent token
+            assert seq_group_metadata_list is not None
+            if seq_group_metadata_list[0].is_prompt:
+                hidden_states = hidden_states.index_select(
+                    0, sampling_metadata.selected_token_indices)
+            output.hidden_states = hidden_states
+
         return output
 
     @torch.inference_mode()
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index f9b8a065a..e334ffbb7 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -70,6 +70,14 @@ class Worker(WorkerBase):
             assert not self.lora_config, (
                 "To be tested: vision language model with LoRA settings.")
 
+        # Return hidden states from target model if the draft model is an
+        # mlp_speculator
+        speculative_args = {} if speculative_config is None \
+            or (speculative_config.draft_model_config.model ==
+                model_config.model) \
+              or (speculative_config.draft_model_config.hf_config.model_type !=
+                  "mlp_speculator") else {"return_hidden_states": True}
+
         ModelRunnerClass = (EmbeddingModelRunner if
                             self.model_config.embedding_mode else ModelRunner)
         self.model_runner = ModelRunnerClass(
@@ -83,6 +91,7 @@ class Worker(WorkerBase):
             kv_cache_dtype=self.cache_config.cache_dtype,
             is_driver_worker=is_driver_worker,
             vision_language_config=vision_language_config,
+            **speculative_args,
         )
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.
-- 
GitLab


From 1f5674218f968dec625d0995fe5cd5d626db9188 Mon Sep 17 00:00:00 2001
From: Jinzhen Lin <linjinzhen@hotmail.com>
Date: Fri, 21 Jun 2024 08:55:41 +0800
Subject: [PATCH 117/376] [Kernel] Add punica dimension for Qwen2 LoRA (#5441)

---
 csrc/punica/bgmv/bgmv_config.h | 38 ++++++++++++++++++++++++++++++++--
 tests/lora/test_punica.py      | 17 +++++++++++++++
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h
index c38db2dcd..cb6694b30 100755
--- a/csrc/punica/bgmv/bgmv_config.h
+++ b/csrc/punica/bgmv/bgmv_config.h
@@ -16,15 +16,20 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 512) \
     f(in_T, out_T, W_T, narrow, 640) \
     f(in_T, out_T, W_T, narrow, 768) \
+    f(in_T, out_T, W_T, narrow, 896) \
     f(in_T, out_T, W_T, narrow, 1024) \
     f(in_T, out_T, W_T, narrow, 1152) \
+    f(in_T, out_T, W_T, narrow, 1216) \
     f(in_T, out_T, W_T, narrow, 1280) \
     f(in_T, out_T, W_T, narrow, 1536) \
     f(in_T, out_T, W_T, narrow, 1664) \
     f(in_T, out_T, W_T, narrow, 1728) \
     f(in_T, out_T, W_T, narrow, 1792) \
     f(in_T, out_T, W_T, narrow, 2048) \
+    f(in_T, out_T, W_T, narrow, 2240) \
     f(in_T, out_T, W_T, narrow, 2304) \
+    f(in_T, out_T, W_T, narrow, 2368) \
+    f(in_T, out_T, W_T, narrow, 2432) \
     f(in_T, out_T, W_T, narrow, 2560) \
     f(in_T, out_T, W_T, narrow, 2752) \
     f(in_T, out_T, W_T, narrow, 2816) \
@@ -32,8 +37,12 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 3328) \
     f(in_T, out_T, W_T, narrow, 3456) \
     f(in_T, out_T, W_T, narrow, 3584) \
+    f(in_T, out_T, W_T, narrow, 3712) \
     f(in_T, out_T, W_T, narrow, 4096) \
+    f(in_T, out_T, W_T, narrow, 4480) \
     f(in_T, out_T, W_T, narrow, 4608) \
+    f(in_T, out_T, W_T, narrow, 4736) \
+    f(in_T, out_T, W_T, narrow, 4864) \
     f(in_T, out_T, W_T, narrow, 5120) \
     f(in_T, out_T, W_T, narrow, 5504) \
     f(in_T, out_T, W_T, narrow, 5632) \
@@ -43,8 +52,11 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 6848) \
     f(in_T, out_T, W_T, narrow, 6912) \
     f(in_T, out_T, W_T, narrow, 7168) \
+    f(in_T, out_T, W_T, narrow, 7424) \
     f(in_T, out_T, W_T, narrow, 8192) \
+    f(in_T, out_T, W_T, narrow, 8960) \
     f(in_T, out_T, W_T, narrow, 9216) \
+    f(in_T, out_T, W_T, narrow, 9472) \
     f(in_T, out_T, W_T, narrow, 10240) \
     f(in_T, out_T, W_T, narrow, 11008) \
     f(in_T, out_T, W_T, narrow, 11264) \
@@ -52,8 +64,11 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 13696) \
     f(in_T, out_T, W_T, narrow, 13824) \
     f(in_T, out_T, W_T, narrow, 14336) \
+    f(in_T, out_T, W_T, narrow, 14784) \
+    f(in_T, out_T, W_T, narrow, 14848) \
     f(in_T, out_T, W_T, narrow, 15360) \
     f(in_T, out_T, W_T, narrow, 16384) \
+    f(in_T, out_T, W_T, narrow, 18944) \
     f(in_T, out_T, W_T, narrow, 20480) \
     f(in_T, out_T, W_T, narrow, 22016) \
     f(in_T, out_T, W_T, narrow, 22528) \
@@ -61,6 +76,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 27392) \
     f(in_T, out_T, W_T, narrow, 27648) \
     f(in_T, out_T, W_T, narrow, 28672) \
+    f(in_T, out_T, W_T, narrow, 29568) \
+    f(in_T, out_T, W_T, narrow, 29696) \
     f(in_T, out_T, W_T, narrow, 32000) \
     f(in_T, out_T, W_T, narrow, 32256) \
     f(in_T, out_T, W_T, narrow, 32512) \
@@ -85,9 +102,9 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
 // Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA
 // and vllm/tests/lora/test_punica.py
 
-// Used for defining kernels going from the variety of 
+// Used for defining kernels going from the variety of
 // dim in to the narrow dim out
-    // Using it for the fully sharded column 
+    // Using it for the fully sharded column
     // parallel LoRA A which splits the rank dim
 #define FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, narrow) \
     f(in_T, out_T, W_T, 128, narrow) \
@@ -95,15 +112,20 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, 512, narrow) \
     f(in_T, out_T, W_T, 640, narrow) \
     f(in_T, out_T, W_T, 768, narrow) \
+    f(in_T, out_T, W_T, 896, narrow) \
     f(in_T, out_T, W_T, 1024, narrow) \
     f(in_T, out_T, W_T, 1152, narrow) \
+    f(in_T, out_T, W_T, 1216, narrow) \
     f(in_T, out_T, W_T, 1280, narrow) \
     f(in_T, out_T, W_T, 1536, narrow) \
     f(in_T, out_T, W_T, 1664, narrow) \
     f(in_T, out_T, W_T, 1728, narrow) \
     f(in_T, out_T, W_T, 1792, narrow) \
     f(in_T, out_T, W_T, 2048, narrow) \
+    f(in_T, out_T, W_T, 2240, narrow) \
     f(in_T, out_T, W_T, 2304, narrow) \
+    f(in_T, out_T, W_T, 2368, narrow) \
+    f(in_T, out_T, W_T, 2432, narrow) \
     f(in_T, out_T, W_T, 2560, narrow) \
     f(in_T, out_T, W_T, 2752, narrow) \
     f(in_T, out_T, W_T, 2816, narrow) \
@@ -111,8 +133,12 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, 3328, narrow) \
     f(in_T, out_T, W_T, 3456, narrow) \
     f(in_T, out_T, W_T, 3584, narrow) \
+    f(in_T, out_T, W_T, 3712, narrow) \
     f(in_T, out_T, W_T, 4096, narrow) \
+    f(in_T, out_T, W_T, 4480, narrow) \
     f(in_T, out_T, W_T, 4608, narrow) \
+    f(in_T, out_T, W_T, 4736, narrow) \
+    f(in_T, out_T, W_T, 4864, narrow) \
     f(in_T, out_T, W_T, 5120, narrow) \
     f(in_T, out_T, W_T, 5504, narrow) \
     f(in_T, out_T, W_T, 5632, narrow) \
@@ -122,8 +148,11 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, 6848, narrow) \
     f(in_T, out_T, W_T, 6912, narrow) \
     f(in_T, out_T, W_T, 7168, narrow) \
+    f(in_T, out_T, W_T, 7424, narrow) \
     f(in_T, out_T, W_T, 8192, narrow) \
+    f(in_T, out_T, W_T, 8960, narrow) \
     f(in_T, out_T, W_T, 9216, narrow) \
+    f(in_T, out_T, W_T, 9472, narrow) \
     f(in_T, out_T, W_T, 10240, narrow) \
     f(in_T, out_T, W_T, 11008, narrow) \
     f(in_T, out_T, W_T, 11264, narrow) \
@@ -131,8 +160,11 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, 13696, narrow) \
     f(in_T, out_T, W_T, 13824, narrow) \
     f(in_T, out_T, W_T, 14336, narrow) \
+    f(in_T, out_T, W_T, 14784, narrow) \
+    f(in_T, out_T, W_T, 14848, narrow) \
     f(in_T, out_T, W_T, 15360, narrow) \
     f(in_T, out_T, W_T, 16384, narrow) \
+    f(in_T, out_T, W_T, 18944, narrow) \
     f(in_T, out_T, W_T, 20480, narrow) \
     f(in_T, out_T, W_T, 22016, narrow) \
     f(in_T, out_T, W_T, 22528, narrow) \
@@ -140,6 +172,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, 27392, narrow) \
     f(in_T, out_T, W_T, 27648, narrow) \
     f(in_T, out_T, W_T, 28672, narrow) \
+    f(in_T, out_T, W_T, 29568, narrow) \
+    f(in_T, out_T, W_T, 29696, narrow) \
     f(in_T, out_T, W_T, 32000, narrow) \
     f(in_T, out_T, W_T, 32256, narrow) \
     f(in_T, out_T, W_T, 32512, narrow) \
diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py
index dae1d5687..110c9b243 100644
--- a/tests/lora/test_punica.py
+++ b/tests/lora/test_punica.py
@@ -49,21 +49,30 @@ H1 = H2 = [
     128,
     256,
     512,
+    896,
     1024,
     1152,
+    1216,
     1280,
     1536,
     1664,
     2048,
+    2240,
     2304,
+    2368,
+    2432,
     2560,
     2752,
     3072,
     3328,
     3456,
     3584,
+    3712,
     4096,
+    4480,
     4608,
+    4736,
+    4864,
     5120,
     5504,
     5632,
@@ -73,19 +82,27 @@ H1 = H2 = [
     6848,
     6912,
     7168,
+    7424,
     8192,
+    8960,
     9216,
+    9472,
     10240,
     11008,
     11264,
     13824,
     14336,
+    14784,
+    14848,
     15360,
+    18944,
     22016,
     22528,
     24576,
     27392,
     27648,
+    29568,
+    29696,
     32000,
     32256,
     32512,
-- 
GitLab


From c35e4a3dd74fa5952b04354a3c7cfd0ed09e2eb0 Mon Sep 17 00:00:00 2001
From: Chang Su <chang.s.su@oracle.com>
Date: Thu, 20 Jun 2024 21:45:34 -0700
Subject: [PATCH 118/376] [BugFix] Fix test_phi3v.py (#5725)

---
 tests/conftest.py          |  4 +++-
 tests/models/test_phi3v.py | 10 ++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 5bbfd87f0..67885b932 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -233,11 +233,13 @@ class HfRunner:
         prompts: List[str],
         max_tokens: int,
         images: Optional[List[Image.Image]] = None,
+        **kwargs,
     ) -> List[Tuple[List[int], str]]:
         outputs = self.generate(prompts,
                                 do_sample=False,
                                 max_new_tokens=max_tokens,
-                                images=images)
+                                images=images,
+                                **kwargs)
 
         return [(output_ids[0], output_str[0])
                 for output_ids, output_str in outputs]
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 1732e8f08..234547598 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -77,7 +77,7 @@ if is_cpu():
 # numeric difference for longer context and test can't pass
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [8])
+@pytest.mark.parametrize("max_tokens", [128])
 def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
                 model_and_config, dtype: str, max_tokens: int) -> None:
     """Inference result should be the same between hf and vllm.
@@ -95,9 +95,11 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
     hf_model_kwargs = {"_attn_implementation": "eager"}
     with hf_runner(model_id, dtype=dtype,
                    model_kwargs=hf_model_kwargs) as hf_model:
-        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
-                                              max_tokens,
-                                              images=hf_images)
+        hf_outputs = hf_model.generate_greedy(
+            HF_IMAGE_PROMPTS,
+            max_tokens,
+            images=hf_images,
+            eos_token_id=hf_model.processor.tokenizer.eos_token_id)
 
     vllm_image_prompts = [
         p.replace("<|image_1|>",
-- 
GitLab


From 67005a07bc0991211ba2acccb3e56c72a47f9def Mon Sep 17 00:00:00 2001
From: Jee Li <pandaleefree@163.com>
Date: Fri, 21 Jun 2024 12:46:28 +0800
Subject: [PATCH 119/376] [Bugfix] Add  fully sharded layer for
 QKVParallelLinearWithLora (#5665)

Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
---
 tests/lora/test_baichuan.py       | 14 +++++---
 tests/lora/test_layers.py         |  7 ++--
 vllm/lora/fully_sharded_layers.py | 58 +++++++++++++++++++++++++++++--
 vllm/lora/layers.py               | 36 +++++++++++--------
 vllm/lora/utils.py                |  4 ++-
 5 files changed, 93 insertions(+), 26 deletions(-)

diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py
index e1b81655c..56cec4db8 100644
--- a/tests/lora/test_baichuan.py
+++ b/tests/lora/test_baichuan.py
@@ -64,7 +64,8 @@ def test_baichuan_lora(baichuan_lora_files):
 
 
 @pytest.mark.skip("Requires multiple GPUs")
-def test_baichuan_tensor_parallel_equality(baichuan_lora_files):
+@pytest.mark.parametrize("fully_sharded", [True, False])
+def test_baichuan_tensor_parallel_equality(baichuan_lora_files, fully_sharded):
     # Cannot use as it will initialize torch.cuda too early...
     # if torch.cuda.device_count() < 4:
     #     pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
@@ -75,7 +76,8 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files):
                        max_loras=4,
                        max_lora_rank=64,
                        tensor_parallel_size=1,
-                       trust_remote_code=True)
+                       trust_remote_code=True,
+                       fully_sharded_loras=fully_sharded)
     output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1)
 
     del llm_tp1
@@ -87,7 +89,8 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files):
                        max_loras=4,
                        max_lora_rank=64,
                        tensor_parallel_size=2,
-                       trust_remote_code=True)
+                       trust_remote_code=True,
+                       fully_sharded_loras=fully_sharded)
     output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2)
 
     del llm_tp2
@@ -101,10 +104,11 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files):
                        max_loras=4,
                        max_lora_rank=64,
                        tensor_parallel_size=4,
-                       trust_remote_code=True)
+                       trust_remote_code=True,
+                       fully_sharded_loras=fully_sharded)
     output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2)
 
     del llm_tp4
     cleanup()
 
-    assert output_tp1 == output_tp4
\ No newline at end of file
+    assert output_tp1 == output_tp4
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 4b489670f..2e51e95a3 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -12,7 +12,8 @@ from vllm.config import LoRAConfig
 from vllm.lora.fully_sharded_layers import (
     ColumnParallelLinearWithShardedLoRA,
     MergedColumnParallelLinearWithShardedLoRA,
-    MergedQKVParallelLinearWithShardedLora, RowParallelLinearWithShardedLoRA)
+    MergedQKVParallelLinearWithShardedLora, QKVParallelLinearWithShardedLora,
+    RowParallelLinearWithShardedLoRA)
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
@@ -684,7 +685,9 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
                                        bias=False,
                                        params_dtype=torch.float16)
             linear.weight.data = torch.rand_like(linear.weight.data)
-            lora_linear = QKVParallelLinearWithLora(linear)
+            lora_linear = QKVParallelLinearWithLora(
+                linear
+            ) if not fully_shard else QKVParallelLinearWithShardedLora(linear)
 
         @dataclass
         class FakeConfig:
diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py
index ffdc32b73..d27171f72 100644
--- a/vllm/lora/fully_sharded_layers.py
+++ b/vllm/lora/fully_sharded_layers.py
@@ -12,6 +12,7 @@ from vllm.distributed.parallel_state import get_tensor_model_parallel_rank
 from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
                               MergedColumnParallelLinearWithLoRA,
                               MergedQKVParallelLinearWithLora,
+                              QKVParallelLinearWithLora,
                               RowParallelLinearWithLoRA)
 from vllm.lora.punica import bgmv, dispatch_bgmv_low_level
 
@@ -90,11 +91,11 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA):
 def _mcp_apply(x, bias, layer):
     """
     MergedColumnParallelLinearWithShardedLoRA and 
-    QKVParallelLinearWithShardedLora share the same 
+    MergedQKVParallelLinearWithShardedLora share the same 
     LoRa weight application method.
     
     The main difference is the step by shard_size for lora_b which can
-    vary for QKVParallelLinearWithShardedLora but is constant for 
+    vary for MergedQKVParallelLinearWithShardedLora but is constant for 
     MergedColumnParallelLinearWithShardedLoRA.
     """
     # expecting 2 for column parallel and 3 for qkv
@@ -167,7 +168,7 @@ class MergedColumnParallelLinearWithShardedLoRA(
         )
 
 
-class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora):
+class QKVParallelLinearWithShardedLora(QKVParallelLinearWithLora):
     """
     Differs from QKVParallelLinearWithLora by slicing the 
     LoRA A's also.
@@ -175,6 +176,57 @@ class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora):
     Based on S-LoRA, slicing happens along the rank dim.
     """
 
+    def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
+        tp_rank = get_tensor_model_parallel_rank()
+        shard_size = self.lora_a_stacked.shape[2]
+        start_idx = tp_rank * shard_size
+        lora_a = lora_a[:, start_idx:start_idx + shard_size]
+        return lora_a
+
+    def apply(self, x: torch.Tensor,
+              bias: Optional[torch.Tensor]) -> torch.Tensor:
+        output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
+
+        x = x.view(-1, x.shape[-1])
+        output, out_orig_shape = output.view(-1,
+                                             output.shape[-1]), output.shape
+        buffer = torch.zeros((x.shape[0], self.lora_a_stacked.shape[2]),
+                             dtype=torch.float32,
+                             device=x.device)
+
+        bgmv(buffer, x, self.lora_a_stacked,
+             self.indices[:self.indices_len[0]], 0, 1.0)
+        buffer = tensor_model_parallel_all_gather(buffer)
+        bgmv(output, buffer, self.lora_b_stacked,
+             self.indices[:self.indices_len[0]], 0, 1.0)
+        # now have column partitioned output
+
+        output = output.view(*out_orig_shape)
+        return output
+
+    @classmethod
+    @_fully_sharded_can_replace
+    def can_replace_layer(cls, source_layer: nn.Module,
+                          lora_config: LoRAConfig, packed_modules_list: List,
+                          model_config: Optional[PretrainedConfig]) -> bool:
+        # specifying kwargs so they can be easily accessed in decorator
+        return super().can_replace_layer(
+            source_layer=source_layer,
+            lora_config=lora_config,
+            packed_modules_list=packed_modules_list,
+            model_config=model_config,
+            decorate=False,
+        )
+
+
+class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora):
+    """
+    Differs from MergedQKVParallelLinearWithLora by slicing the 
+    LoRA A's also.
+
+    Based on S-LoRA, slicing happens along the rank dim.
+    """
+
     def slice_lora_a(
         self, lora_a: List[Union[torch.Tensor, None]]
     ) -> List[Union[torch.Tensor, None]]:
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index e3ab1708c..e4a23273f 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -641,6 +641,24 @@ class QKVParallelLinearWithLora(ColumnParallelLinearWithLoRA):
         self.kv_proj_total_size = (self.base_layer.total_num_kv_heads *
                                    self.base_layer.head_size)
 
+    def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
+        tp_rank = get_tensor_model_parallel_rank()
+        self.q_shard_id = tp_rank
+        self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas
+        lora_b_q = lora_b[:, self.q_proj_shard_size *
+                          self.q_shard_id:self.q_proj_shard_size *
+                          (self.q_shard_id + 1)]
+        k_offset = self.q_proj_total_size
+        lora_b_k = lora_b[:, k_offset +
+                          self.kv_proj_shard_size * self.kv_shard_id:k_offset +
+                          self.kv_proj_shard_size * (self.kv_shard_id + 1)]
+        v_offset = k_offset + self.kv_proj_total_size
+        lora_b_v = lora_b[:, v_offset +
+                          self.kv_proj_shard_size * self.kv_shard_id:v_offset +
+                          self.kv_proj_shard_size * (self.kv_shard_id + 1)]
+        lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1)
+        return lora_b
+
     def set_lora(
         self,
         index: int,
@@ -650,21 +668,8 @@ class QKVParallelLinearWithLora(ColumnParallelLinearWithLoRA):
     ):
         self.reset_lora(index)
         if self.tp_size > 1:
-            tp_rank = get_tensor_model_parallel_rank()
-            self.q_shard_id = tp_rank
-            self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas
-            lora_b_q = lora_b[:, self.q_proj_shard_size *
-                              self.q_shard_id:self.q_proj_shard_size *
-                              (self.q_shard_id + 1)]
-            k_offset = self.q_proj_total_size
-            lora_b_k = lora_b[:, k_offset + self.kv_proj_shard_size *
-                              self.kv_shard_id:k_offset +
-                              self.kv_proj_shard_size * (self.kv_shard_id + 1)]
-            v_offset = k_offset + self.kv_proj_total_size
-            lora_b_v = lora_b[:, v_offset + self.kv_proj_shard_size *
-                              self.kv_shard_id:v_offset +
-                              self.kv_proj_shard_size * (self.kv_shard_id + 1)]
-            lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1)
+            lora_a = self.slice_lora_a(lora_a)
+            lora_b = self.slice_lora_b(lora_b)
 
         self.lora_a_stacked[index,
                             0, :lora_a.shape[1], :lora_a.shape[0]].copy_(
@@ -674,6 +679,7 @@ class QKVParallelLinearWithLora(ColumnParallelLinearWithLoRA):
                                 lora_b.T, non_blocking=True)
 
     @classmethod
+    @_not_fully_sharded_can_replace
     def can_replace_layer(cls, source_layer: nn.Module,
                           lora_config: LoRAConfig, packed_modules_list: List,
                           model_config: Optional[PretrainedConfig]) -> bool:
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index 4a86c16cf..ab3b99eee 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -8,7 +8,8 @@ from vllm.logger import init_logger
 from vllm.lora.fully_sharded_layers import (
     ColumnParallelLinearWithShardedLoRA,
     MergedColumnParallelLinearWithShardedLoRA,
-    MergedQKVParallelLinearWithShardedLora, RowParallelLinearWithShardedLoRA)
+    MergedQKVParallelLinearWithShardedLora, QKVParallelLinearWithShardedLora,
+    RowParallelLinearWithShardedLoRA)
 # being imported for _all_lora_classes below
 # yapf conflicts with isort for this block
 # yapf: disable
@@ -35,6 +36,7 @@ _all_lora_classes: Set[Type[BaseLayerWithLoRA]] = {
     RowParallelLinearWithLoRA,
     LogitsProcessorWithLoRA,
     ColumnParallelLinearWithShardedLoRA,
+    QKVParallelLinearWithShardedLora,
     MergedColumnParallelLinearWithShardedLoRA,
     MergedQKVParallelLinearWithShardedLora,
     RowParallelLinearWithShardedLoRA,
-- 
GitLab


From d9a252bc8e8a2741d8a2997032a94208fb8f29d9 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 20 Jun 2024 22:12:35 -0700
Subject: [PATCH 120/376] [Core][Distributed] add shm broadcast (#5399)

Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
---
 .buildkite/test-pipeline.yaml                 |   4 +-
 tests/distributed/test_shm_broadcast.py       |  82 ++++++
 .../device_communicators/shm_broadcast.py     | 259 ++++++++++++++++++
 vllm/distributed/parallel_state.py            |  44 ++-
 vllm/envs.py                                  |   5 +
 5 files changed, 384 insertions(+), 10 deletions(-)
 create mode 100644 tests/distributed/test_shm_broadcast.py
 create mode 100644 vllm/distributed/device_communicators/shm_broadcast.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 5e92ba3c2..c337a81d4 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -28,9 +28,11 @@ steps:
 
 - label: Distributed Comm Ops Test
   #mirror_hardwares: [amd]
-  command: pytest -v -s distributed/test_comm_ops.py
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
+  commands:
+  - pytest -v -s distributed/test_comm_ops.py
+  - pytest -v -s distributed/test_shm_broadcast.py
 
 - label: Distributed Tests (2 GPUs)
   mirror_hardwares: [amd]
diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py
new file mode 100644
index 000000000..d92900ffc
--- /dev/null
+++ b/tests/distributed/test_shm_broadcast.py
@@ -0,0 +1,82 @@
+import multiprocessing
+import random
+import time
+
+import torch.distributed as dist
+
+from vllm.distributed.device_communicators.shm_broadcast import (
+    ShmRingBuffer, ShmRingBufferIO)
+from vllm.utils import update_environment_variables
+
+
+def distributed_run(fn, world_size):
+    number_of_processes = world_size
+    processes = []
+    for i in range(number_of_processes):
+        env = {}
+        env['RANK'] = str(i)
+        env['LOCAL_RANK'] = str(i)
+        env['WORLD_SIZE'] = str(number_of_processes)
+        env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
+        env['MASTER_ADDR'] = 'localhost'
+        env['MASTER_PORT'] = '12345'
+        p = multiprocessing.Process(target=fn, args=(env, ))
+        processes.append(p)
+        p.start()
+
+    for p in processes:
+        p.join()
+
+    for p in processes:
+        assert p.exitcode == 0
+
+
+def worker_fn_wrapper(fn):
+    # `multiprocessing.Process` cannot accept environment variables directly
+    # so we need to pass the environment variables as arguments
+    # and update the environment variables in the function
+    def wrapped_fn(env):
+        update_environment_variables(env)
+        dist.init_process_group(backend="gloo")
+        fn()
+
+    return wrapped_fn
+
+
+@worker_fn_wrapper
+def worker_fn():
+    writer_rank = 2
+    broadcaster = ShmRingBufferIO.create_from_process_group(
+        dist.group.WORLD, 1024, 2, writer_rank)
+    if dist.get_rank() == writer_rank:
+        time.sleep(random.random())
+        broadcaster.broadcast_object(0)
+        time.sleep(random.random())
+        broadcaster.broadcast_object({})
+        time.sleep(random.random())
+        broadcaster.broadcast_object([])
+    else:
+        time.sleep(random.random())
+        a = broadcaster.broadcast_object(None)
+        time.sleep(random.random())
+        b = broadcaster.broadcast_object(None)
+        time.sleep(random.random())
+        c = broadcaster.broadcast_object(None)
+        assert a == 0
+        assert b == {}
+        assert c == []
+    dist.barrier()
+
+
+def test_shm_broadcast():
+    distributed_run(worker_fn, 4)
+
+
+def test_singe_process():
+    buffer = ShmRingBuffer(1, 1024, 4)
+    reader = ShmRingBufferIO(buffer, reader_rank=0)
+    writer = ShmRingBufferIO(buffer, reader_rank=-1)
+    writer.enqueue([0])
+    writer.enqueue([1])
+    assert reader.dequeue() == [0]
+    assert reader.dequeue() == [1]
diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
new file mode 100644
index 000000000..119befcf6
--- /dev/null
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -0,0 +1,259 @@
+import pickle
+import time
+from contextlib import contextmanager
+from multiprocessing import shared_memory
+from typing import Optional
+from unittest.mock import patch
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+
+VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL
+
+logger = init_logger(__name__)
+
+
+class ShmRingBuffer:
+
+    def __init__(self,
+                 n_reader: int,
+                 max_chunk_bytes: int,
+                 max_chunks: int,
+                 name: Optional[str] = None):
+        """
+        A shared memory ring buffer implementation for broadcast communication.
+        Essentially, it is a queue where only one will `enqueue` and multiple
+        will `dequeue`. The max size of each item, together with the max number
+        of items that can be stored in the buffer are known in advance.
+        In this case, we don't need to synchronize the access to
+         the buffer.
+        
+        Buffer memory layout:
+                  data                                 metadata
+                    |                                      |
+                    | (current_idx)                        | (current_idx)
+                    v                                      v
+        +-------------------------------+----------------------------------------+
+        | chunk0 | chunk1 | ... | chunk | metadata0 | metadata1 | ... | metadata |
+        +-------------------------------+----------------------------------------+
+        | max_chunks x max_chunk_bytes  | max_chunks x (1 + n_reader) bytes      |
+
+        metadata memory layout: each byte is a flag, the first byte is the written
+        flag, and the rest are reader flags. The flags are set to 0 by default.
+        +--------------+--------------+--------------+-----+--------------+
+        | written_flag | reader0_flag | reader1_flag | ... | readerN_flag |
+        +--------------+--------------+--------------+-----+--------------+
+
+        During creation, `name` is None and the buffer is created. We can pass the
+        created object to other processes by pickling it. The other processes will
+        get the name of the shared memory and open it, so that they can access the
+        same shared memory buffer.
+        """# noqa
+        self.n_reader = n_reader
+        self.metadata_size = 1 + n_reader
+        self.max_chunk_bytes = max_chunk_bytes
+        self.max_chunks = max_chunks
+        self.total_bytes_of_buffer = (self.max_chunk_bytes +
+                                      self.metadata_size) * self.max_chunks
+        self.data_offset = 0
+        self.metadata_offset = self.max_chunk_bytes * self.max_chunks
+
+        if name is None:
+            # we are creating a buffer
+            self.is_creator = True
+            self.shared_memory = shared_memory.SharedMemory(
+                create=True, size=self.total_bytes_of_buffer)
+            # initialize the metadata section to 0
+            with memoryview(self.shared_memory.buf[self.metadata_offset:]
+                            ) as metadata_buffer:
+                torch.frombuffer(metadata_buffer, dtype=torch.uint8).fill_(0)
+        else:
+            # we are opening an existing buffer
+            self.is_creator = False
+            # fix to https://stackoverflow.com/q/62748654/9191338
+            # Python incorrectly tracks shared memory even if it is not
+            # created by the process. The following patch is a workaround.
+            with patch("multiprocessing.resource_tracker.register",
+                       lambda *args, **kwargs: None):
+                self.shared_memory = shared_memory.SharedMemory(name=name)
+            assert self.shared_memory.size == self.total_bytes_of_buffer
+            with memoryview(self.shared_memory.buf[self.metadata_offset:]
+                            ) as metadata_buffer:
+                tensor = torch.frombuffer(metadata_buffer, dtype=torch.uint8)
+                assert torch.all(tensor == 0)
+
+    def __reduce__(self):
+        return (
+            self.__class__,
+            (self.n_reader, self.max_chunk_bytes, self.max_chunks,
+             self.shared_memory.name),
+        )
+
+    def __del__(self):
+        self.shared_memory.close()
+        if self.is_creator:
+            self.shared_memory.unlink()
+
+    @contextmanager
+    def get_data(self, current_idx: int):
+        start = self.data_offset + current_idx * self.max_chunk_bytes
+        end = start + self.max_chunk_bytes
+        with memoryview(self.shared_memory.buf[start:end]) as buf:
+            yield buf
+
+    @contextmanager
+    def get_metadata(self, current_idx: int):
+        start = self.metadata_offset + current_idx * self.metadata_size
+        end = start + self.metadata_size
+        with memoryview(self.shared_memory.buf[start:end]) as buf:
+            yield buf
+
+
+class ShmRingBufferIO:
+
+    def __init__(self, buffer: ShmRingBuffer, reader_rank: int):
+        self.buffer = buffer
+        self.reader_rank = reader_rank
+        self._is_writer = self.reader_rank == -1
+        self._is_reader = not self._is_writer
+        if self._is_reader:
+            assert 0 <= self.reader_rank < buffer.n_reader, \
+                (f"Invalid reader rank {self.reader_rank} for buffer"
+                f" created with {buffer.n_reader} readers")
+        self.current_idx = 0
+
+    @contextmanager
+    def acquire_write(self):
+        assert self._is_writer, "Only writers can acquire write"
+        start_index = self.current_idx
+        start_time = time.time()
+        n_warning = 1
+        while True:
+            with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
+                read_count = sum(metadata_buffer[1:])
+                written_flag = metadata_buffer[0]
+                if written_flag and read_count != self.buffer.n_reader:
+                    # this block is written and not read by all readers
+                    # try to write to the next block
+                    self.current_idx = (self.current_idx +
+                                        1) % self.buffer.max_chunks
+                    if self.current_idx == start_index:
+                        # no empty block found
+                        if time.time(
+                        ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning:  # noqa
+                            logger.warning(
+                                "No available block found in %s second. ",
+                                VLLM_RINGBUFFER_WARNING_INTERVAL)
+                            n_warning += 1
+                        # wait for a while (0.1 us)
+                        time.sleep(1e-7)
+                    continue
+                # found a block that is either
+                # (1) not written
+                # (2) read by all readers
+
+                # mark the block as not written
+                metadata_buffer[0] = 0
+                # let caller write to the buffer
+                with self.buffer.get_data(self.current_idx) as buf:
+                    yield buf
+
+                # caller has written to the buffer
+                # mark the block as written
+                metadata_buffer[0] = 1
+                for i in range(1, self.buffer.n_reader + 1):
+                    # set read flag to 0, meaning it is not read yet
+                    metadata_buffer[i] = 0
+                break
+
+    @contextmanager
+    def acquire_read(self):
+        assert self._is_reader, "Only readers can acquire read"
+        start_index = self.current_idx
+        start_time = time.time()
+        n_warning = 1
+        while True:
+            with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
+                read_flag = metadata_buffer[self.reader_rank + 1]
+                written_flag = metadata_buffer[0]
+                if not written_flag or read_flag:
+                    # this block is either
+                    # (1) not written
+                    # (2) already read by this reader
+                    # try to read the next block
+                    self.current_idx = (self.current_idx +
+                                        1) % self.buffer.max_chunks
+                    if self.current_idx == start_index:
+                        # no block found
+                        if time.time(
+                        ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning:  # noqa
+                            logger.warning(
+                                "No available block found in %s second. ",
+                                VLLM_RINGBUFFER_WARNING_INTERVAL)
+                            n_warning += 1
+                        # wait for a while (0.1 us)
+                        time.sleep(1e-7)
+                    continue
+                # found a block that is not read by this reader
+                # let caller read from the buffer
+                with self.buffer.get_data(self.current_idx) as buf:
+                    yield buf
+
+                # caller has read from the buffer
+                # set the read flag
+                metadata_buffer[self.reader_rank + 1] = 1
+                break
+
+    def enqueue(self, obj):
+        assert self._is_writer, "Only writers can enqueue"
+        serialized_obj = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
+        if len(serialized_obj) > self.buffer.max_chunk_bytes:
+            raise RuntimeError(
+                f"{len(serialized_obj)=} larger than the allowed value "
+                f"{self.buffer.max_chunk_bytes},"
+                "Please increase the max_chunk_bytes parameter.")
+        with self.acquire_write() as buf:
+            buf[:len(serialized_obj)] = serialized_obj
+
+    def dequeue(self):
+        assert self._is_reader, "Only readers can dequeue"
+        with self.acquire_read() as buf:
+            # no need to know the size of serialized object
+            # pickle format itself contains the size information internally
+            # see https://docs.python.org/3/library/pickle.html
+            obj = pickle.loads(buf)
+        return obj
+
+    def broadcast_object(self, obj=None):
+        if self._is_writer:
+            self.enqueue(obj)
+            return obj
+        else:
+            return self.dequeue()
+
+    def create_from_process_group(pg: ProcessGroup,
+                                  max_chunk_bytes,
+                                  max_chunks,
+                                  writer_rank=0) -> "ShmRingBufferIO":
+        group_rank = dist.get_rank(pg)
+        group_world_size = dist.get_world_size(pg)
+        ranks_inside_group = list(range(group_world_size))
+        global_ranks = dist.get_process_group_ranks(pg)
+        n_reader = group_world_size - 1
+        buffer: ShmRingBuffer
+        if group_rank == writer_rank:
+            buffer = ShmRingBuffer(n_reader, max_chunk_bytes, max_chunks)
+            dist.broadcast_object_list([buffer], src=global_ranks[writer_rank])
+            dist.barrier(pg)
+            return ShmRingBufferIO(buffer, -1)
+        else:
+            recv = [None]
+            dist.broadcast_object_list(recv, src=global_ranks[writer_rank])
+            dist.barrier(pg)
+            buffer = recv[0]  # type: ignore
+            rest_ranks = [r for r in ranks_inside_group if r != writer_rank]
+            return ShmRingBufferIO(buffer, rest_ranks.index(group_rank))
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 02b0dcbcb..5188fadbb 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -98,6 +98,7 @@ class GroupCoordinator:
     # communicators are only created for world size > 1
     pynccl_comm: Optional[Any]  # PyNccl communicator
     ca_comm: Optional[Any]  # Custom allreduce communicator
+    shm_broadcaster: Optional[Any]  # shared memory broadcaster
 
     def __init__(
         self,
@@ -162,6 +163,13 @@ class GroupCoordinator:
         else:
             self.ca_comm = None
 
+        from vllm.distributed.device_communicators.shm_broadcast import (
+            ShmRingBufferIO)
+        self.shm_broadcaster: Optional[ShmRingBufferIO] = None
+        if self.world_size > 1 and is_in_the_same_node(self.cpu_group):
+            self.shm_broadcaster = ShmRingBufferIO.create_from_process_group(
+                self.cpu_group, 1 << 20, 6)
+
     @property
     def first_rank(self):
         """Return the global rank of the first process in the group"""
@@ -324,6 +332,30 @@ class GroupCoordinator:
                                     group=self.device_group)
         return input_
 
+    def broadcast_object(self, obj: Optional[Any] = None, src: int = 0):
+        """Broadcast the input object.
+        NOTE: `src` is the local rank of the source rank.
+        """
+        assert src < self.world_size, f"Invalid src rank ({src})"
+
+        # Bypass the function if we are using only 1 GPU.
+        if self.world_size == 1:
+            return obj
+        if self.shm_broadcaster is not None:
+            assert src == 0, "Shared memory broadcaster only supports src=0"
+            return self.shm_broadcaster.broadcast_object(obj)
+        if self.rank_in_group == src:
+            torch.distributed.broadcast_object_list([obj],
+                                                    src=self.ranks[src],
+                                                    group=self.cpu_group)
+            return obj
+        else:
+            recv = [None]
+            torch.distributed.broadcast_object_list(recv,
+                                                    src=self.ranks[src],
+                                                    group=self.cpu_group)
+            return recv[0]
+
     def broadcast_object_list(self,
                               obj_list: List[Any],
                               src: int = 0,
@@ -371,9 +403,7 @@ class GroupCoordinator:
             # `metadata_list` lives in CPU memory.
             # `broadcast_object_list` has serialization & deserialization,
             # all happening on CPU. Therefore, we can use the CPU group.
-            torch.distributed.broadcast_object_list([metadata_list],
-                                                    src=src,
-                                                    group=metadata_group)
+            self.broadcast_object(metadata_list, src=src)
             async_handles = []
             for tensor in tensor_list:
                 if tensor.numel() == 0:
@@ -396,14 +426,10 @@ class GroupCoordinator:
                 async_handle.wait()
 
         else:
-            recv_metadata_list = [None]
-            torch.distributed.broadcast_object_list(recv_metadata_list,
-                                                    src=src,
-                                                    group=metadata_group)
-            assert recv_metadata_list[0] is not None
+            metadata_list = self.broadcast_object(None, src=src)
             tensor_dict = {}
             async_handles = []
-            for key, value in recv_metadata_list[0]:
+            for key, value in metadata_list:
                 if isinstance(value, TensorMetadata):
                     tensor = torch.empty(value.size,
                                          dtype=value.dtype,
diff --git a/vllm/envs.py b/vllm/envs.py
index ae2fcd082..49277e2d3 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -5,6 +5,7 @@ if TYPE_CHECKING:
     VLLM_HOST_IP: str = ""
     VLLM_PORT: Optional[int] = None
     VLLM_USE_MODELSCOPE: bool = False
+    VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60
     VLLM_INSTANCE_ID: Optional[str] = None
     VLLM_NCCL_SO_PATH: Optional[str] = None
     LD_LIBRARY_PATH: Optional[str] = None
@@ -114,6 +115,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     "VLLM_INSTANCE_ID":
     lambda: os.environ.get("VLLM_INSTANCE_ID", None),
 
+    # Interval in seconds to log a warning message when the ring buffer is full
+    "VLLM_RINGBUFFER_WARNING_INTERVAL":
+    lambda: int(os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")),
+
     # path to cudatoolkit home directory, under which should be bin, include,
     # and lib directories.
     "CUDA_HOME":
-- 
GitLab


From bd620b01fb74d5269ca6fc0fd32f66bfb205a358 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Thu, 20 Jun 2024 23:39:40 -0700
Subject: [PATCH 121/376] [Kernel][CPU] Add Quick `gelu` to CPU (#5717)

---
 csrc/cpu/activation.cpp                  | 19 +++++++++++++++++++
 csrc/cpu/torch_bindings.cpp              |  4 ++++
 vllm/_ipex_ops.py                        |  3 +++
 vllm/model_executor/layers/activation.py |  3 +++
 4 files changed, 29 insertions(+)

diff --git a/csrc/cpu/activation.cpp b/csrc/cpu/activation.cpp
index becd2ac42..039b8d5c3 100644
--- a/csrc/cpu/activation.cpp
+++ b/csrc/cpu/activation.cpp
@@ -59,6 +59,13 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8& x) {
   return w3 * x * (ones + t);
 }
 
+FORCE_INLINE vec_op::FP32Vec8 gelu_quick_act(const vec_op::FP32Vec8& x) {
+  const vec_op::FP32Vec8 zeros(0.0);
+  const vec_op::FP32Vec8 ones(1.0);
+  const vec_op::FP32Vec8 w1(1.702f);
+  return x / (ones + (zeros - w1 * x).exp());
+}
+
 FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8& x) {
   const vec_op::FP32Vec8 ones(1.0);
   const vec_op::FP32Vec8 w1(M_SQRT1_2);
@@ -142,3 +149,15 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input) {
     CPU_KERNEL_GUARD_OUT(gelu_fast_impl)
   });
 }
+
+void gelu_quick(torch::Tensor& out, torch::Tensor& input) {
+  int num_tokens = input.numel() / input.size(-1);
+  int d = input.size(-1);
+
+  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_quick_impl", [&] {
+    CPU_KERNEL_GUARD_IN(gelu_quick_impl)
+    activation_kernel<scalar_t, gelu_quick_act, false>(
+        num_tokens, d, input.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
+    CPU_KERNEL_GUARD_OUT(gelu_quick_impl)
+  });
+}
diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
index a2bf0d49a..39e8cf3ed 100644
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -58,6 +58,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def("gelu_fast(Tensor! out, Tensor input) -> ()");
   ops.impl("gelu_fast", torch::kCPU, &gelu_fast);
 
+  // Quick GELU implementation.
+  ops.def("gelu_quick(Tensor! out, Tensor input) -> ()");
+  ops.impl("gelu_quick", torch::kCPU, &gelu_quick);
+
   // Layernorm
   // Apply Root Mean Square (RMS) Normalization to the input tensor.
   ops.def(
diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py
index 1e60e0848..99a875c9b 100644
--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
@@ -43,6 +43,9 @@ class ipex_ops:
     def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
         out.copy_(torch.nn.functional.gelu(x))
 
+    # TODO add implementation of gelu_quick here
+    # def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
+
     def paged_attention_v1(
         out: torch.Tensor,
         query: torch.Tensor,
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 80cad15b4..5bfdba67b 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -155,6 +155,9 @@ class QuickGELU(CustomOp):
         ops.gelu_quick(out, x)
         return out
 
+    # TODO implement forward_xpu for QuickGELU
+    # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+
 
 class ScaledActivation(nn.Module):
     """An activation function with post-scale parameters.
-- 
GitLab


From 5b15bde5399cbcb1052bfb49584f81ed300cd4ac Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Fri, 21 Jun 2024 12:44:29 -0400
Subject: [PATCH 122/376] [Doc] Documentation on supported hardware for
 quantization methods (#5745)

---
 docs/source/index.rst                         |  1 +
 docs/source/quantization/fp8.rst              |  4 ++-
 .../quantization/supported_hardware.rst       | 30 +++++++++++++++++++
 3 files changed, 34 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/quantization/supported_hardware.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 8795a865c..05133eb6d 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -100,6 +100,7 @@ Documentation
    :maxdepth: 1
    :caption: Quantization
 
+   quantization/supported_hardware
    quantization/auto_awq
    quantization/fp8
    quantization/fp8_e5m2_kvcache
diff --git a/docs/source/quantization/fp8.rst b/docs/source/quantization/fp8.rst
index 312a56459..09f313664 100644
--- a/docs/source/quantization/fp8.rst
+++ b/docs/source/quantization/fp8.rst
@@ -3,7 +3,9 @@
 FP8
 ==================
 
-vLLM supports FP8 (8-bit floating point) computation using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. Currently, only Hopper and Ada Lovelace GPUs are supported. Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy.
+vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. 
+Currently, only Hopper and Ada Lovelace GPUs are supported. 
+Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy.
 
 Please visit the HF collection of `quantized FP8 checkpoints of popular LLMs ready to use with vLLM <https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127>`_.
 
diff --git a/docs/source/quantization/supported_hardware.rst b/docs/source/quantization/supported_hardware.rst
new file mode 100644
index 000000000..df445e00a
--- /dev/null
+++ b/docs/source/quantization/supported_hardware.rst
@@ -0,0 +1,30 @@
+.. _supported_hardware_for_quantization:
+
+Supported Hardware for Quantization Kernels
+===========================================
+
+The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
+
+==============  ======  =======  =======  =====  ======  =======  =========  =======  ==============  ==========
+Implementation  Volta   Turing   Ampere   Ada    Hopper  AMD GPU  Intel GPU  x86 CPU  AWS Inferentia  Google TPU
+==============  ======  =======  =======  =====  ======  =======  =========  =======  ==============  ==========
+AQLM            ✅      ✅       ✅       ✅     ✅      ❌        ❌         ❌       ❌              ❌
+AWQ             ❌      ✅       ✅       ✅     ✅      ❌        ❌         ❌       ❌              ❌
+DeepSpeedFP     ✅      ✅       ✅       ✅     ✅      ❌        ❌         ❌       ❌              ❌
+FP8             ❌      ❌       ❌       ✅     ✅      ❌        ❌         ❌       ❌              ❌
+Marlin          ❌      ❌       ✅       ✅     ✅      ❌        ❌         ❌       ❌              ❌
+GPTQ            ✅      ✅       ✅       ✅     ✅      ❌        ❌         ❌       ❌              ❌
+SqueezeLLM      ✅      ✅       ✅       ✅     ✅      ❌        ❌         ❌       ❌              ❌
+bitsandbytes    ✅      ✅       ✅       ✅     ✅      ❌        ❌         ❌       ❌              ❌
+==============  ======  =======  =======  =====  ======  =======  =========  =======  ==============  ==========
+
+Notes:
+^^^^^^
+
+- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
+- "✅" indicates that the quantization method is supported on the specified hardware.
+- "❌" indicates that the quantization method is not supported on the specified hardware.
+
+Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
+
+For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory <https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization>`_ or consult with the vLLM development team.
\ No newline at end of file
-- 
GitLab


From f1e72cc19a21928400b63743d5fe164ec8ed30e8 Mon Sep 17 00:00:00 2001
From: zhyncs <me@zhyncs.com>
Date: Sat, 22 Jun 2024 03:15:48 +0800
Subject: [PATCH 123/376] [BugFix] exclude version 1.15.0 for modelscope
 (#5668)

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 5b3e682a8..d031d98c5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -172,7 +172,7 @@ FROM vllm-base AS vllm-openai
 
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install accelerate hf_transfer modelscope
+    pip install accelerate hf_transfer 'modelscope!=1.15.0'
 
 ENV VLLM_USAGE_SOURCE production-docker-image
 
-- 
GitLab


From 7187507301aa8361407e04be42d0d50680891493 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 21 Jun 2024 14:04:26 -0700
Subject: [PATCH 124/376] [ci][test] fix ca test in main (#5746)

---
 .buildkite/test-pipeline.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index c337a81d4..0b87e6280 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -197,6 +197,9 @@ steps:
   gpu: a100
   num_gpus: 4
   commands: 
+  # FIXIT: find out which code initialize cuda before running the test
+  # before the fix, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   # NOTE: don't test llama model here, it seems hf implementation is buggy
   # see https://github.com/vllm-project/vllm/pull/5689 for details
   - pytest -v -s distributed/test_custom_all_reduce.py
-- 
GitLab


From f5dda63eb5fcb5624b93fa5f09da01d5372bbce4 Mon Sep 17 00:00:00 2001
From: rohithkrn <rohith.nallamaddi@gmail.com>
Date: Fri, 21 Jun 2024 15:42:46 -0700
Subject: [PATCH 125/376] [LoRA] Add support for pinning lora adapters in the
 LRU cache (#5603)

---
 tests/lora/test_lora_manager.py           | 64 +++++++++++++++++++++++
 vllm/engine/llm_engine.py                 |  3 ++
 vllm/executor/cpu_executor.py             |  3 ++
 vllm/executor/distributed_gpu_executor.py |  7 +++
 vllm/executor/executor_base.py            |  4 ++
 vllm/executor/gpu_executor.py             |  4 ++
 vllm/executor/neuron_executor.py          |  3 ++
 vllm/lora/models.py                       | 26 +++++++++
 vllm/lora/worker_manager.py               |  3 ++
 vllm/utils.py                             | 43 +++++++++++++--
 vllm/worker/model_runner.py               |  5 ++
 vllm/worker/worker.py                     |  3 ++
 vllm/worker/worker_base.py                |  8 +++
 13 files changed, 171 insertions(+), 5 deletions(-)

diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index 51a56b121..2133bce14 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -209,6 +209,34 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model):
     assert manager.activate_lora(3)
     assert manager.lora_index_to_id[0] == 2
     assert manager.lora_index_to_id[1] == 3
+    assert manager.pin_lora(2)
+    assert manager.lora_index_to_id[0] == 2
+    assert manager.lora_index_to_id[1] == 3
+    assert manager.activate_lora(1)
+    assert manager.lora_index_to_id[0] == 2
+    assert manager.lora_index_to_id[1] == 1
+    assert manager.deactivate_lora(2)
+    assert manager.lora_index_to_id[0] is None
+    assert manager.lora_index_to_id[1] == 1
+    assert manager.activate_lora(3)
+    assert manager.lora_index_to_id[0] == 3
+    assert manager.lora_index_to_id[1] == 1
+    assert manager.pin_lora(3)
+    assert manager.pin_lora(1)
+    with pytest.raises(RuntimeError):
+        assert manager.pin_lora(2)
+    assert manager.lora_index_to_id[0] == 3
+    assert manager.lora_index_to_id[1] == 1
+    with pytest.raises(RuntimeError):
+        assert manager.activate_lora(2)
+
+    assert manager.deactivate_lora(3)
+    assert manager.pin_lora(2)
+    assert manager.lora_index_to_id[0] == 2
+    assert manager.lora_index_to_id[1] == 1
+    assert manager.remove_lora(3)
+    with pytest.raises(ValueError):
+        assert manager.pin_lora(3)
 
 
 def test_lru_lora_model_manager(dist_init, dummy_model):
@@ -288,6 +316,42 @@ def test_lru_lora_model_manager(dist_init, dummy_model):
     assert set(manager.list_loras()) == set()
     assert all(x is None for x in manager.lora_index_to_id)
 
+    # pinning
+    assert manager.add_lora(model_lora3)
+    assert manager.activate_lora(3)
+    assert manager.add_lora(model_lora4)
+    assert manager.activate_lora(4)
+    assert set(manager.list_loras()) == {3, 4}
+    with pytest.raises(ValueError):
+        assert manager.pin_lora(1)
+    assert manager.pin_lora(3)
+    # Remove manually
+    assert manager.remove_lora(3)
+    assert not manager.remove_lora(3)
+
+    assert set(manager.list_loras()) == {4}
+    assert manager.lora_index_to_id[0] is None
+    assert manager.lora_index_to_id[1] == 4
+
+    assert manager.add_lora(model_lora1)
+    assert manager.pin_lora(1)
+    assert manager.add_lora(model_lora2)
+    assert manager.activate_lora(2)
+
+    assert set(manager.list_loras()) == {1, 2}
+    assert manager.lora_index_to_id[0] == 1
+    assert manager.lora_index_to_id[1] == 2
+
+    assert manager.remove_oldest_lora()
+    assert set(manager.list_loras()) == {1}
+    assert manager.lora_index_to_id[0] == 1
+    assert manager.lora_index_to_id[1] is None
+
+    with pytest.raises(RuntimeError):
+        assert manager.remove_oldest_lora()
+
+    assert set(manager.list_loras()) == {1}
+
 
 def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings,
                                        sql_lora_files):
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 75d417f52..f7eae257f 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1009,6 +1009,9 @@ class LLMEngine:
     def list_loras(self) -> Set[int]:
         return self.model_executor.list_loras()
 
+    def pin_lora(self, lora_id: int) -> bool:
+        return self.model_executor.pin_lora(lora_id)
+
     def check_health(self) -> None:
         self.model_executor.check_health()
 
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index a2212459f..6137cecd8 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -84,6 +84,9 @@ class CPUExecutor(ExecutorBase):
     def remove_lora(self, lora_id: int) -> bool:
         return self.driver_worker.remove_lora(lora_id)
 
+    def pin_lora(self, lora_id: int) -> bool:
+        return self.driver_worker.pin_lora(lora_id)
+
     def list_loras(self) -> Set[int]:
         return self.driver_worker.list_loras()
 
diff --git a/vllm/executor/distributed_gpu_executor.py b/vllm/executor/distributed_gpu_executor.py
index f7c608af1..235b5bc47 100644
--- a/vllm/executor/distributed_gpu_executor.py
+++ b/vllm/executor/distributed_gpu_executor.py
@@ -100,6 +100,13 @@ class DistributedGPUExecutor(GPUExecutor):
             lora_id=lora_id,
         )
 
+    def pin_lora(self, lora_id: int) -> bool:
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return self._run_workers(
+            "pin_lora",
+            lora_id=lora_id,
+        )
+
     def list_loras(self) -> Set[int]:
         return self._run_workers("list_loras")
 
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 4d01939c2..7c2520b5a 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -86,6 +86,10 @@ class ExecutorBase(ABC):
     def remove_lora(self, lora_id: int) -> bool:
         raise NotImplementedError
 
+    @abstractmethod
+    def pin_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError  # type: ignore
+
     @abstractmethod
     def list_loras(self) -> Set[int]:
         raise NotImplementedError
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index 3ad201f47..0a654200e 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -99,6 +99,10 @@ class GPUExecutor(ExecutorBase):
         assert lora_id > 0, "lora_id must be greater than 0."
         return self.driver_worker.remove_lora(lora_id)
 
+    def pin_lora(self, lora_id: int) -> bool:
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return self.driver_worker.pin_lora(lora_id)
+
     def list_loras(self) -> Set[int]:
         return self.driver_worker.list_loras()
 
diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
index e7f0e8879..c5e2fb0f6 100644
--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -65,6 +65,9 @@ class NeuronExecutor(ExecutorBase):
     def remove_lora(self, lora_id: int) -> bool:
         return self.driver_worker.remove_lora(lora_id)
 
+    def pin_lora(self, lora_id: int) -> bool:
+        return self.driver_worker.pin_lora(lora_id)
+
     def list_loras(self) -> Set[int]:
         return self.driver_worker.list_loras()
 
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 3e8285686..afb9ba455 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -525,6 +525,12 @@ class LoRAModelManager:
             self.long_lora_context.offsets_by_lora_id.pop(lora_id, None)
         return bool(self._registered_loras.pop(lora_id, None))
 
+    def pin_lora(self, lora_id: int) -> bool:
+        """Pin a LoRAModel in the manager cache."""
+        raise NotImplementedError(
+            "Pinning is not supported in LoRAModelManager."
+            "Use LRUCacheLoRAModelManager for pinning")  # type: ignore
+
     # TODO see if this can be vectorized
     def _set_lora_mapping(self, mapping: LoRAMapping) -> None:
         (base_indices, sampler_indices, sampler_indices_padded,
@@ -777,6 +783,26 @@ class LRUCacheLoRAModelManager(LoRAModelManager):
             return True
         return False
 
+    def pin_lora(self, lora_id: int) -> bool:
+        """Pin a LoRAModel in the manager cache."""
+        self._pin_lora_in_cpu_cache(lora_id)
+        self._pin_lora_in_gpu_cache(lora_id)
+        return True
+
+    def _pin_lora_in_cpu_cache(self, lora_id: int):
+        try:
+            self._registered_loras.pin(lora_id)
+        except ValueError as err:
+            raise ValueError("Pinning failed. "
+                             f"LoRA {lora_id} is not registered.") from err
+
+    def _pin_lora_in_gpu_cache(self, lora_id: int):
+        if lora_id not in self._active_loras:
+            # move lora to gpu if not already active
+            self.activate_lora(lora_id)
+
+        self._active_loras.pin(lora_id)
+
 
 def create_lora_manager(
         model: nn.Module,
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 498b2b9dd..ca4903c23 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -221,6 +221,9 @@ class WorkerLoRAManager(AbstractWorkerLoRAManager):
     def remove_lora(self, lora_id: int) -> bool:
         return self._lora_manager.remove_lora(lora_id)
 
+    def pin_lora(self, lora_id: int) -> bool:
+        return self._lora_manager.pin_lora(lora_id)
+
     def remove_all_loras(self):
         self._lora_manager.remove_all_loras()
 
diff --git a/vllm/utils.py b/vllm/utils.py
index 27a7b1042..ce5c377ef 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -15,7 +15,7 @@ from collections import defaultdict
 from functools import lru_cache, partial, wraps
 from platform import uname
 from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic,
-                    Hashable, List, Optional, OrderedDict, Tuple, TypeVar,
+                    Hashable, List, Optional, OrderedDict, Set, Tuple, TypeVar,
                     Union)
 
 import numpy as np
@@ -44,6 +44,13 @@ K = TypeVar("K")
 T = TypeVar("T")
 
 
+class _Sentinel:
+    ...
+
+
+ALL_PINNED_SENTINEL = _Sentinel()
+
+
 class Device(enum.Enum):
     GPU = enum.auto()
     CPU = enum.auto()
@@ -67,6 +74,7 @@ class LRUCache(Generic[T]):
 
     def __init__(self, capacity: int):
         self.cache: OrderedDict[Hashable, T] = OrderedDict()
+        self.pinned_items: Set[Hashable] = set()
         self.capacity = capacity
 
     def __contains__(self, key: Hashable) -> bool:
@@ -102,14 +110,36 @@ class LRUCache(Generic[T]):
         self.cache.move_to_end(key)
         self._remove_old_if_needed()
 
+    def pin(self, key: Hashable) -> None:
+        """
+        Pins a key in the cache preventing it from being
+        evicted in the LRU order.
+        """
+        if key not in self.cache:
+            raise ValueError(f"Cannot pin key: {key} not in cache.")
+        self.pinned_items.add(key)
+
+    def _unpin(self, key: Hashable) -> None:
+        self.pinned_items.remove(key)
+
     def _on_remove(self, key: Hashable, value: Optional[T]):
         pass
 
-    def remove_oldest(self):
+    def remove_oldest(self, remove_pinned=False):
         if not self.cache:
             return
-        key, value = self.cache.popitem(last=False)
-        self._on_remove(key, value)
+
+        if not remove_pinned:
+            # pop the oldest item in the cache that is not pinned
+            lru_key = next(
+                (key for key in self.cache if key not in self.pinned_items),
+                ALL_PINNED_SENTINEL)
+            if lru_key is ALL_PINNED_SENTINEL:
+                raise RuntimeError("All items are pinned, "
+                                   "cannot remove oldest from the cache.")
+        else:
+            lru_key = next(iter(self.cache))
+        self.pop(lru_key)
 
     def _remove_old_if_needed(self) -> None:
         while len(self.cache) > self.capacity:
@@ -120,13 +150,16 @@ class LRUCache(Generic[T]):
             default_value: Optional[T] = None) -> Optional[T]:
         run_on_remove = key in self.cache
         value: Optional[T] = self.cache.pop(key, default_value)
+        # remove from pinned items
+        if key in self.pinned_items:
+            self._unpin(key)
         if run_on_remove:
             self._on_remove(key, value)
         return value
 
     def clear(self):
         while len(self.cache) > 0:
-            self.remove_oldest()
+            self.remove_oldest(remove_pinned=True)
         self.cache.clear()
 
 
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index e24835a1e..a321eafce 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -878,6 +878,11 @@ class ModelRunner:
             raise RuntimeError("LoRA is not enabled.")
         return self.lora_manager.remove_lora(lora_id)
 
+    def pin_lora(self, lora_id: int) -> bool:
+        if not self.lora_manager:
+            raise RuntimeError("LoRA is not enabled.")
+        return self.lora_manager.pin_lora(lora_id)
+
     def list_loras(self) -> Set[int]:
         if not self.lora_manager:
             raise RuntimeError("LoRA is not enabled.")
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index e334ffbb7..c60764ef1 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -333,6 +333,9 @@ class Worker(WorkerBase):
     def remove_lora(self, lora_id: int) -> bool:
         return self.model_runner.remove_lora(lora_id)
 
+    def pin_lora(self, lora_id: int) -> bool:
+        return self.model_runner.pin_lora(lora_id)
+
     def list_loras(self) -> Set[int]:
         return self.model_runner.list_loras()
 
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 3d52fd71e..dc09718de 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -70,6 +70,10 @@ class WorkerBase(ABC):
     def remove_lora(self, lora_id: int) -> bool:
         raise NotImplementedError
 
+    @abstractmethod
+    def pin_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError
+
     @abstractmethod
     def list_loras(self) -> Set[int]:
         raise NotImplementedError
@@ -86,6 +90,10 @@ class LoraNotSupportedWorkerBase(WorkerBase):
     def remove_lora(self, lora_id: int) -> bool:
         raise ValueError(f"{type(self)} does not support LoRA")
 
+    def pin_lora(self, lora_id: int) -> bool:
+        return ValueError(
+            f"{type(self)} does not support LoRA")  # type: ignore
+
     def list_loras(self) -> Set[int]:
         raise ValueError(f"{type(self)} does not support LoRA")
 
-- 
GitLab


From cf90ae01237018f70573f69c599d26648ff7740b Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Sat, 22 Jun 2024 08:09:34 +0800
Subject: [PATCH 126/376] [CI][Hardware][Intel GPU] add Intel GPU(XPU) ci
 pipeline (#5616)

---
 .buildkite/test-template-aws.j2 | 10 ++++++++--
 README.md                       |  2 +-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2
index fb34b787e..1a7fb44c2 100644
--- a/.buildkite/test-template-aws.j2
+++ b/.buildkite/test-template-aws.j2
@@ -42,12 +42,18 @@ steps:
     command: bash .buildkite/run-neuron-test.sh
     soft_fail: false
 
-  - label: "Intel Test"
+  - label: "Intel CPU Test"
     depends_on: ~
     agents:
-      queue: intel
+      queue: intel-cpu
     command: bash .buildkite/run-cpu-test.sh
 
+  - label: "Intel GPU Test"
+    depends_on: ~
+    agents:
+      queue: intel-gpu
+    command: bash .buildkite/run-xpu-test.sh
+
   {% for step in steps %}
   {% if step.gpu == "a100" %}
   - label: "{{ step.label }}"
diff --git a/README.md b/README.md
index c24768bf7..3e0da945d 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,7 @@ vLLM is flexible and easy to use with:
 - Tensor parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support NVIDIA GPUs, AMD GPUs, and Intel CPUs
+- Support NVIDIA GPUs, AMD GPUs, Intel CPUs and GPUs
 - (Experimental) Prefix caching support
 - (Experimental) Multi-lora support
 
-- 
GitLab


From 9c62db07ed8ee28d9f1a0e6ac215446d49532008 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= <jiefu@tencent.com>
Date: Sat, 22 Jun 2024 10:07:08 +0800
Subject: [PATCH 127/376] [Model] Support Qwen-VL and Qwen-VL-Chat models with
 text-only inputs (#5710)

Co-authored-by: Roger Wang <ywang@roblox.com>
---
 vllm/model_executor/models/qwen.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index d22ea6b79..b6ea6ab39 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -28,6 +28,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import SamplerOutput
+from vllm.utils import print_warning_once
 
 
 class QWenMLP(nn.Module):
@@ -288,6 +289,15 @@ class QWenLMHeadModel(nn.Module):
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
+                # Skip loading visual weights to support Qwen-VL models
+                # in cases with text-only inputs
+                # TODO: add support for Qwen-VL
+                if (name not in params_dict
+                        and name.startswith("transformer.visual.")):
+                    print_warning_once(
+                        "Only text inputs are allowed. Images won't be handled "
+                        "until Qwen-VL models are fully supported.")
+                    continue
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
-- 
GitLab


From ff9ddbceee63efba6ba1f8d4dc66a92f1191da04 Mon Sep 17 00:00:00 2001
From: zifeitong <zifeitong@gmail.com>
Date: Fri, 21 Jun 2024 20:33:12 -0700
Subject: [PATCH 128/376] [Misc] Remove #4789 workaround left in
 vllm/entrypoints/openai/run_batch.py (#5756)

---
 vllm/entrypoints/openai/run_batch.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 488ac8971..dac6c2b4c 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -1,5 +1,4 @@
 import asyncio
-import sys
 from io import StringIO
 from typing import Awaitable, List
 
@@ -137,9 +136,6 @@ async def main(args):
     output_buffer.seek(0)
     await write_file(args.output_file, output_buffer.read().strip())
 
-    # Temporary workaround for https://github.com/vllm-project/vllm/issues/4789
-    sys.exit(0)
-
 
 if __name__ == "__main__":
     args = parse_args()
-- 
GitLab


From 0cbc1d2b4ff9e3afa32ffd2d5d308c136c2d15e3 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Fri, 21 Jun 2024 22:25:14 -0700
Subject: [PATCH 129/376] [Bugfix] Fix pin_lora error in TPU executor (#5760)

---
 vllm/executor/tpu_executor.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py
index 7061ad85f..5ed00e137 100644
--- a/vllm/executor/tpu_executor.py
+++ b/vllm/executor/tpu_executor.py
@@ -82,6 +82,9 @@ class TPUExecutor(ExecutorBase):
     def remove_lora(self, lora_id: int) -> bool:
         raise NotImplementedError("LoRA is not implemented for TPU backend.")
 
+    def pin_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError("LoRA is not implemented for TPU backend.")
+
     def list_loras(self) -> Set[int]:
         raise NotImplementedError("LoRA is not implemented for TPU backend.")
 
-- 
GitLab


From 8c00f9c15d13aed34b129b31c32a227be230e218 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Fri, 21 Jun 2024 23:09:40 -0700
Subject: [PATCH 130/376] [Docs][TPU] Add installation tip for TPU (#5761)

---
 .../getting_started/tpu-installation.rst       | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst
index 3627600e1..e96aabbb6 100644
--- a/docs/source/getting_started/tpu-installation.rst
+++ b/docs/source/getting_started/tpu-installation.rst
@@ -73,3 +73,21 @@ Next, build vLLM from source. This will only take a few seconds:
 .. code-block:: console
 
     $ VLLM_TARGET_DEVICE="tpu" python setup.py develop
+
+
+.. tip::
+
+    If you encounter the following error:
+
+    .. code-block:: console
+
+        from torch._C import *  # noqa: F403
+        ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory
+
+
+    You can install OpenBLAS with the following command:
+
+    .. code-block:: console
+
+        $ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
+
-- 
GitLab


From 832ea88fcb4819037b685fb47b3a0de37f2804d3 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sat, 22 Jun 2024 10:00:43 -0700
Subject: [PATCH 131/376] [core][distributed] improve shared memory broadcast
 (#5754)

---
 .../device_communicators/shm_broadcast.py     | 42 ++++++++++++++-----
 1 file changed, 32 insertions(+), 10 deletions(-)

diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index 119befcf6..c44bd2f11 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -48,6 +48,26 @@ class ShmRingBuffer:
         | written_flag | reader0_flag | reader1_flag | ... | readerN_flag |
         +--------------+--------------+--------------+-----+--------------+
 
+        The state of metadata is as follows:
+
+        (case 1) 0???...???: the block is not written yet, cannot read, can write
+        (case 2) 1000...000: the block is just written, can read, cannot write
+        (case 3) 1???...???: the block is written and read by some readers, can read if not read, cannot write
+        (case 4) 1111...111: the block is written and read by all readers, cannot read, can write
+
+        State transition for readers:
+
+        When a reader finds a block that it can read (case 2 or 3), it can yield the block for caller to read.
+        Only after the caller finishes reading the block, the reader can mark the block as read.
+        Readers only mark the block as read (from 0 to 1), the writer marks the block as ready to read (from 1 to 0).
+
+        State transition for writer:
+
+        When the writer writes to a block (case 1 or 4), it first resets the written flag to 0, converting either case
+        to case 1. Then it can yield the block for caller to write. After the caller finishes writing the block, the writer
+        can reset the reader flags to 0, and mark the block as written (from 0 to 1).
+        NOTE: the order is important here, first reset the reader flags (so that we are still in case 1), then mark the block as written. The state transition is atomic. If we do it in the reverse order, it will go through case 3 and then back to case 2, and readers might read the intermediate case 3, which is not correct.
+
         During creation, `name` is None and the buffer is created. We can pass the
         created object to other processes by pickling it. The other processes will
         get the name of the shared memory and open it, so that they can access the
@@ -81,10 +101,6 @@ class ShmRingBuffer:
                        lambda *args, **kwargs: None):
                 self.shared_memory = shared_memory.SharedMemory(name=name)
             assert self.shared_memory.size == self.total_bytes_of_buffer
-            with memoryview(self.shared_memory.buf[self.metadata_offset:]
-                            ) as metadata_buffer:
-                tensor = torch.frombuffer(metadata_buffer, dtype=torch.uint8)
-                assert torch.all(tensor == 0)
 
     def __reduce__(self):
         return (
@@ -163,11 +179,15 @@ class ShmRingBufferIO:
                     yield buf
 
                 # caller has written to the buffer
-                # mark the block as written
-                metadata_buffer[0] = 1
+                # NOTE: order is important here
+                # first set the read flags to 0
+                # then set the written flag to 1
+                # otherwise, the readers may think they already read the block
                 for i in range(1, self.buffer.n_reader + 1):
                     # set read flag to 0, meaning it is not read yet
                     metadata_buffer[i] = 0
+                # mark the block as written
+                metadata_buffer[0] = 1
                 break
 
     @contextmanager
@@ -247,13 +267,15 @@ class ShmRingBufferIO:
         buffer: ShmRingBuffer
         if group_rank == writer_rank:
             buffer = ShmRingBuffer(n_reader, max_chunk_bytes, max_chunks)
-            dist.broadcast_object_list([buffer], src=global_ranks[writer_rank])
-            dist.barrier(pg)
+            dist.broadcast_object_list([buffer],
+                                       src=global_ranks[writer_rank],
+                                       group=pg)
             return ShmRingBufferIO(buffer, -1)
         else:
             recv = [None]
-            dist.broadcast_object_list(recv, src=global_ranks[writer_rank])
-            dist.barrier(pg)
+            dist.broadcast_object_list(recv,
+                                       src=global_ranks[writer_rank],
+                                       group=pg)
             buffer = recv[0]  # type: ignore
             rest_ranks = [r for r in ranks_inside_group if r != writer_rank]
             return ShmRingBufferIO(buffer, rest_ranks.index(group_rank))
-- 
GitLab


From 6c916ac8a80d1b2f4e0d0113a67767dc254a3598 Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Date: Mon, 24 Jun 2024 02:37:11 +0530
Subject: [PATCH 132/376] [BugFix] [Kernel] Add Cutlass2x fallback kernels
 (#5744)

Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
---
 csrc/quantization/cutlass_w8a8/common.hpp     |  8 +++
 .../cutlass_w8a8/scaled_mm_c2x.cu             | 52 ++++++++++++++++---
 2 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/csrc/quantization/cutlass_w8a8/common.hpp b/csrc/quantization/cutlass_w8a8/common.hpp
index 23d0587bb..bf04bb400 100644
--- a/csrc/quantization/cutlass_w8a8/common.hpp
+++ b/csrc/quantization/cutlass_w8a8/common.hpp
@@ -17,3 +17,11 @@ inline uint32_t next_pow_2(uint32_t const num) {
   return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
 }
 
+inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
+  int max_shared_mem_per_block_opt_in = 0;
+  cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
+                        cudaDevAttrMaxSharedMemoryPerBlockOptin,
+                        device);
+  return max_shared_mem_per_block_opt_in;
+}
+
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
index 740b9fb64..38a20a172 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
@@ -250,12 +250,39 @@ void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
   CUTLASS_CHECK(status);
 }
 
+template <typename Gemm, typename FallbackGemm, typename... EpilogueArgs>
+void fallback_cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
+                                  torch::Tensor const& b,
+                                  EpilogueArgs&&... args) {
+  // In some cases, the GPU isn't able to accommodate the
+  // shared memory requirements of the Gemm. In such cases, use
+  // the FallbackGemm instead.
+  static const int max_shared_mem_per_block_opt_in =
+      get_cuda_max_shared_memory_per_block_opt_in(0);
+
+  size_t const gemm_shared_mem_size =
+      sizeof(typename Gemm::KernelType::SharedStorage);
+  size_t const fallback_gemm_shared_mem_size =
+      sizeof(typename FallbackGemm::KernelType::SharedStorage);
+
+  if (gemm_shared_mem_size <= max_shared_mem_per_block_opt_in) {
+    return cutlass_gemm_caller<Gemm>(out, a, b,
+                                     std::forward<EpilogueArgs>(args)...);
+  } else {
+    TORCH_CHECK(fallback_gemm_shared_mem_size <=
+                max_shared_mem_per_block_opt_in);
+    return cutlass_gemm_caller<FallbackGemm>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  }
+}
+
 template <typename InType, typename OutType,
           template <typename, typename> typename Epilogue>
 struct sm80_config_default {
   // This config is used in 2 cases,
   //  - M in (128, inf)
   //  - M in (64, 128] and N >= 8192
+  // Shared Memory required by this Gemm - 81920 bytes
   static_assert(std::is_same<InType, int8_t>());
   using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
   using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
@@ -271,6 +298,7 @@ struct sm80_config_M64 {
   // This config is used in 2 cases,
   // - M in (32, 64]
   // - M in (64, 128] and N < 8192
+  // Shared Memory required by this Gemm - 122880 bytes
   static_assert(std::is_same<InType, int8_t>());
   using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>;
   using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
@@ -284,6 +312,7 @@ template <typename InType, typename OutType,
           template <typename, typename> typename Epilogue>
 struct sm80_config_M32 {
   // M in (16, 32]
+  // Shared Memory required by this Gemm - 61440 bytes
   static_assert(std::is_same<InType, int8_t>());
   using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>;
   using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>;
@@ -297,6 +326,7 @@ template <typename InType, typename OutType,
           template <typename, typename> typename Epilogue>
 struct sm80_config_M16 {
   // M in [1, 16]
+  // Shared Memory required by this Gemm - 51200 bytes
   static_assert(std::is_same<InType, int8_t>());
   using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>;
   using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>;
@@ -331,35 +361,45 @@ void cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a,
   using Cutlass2xGemmM16 =
       typename sm80_config_M16<InType, OutType, Epilogue>::Cutlass2xGemm;
 
+  // Due to shared memory requirements, some Gemms may fail to run on some
+  // GPUs. As the name indicates, the Fallback Gemm is used as an alternative
+  // in such cases.
+  // sm80_config_M16 has the least shared-memory requirement. However,
+  // based on some profiling, we select sm80_config_M32 as a better alternative
+  // performance wise.
+  using FallbackGemm =
+      typename sm80_config_M32<InType, OutType, Epilogue>::Cutlass2xGemm;
+
   uint32_t const m = a.size(0);
   uint32_t const mp2 =
       std::max(static_cast<uint32_t>(16), next_pow_2(m));  // next power of 2
   if (mp2 <= 16) {
     // M in [1, 16]
-    return cutlass_gemm_caller<Cutlass2xGemmM16>(
+    return fallback_cutlass_gemm_caller<Cutlass2xGemmM16, FallbackGemm>(
         out, a, b, std::forward<EpilogueArgs>(args)...);
   } else if (mp2 <= 32) {
     // M in (16, 32]
-    return cutlass_gemm_caller<Cutlass2xGemmM32>(
+    return fallback_cutlass_gemm_caller<Cutlass2xGemmM32, FallbackGemm>(
         out, a, b, std::forward<EpilogueArgs>(args)...);
   } else if (mp2 <= 64) {
     // M in (32, 64]
-    return cutlass_gemm_caller<Cutlass2xGemmM64>(
+    return fallback_cutlass_gemm_caller<Cutlass2xGemmM64, FallbackGemm>(
         out, a, b, std::forward<EpilogueArgs>(args)...);
   } else if (mp2 <= 128) {
     // M in (64, 128]
     uint32_t const n = out.size(1);
     bool const small_n = n < 8192;
     if (small_n) {
-      return cutlass_gemm_caller<Cutlass2xGemmM128SmallN>(
+      return fallback_cutlass_gemm_caller<Cutlass2xGemmM128SmallN,
+                                          FallbackGemm>(
           out, a, b, std::forward<EpilogueArgs>(args)...);
     } else {
-      return cutlass_gemm_caller<Cutlass2xGemmM128BigN>(
+      return fallback_cutlass_gemm_caller<Cutlass2xGemmM128BigN, FallbackGemm>(
           out, a, b, std::forward<EpilogueArgs>(args)...);
     }
   } else {
     // M in (128, inf)
-    return cutlass_gemm_caller<Cutlass2xGemmDefault>(
+    return fallback_cutlass_gemm_caller<Cutlass2xGemmDefault, FallbackGemm>(
         out, a, b, std::forward<EpilogueArgs>(args)...);
   }
 }
-- 
GitLab


From 5d4d90536fa24c032bb91ae629b7b4958e045b03 Mon Sep 17 00:00:00 2001
From: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com>
Date: Sun, 23 Jun 2024 17:42:28 -0400
Subject: [PATCH 133/376] [Distributed] Add send and recv helpers (#5719)

---
 tests/distributed/test_comm_ops.py            |  78 +++++++-
 tests/distributed/test_custom_all_reduce.py   |   5 +-
 tests/distributed/test_pynccl.py              |  16 +-
 tests/utils.py                                |   2 +-
 .../device_communicators/pynccl.py            |  14 +-
 vllm/distributed/parallel_state.py            | 187 ++++++++++++++++++
 6 files changed, 278 insertions(+), 24 deletions(-)

diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py
index 53654dc40..bf0f31df0 100644
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -8,12 +8,11 @@ import pytest
 import ray
 import torch
 
-from vllm.distributed import (broadcast_tensor_dict,
+from vllm.distributed import (broadcast_tensor_dict, get_pp_group,
                               tensor_model_parallel_all_gather,
                               tensor_model_parallel_all_reduce)
 
-from ..utils import (init_test_distributed_environment,
-                     multi_process_tensor_parallel)
+from ..utils import init_test_distributed_environment, multi_process_parallel
 
 
 @ray.remote(num_gpus=1, max_calls=1)
@@ -105,6 +104,68 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
         assert torch.allclose(recv_dict["f"], test_dict["f"])
 
 
+@ray.remote(num_gpus=1, max_calls=1)
+def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
+                                      distributed_init_port: str):
+    del os.environ["CUDA_VISIBLE_DEVICES"]
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+    init_test_distributed_environment(tp_size, pp_size, rank,
+                                      distributed_init_port)
+
+    test_dict = {
+        # device tensor
+        "a": torch.arange(8, dtype=torch.float32, device="cuda"),
+        # CPU tensor
+        "b": torch.arange(16, dtype=torch.int8, device="cpu"),
+        "c": "test",
+        "d": [1, 2, 3],
+        "e": {
+            "a": 1,
+            "b": 2
+        },
+        # empty tensor
+        "f": torch.tensor([], dtype=torch.float32, device="cuda"),
+    }
+
+    if not get_pp_group().is_first_rank:
+        recv_dict = get_pp_group().recv_tensor_dict()
+
+    if not get_pp_group().is_last_rank:
+        get_pp_group().send_tensor_dict(test_dict)
+
+    if not get_pp_group().is_first_rank:
+        assert len(recv_dict) == len(test_dict)
+        assert torch.allclose(recv_dict["a"], test_dict["a"])
+        assert torch.allclose(recv_dict["b"], test_dict["b"])
+        assert recv_dict["c"] == test_dict["c"]
+        assert recv_dict["d"] == test_dict["d"]
+        assert recv_dict["e"] == test_dict["e"]
+        assert torch.allclose(recv_dict["f"], test_dict["f"])
+
+
+@ray.remote(num_gpus=1, max_calls=1)
+def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
+                          distributed_init_port: str):
+    del os.environ["CUDA_VISIBLE_DEVICES"]
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+    init_test_distributed_environment(tp_size, pp_size, rank,
+                                      distributed_init_port)
+
+    size = 64
+    test_tensor = torch.arange(64, dtype=torch.float32, device="cuda")
+
+    if not get_pp_group().is_first_rank:
+        recv_tensor = get_pp_group().recv(size, dtype=torch.float32)
+
+    if not get_pp_group().is_last_rank:
+        get_pp_group().send(test_tensor)
+
+    if not get_pp_group().is_first_rank:
+        assert torch.allclose(test_tensor, recv_tensor)
+
+
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason="Need at least 2 GPUs to run the test.")
 @pytest.mark.parametrize("tp_size", [2])
@@ -113,4 +174,13 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
     broadcast_tensor_dict_test_worker
 ])
 def test_multi_process_tensor_parallel(tp_size, test_target):
-    multi_process_tensor_parallel(tp_size, 1, test_target)
+    multi_process_parallel(tp_size, 1, test_target)
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.parametrize("pp_size", [2])
+@pytest.mark.parametrize(
+    "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
+def test_multi_process_pipeline_parallel(pp_size, test_target):
+    multi_process_parallel(1, pp_size, test_target)
diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py
index 9a39160b8..3c281a45f 100644
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -12,8 +12,7 @@ from vllm.distributed.parallel_state import (get_tensor_model_parallel_group,
                                              get_tp_group, graph_capture)
 
 from ..utils import (ensure_model_parallel_initialized,
-                     init_test_distributed_environment,
-                     multi_process_tensor_parallel)
+                     init_test_distributed_environment, multi_process_parallel)
 
 random.seed(42)
 test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)]
@@ -113,4 +112,4 @@ def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target):
     world_size = tp_size * pipeline_parallel_size
     if world_size > torch.cuda.device_count():
         pytest.skip("Not enough GPUs to run the test.")
-    multi_process_tensor_parallel(tp_size, pipeline_parallel_size, test_target)
+    multi_process_parallel(tp_size, pipeline_parallel_size, test_target)
diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index 964dbc542..e0e424439 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -168,9 +168,13 @@ def send_recv_worker_fn():
                              dtype=torch.float32).cuda(pynccl_comm.rank)
     with pynccl_comm.change_state(enable=True):
         if pynccl_comm.rank == 0:
-            pynccl_comm.send(tensor)
+            pynccl_comm.send(tensor,
+                             dst=(pynccl_comm.rank + 1) %
+                             pynccl_comm.world_size)
         else:
-            pynccl_comm.recv(tensor)
+            pynccl_comm.recv(tensor,
+                             src=(pynccl_comm.rank - 1) %
+                             pynccl_comm.world_size)
     result = tensor.mean().cpu().item()
     assert result == 1
 
@@ -203,9 +207,13 @@ def multiple_send_recv_worker_fn():
                              device=device)
     with pynccl_comm.change_state(enable=True):
         if torch.distributed.get_rank() in [0, 1]:
-            pynccl_comm.send(tensor)
+            pynccl_comm.send(tensor,
+                             dst=(pynccl_comm.rank + 1) %
+                             pynccl_comm.world_size)
         else:
-            pynccl_comm.recv(tensor)
+            pynccl_comm.recv(tensor,
+                             src=(pynccl_comm.rank - 1) %
+                             pynccl_comm.world_size)
     result = tensor.mean().cpu().item()
     if torch.distributed.get_rank() in [0, 2]:
         assert result == 1
diff --git a/tests/utils.py b/tests/utils.py
index bc30515c8..174efca4a 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -129,7 +129,7 @@ def init_test_distributed_environment(
     ensure_model_parallel_initialized(tp_size, pp_size)
 
 
-def multi_process_tensor_parallel(
+def multi_process_parallel(
     tp_size: int,
     pp_size: int,
     test_target,
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 83eec264b..731956654 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -121,10 +121,7 @@ class PyNcclCommunicator:
                                 ncclRedOpTypeEnum.from_torch(op), self.comm,
                                 cudaStream_t(stream.cuda_stream))
 
-    def send(self,
-             tensor: torch.Tensor,
-             dst: Optional[int] = None,
-             stream=None):
+    def send(self, tensor: torch.Tensor, dst: int, stream=None):
         if self.disabled:
             return
         assert tensor.device == self.device, (
@@ -132,16 +129,11 @@ class PyNcclCommunicator:
             f"but the input tensor is on {tensor.device}")
         if stream is None:
             stream = self.stream
-        if dst is None:
-            dst = (self.rank + 1) % self.world_size
         self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(),
                            ncclDataTypeEnum.from_torch(tensor.dtype), dst,
                            self.comm, cudaStream_t(stream.cuda_stream))
 
-    def recv(self,
-             tensor: torch.Tensor,
-             src: Optional[int] = None,
-             stream=None):
+    def recv(self, tensor: torch.Tensor, src: int, stream=None):
         if self.disabled:
             return
         assert tensor.device == self.device, (
@@ -149,8 +141,6 @@ class PyNcclCommunicator:
             f"but the input tensor is on {tensor.device}")
         if stream is None:
             stream = self.stream
-        if src is None:
-            src = (self.rank - 1) % self.world_size
         self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(),
                            ncclDataTypeEnum.from_torch(tensor.dtype), src,
                            self.comm, cudaStream_t(stream.cuda_stream))
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 5188fadbb..5f1decb37 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -20,6 +20,7 @@ If you only need to use the distributed environment without model/pipeline
  steps.
 """
 import contextlib
+import pickle
 from collections import namedtuple
 from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass
@@ -28,6 +29,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 from unittest.mock import patch
 
 import torch
+import torch.distributed
 from torch.distributed import Backend, ProcessGroup
 
 import vllm.envs as envs
@@ -180,6 +182,16 @@ class GroupCoordinator:
         """Return the global rank of the last process in the group"""
         return self.ranks[-1]
 
+    @property
+    def is_first_rank(self):
+        """Return whether the caller is the first process in the group"""
+        return self.rank == self.first_rank
+
+    @property
+    def is_last_rank(self):
+        """Return whether the caller is the last process in the group"""
+        return self.rank == self.last_rank
+
     @property
     def next_rank(self):
         """Return the global rank of the process that follows the caller"""
@@ -374,6 +386,70 @@ class GroupCoordinator:
                                                 group=self.device_group)
         return obj_list
 
+    def send_object(self, obj: Any, dst: int) -> None:
+        """Send the input object list to the destination rank."""
+        """NOTE: `dst` is the local rank of the destination rank."""
+
+        assert dst < self.world_size, f"Invalid dst rank ({dst})"
+
+        assert dst != self.rank, (
+            "Invalid destination rank. Destination rank is the same "
+            "as the current rank.")
+
+        # Serialize object to tensor and get the size as well
+        object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8)
+
+        size_tensor = torch.tensor([object_tensor.numel()],
+                                   dtype=torch.long,
+                                   device="cpu")
+
+        # Send object size
+
+        torch.distributed.send(size_tensor,
+                               dst=self.ranks[dst],
+                               group=self.cpu_group)
+
+        # Send object
+        torch.distributed.send(object_tensor,
+                               dst=self.ranks[dst],
+                               group=self.cpu_group)
+
+        return None
+
+    def recv_object(self, src: int) -> Any:
+        """Receive the input object list from the source rank."""
+        """NOTE: `src` is the local rank of the source rank."""
+
+        assert src < self.world_size, f"Invalid src rank ({src})"
+
+        assert src != self.rank, (
+            "Invalid source rank. Source rank is the same as the current rank."
+        )
+
+        size_tensor = torch.empty(1, dtype=torch.long, device="cpu")
+
+        # Receive object size
+        rank_size = torch.distributed.recv(size_tensor,
+                                           src=src,
+                                           group=self.cpu_group)
+
+        # Tensor to receive serialized objects into.
+        object_tensor = torch.empty(  # type: ignore[call-overload]
+            size_tensor.item(),  # type: ignore[arg-type]
+            dtype=torch.uint8,
+            device="cpu")
+
+        rank_object = torch.distributed.recv(object_tensor,
+                                             src=src,
+                                             group=self.cpu_group)
+
+        assert rank_object == rank_size, (
+            "Received object sender rank does not match the size sender rank.")
+
+        obj = pickle.loads(object_tensor.numpy().tobytes())
+
+        return obj
+
     def broadcast_tensor_dict(
         self,
         tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None,
@@ -459,6 +535,88 @@ class GroupCoordinator:
                 async_handle.wait()
         return tensor_dict
 
+    def send_tensor_dict(
+        self,
+        tensor_dict: Dict[Any, Union[torch.Tensor, Any]],
+        dst: Optional[int] = None
+    ) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]:
+        """Send the input tensor dictionary.
+        NOTE: `dst` is the local rank of the source rank.
+        """
+        # Bypass the function if we are using only 1 GPU.
+        if not torch.distributed.is_initialized() or self.world_size == 1:
+            return tensor_dict
+
+        group = self.device_group
+        metadata_group = self.cpu_group
+
+        if dst is None:
+            dst = self.next_rank
+        assert dst < self.world_size, f"Invalid dst rank ({dst})"
+
+        metadata_list: List[Tuple[Any, Any]] = []
+        assert isinstance(
+            tensor_dict,
+            dict), f"Expecting a dictionary, got {type(tensor_dict)}"
+        metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
+        # `metadata_list` lives in CPU memory.
+        # `send_object_list` has serialization & deserialization,
+        # all happening on CPU. Therefore, we can use the CPU group.
+        self.send_object(metadata_list, dst=dst)
+        for tensor in tensor_list:
+            if tensor.numel() == 0:
+                # Skip sending empty tensors.
+                continue
+            if tensor.is_cpu:
+                # use metadata_group for CPU tensors
+                torch.distributed.send(tensor, dst=dst, group=metadata_group)
+            else:
+                # use group for GPU tensors
+                torch.distributed.send(tensor, dst=dst, group=group)
+        return None
+
+    def recv_tensor_dict(
+        self,
+        src: Optional[int] = None
+    ) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]:
+        """Recv the input tensor dictionary.
+        NOTE: `src` is the local rank of the source rank.
+        """
+        # Bypass the function if we are using only 1 GPU.
+        if not torch.distributed.is_initialized() or self.world_size == 1:
+            return None
+
+        group = self.device_group
+        metadata_group = self.cpu_group
+
+        if src is None:
+            src = self.prev_rank
+        assert src < self.world_size, f"Invalid src rank ({src})"
+
+        recv_metadata_list = self.recv_object(src=src)
+        tensor_dict = {}
+        for key, value in recv_metadata_list:
+            if isinstance(value, TensorMetadata):
+                tensor = torch.empty(value.size,
+                                     dtype=value.dtype,
+                                     device=value.device)
+                if tensor.numel() == 0:
+                    # Skip broadcasting empty tensors.
+                    tensor_dict[key] = tensor
+                    continue
+                if tensor.is_cpu:
+                    # use metadata_group for CPU tensors
+                    torch.distributed.recv(tensor,
+                                           src=src,
+                                           group=metadata_group)
+                else:
+                    # use group for GPU tensors
+                    torch.distributed.recv(tensor, src=src, group=group)
+                tensor_dict[key] = tensor
+            else:
+                tensor_dict[key] = value
+        return tensor_dict
+
     def barrier(self):
         """Barrier synchronization among the group.
         NOTE: don't use `device_group` here! `barrier` in NCCL is
@@ -468,6 +626,35 @@ class GroupCoordinator:
         """
         torch.distributed.barrier(group=self.cpu_group)
 
+    def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None:
+        """Sends a tensor to the destination rank in a non-blocking way"""
+        """NOTE: `dst` is the local rank of the destination rank."""
+        if dst is None:
+            dst = self.next_rank
+
+        pynccl_comm = self.pynccl_comm
+        if pynccl_comm is not None and not pynccl_comm.disabled:
+            pynccl_comm.send(tensor, dst)
+        else:
+            torch.distributed.send(tensor, self.ranks[dst], self.device_group)
+
+    def recv(self,
+             size: torch.Size,
+             dtype: torch.dtype,
+             src: Optional[int] = None) -> torch.Tensor:
+        """Receives a tensor from the src rank."""
+        """NOTE: `src` is the local rank of the destination rank."""
+        if src is None:
+            src = self.prev_rank
+
+        tensor = torch.empty(size, dtype=dtype, device=self.device)
+        pynccl_comm = self.pynccl_comm
+        if pynccl_comm is not None and not pynccl_comm.disabled:
+            pynccl_comm.recv(tensor, src)
+        else:
+            torch.distributed.recv(tensor, self.ranks[src], self.device_group)
+        return tensor
+
     def destroy(self):
         if self.device_group is not None:
             torch.distributed.destroy_process_group(self.device_group)
-- 
GitLab


From edd5fe5fa29b8f9cc5fa37a30cc7211e0ff37067 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Mon, 24 Jun 2024 12:11:53 +0800
Subject: [PATCH 134/376] [Bugfix] Add phi3v resize for dynamic shape and fix
 torchvision requirement (#5772)

---
 requirements-cpu.txt                |  1 +
 requirements-cuda.txt               |  2 +
 requirements-test.txt               |  1 -
 tests/models/test_phi3v.py          |  4 ++
 vllm/model_executor/models/phi3v.py | 69 +++++++++++++++++++++++++++--
 5 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index 8b7d86e68..21acee91d 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -3,4 +3,5 @@
 
 # Dependencies for x86_64 CPUs
 torch == 2.3.1+cpu
+torchvision == 0.18.1+cpu   # required for the image processor of phi3v, this must be updated alongside torch
 triton >= 2.2.0  # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 353617983..10596ed85 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -5,5 +5,7 @@
 ray >= 2.9
 nvidia-ml-py # for pynvml package
 torch == 2.3.0
+# These must be updated alongside torch
+torchvision == 0.18.0   # Required for phi3v processor, also see https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 xformers == 0.0.26.post1  # Requires PyTorch 2.3.0
 vllm-flash-attn == 2.5.9  # Requires PyTorch 2.3.0
diff --git a/requirements-test.txt b/requirements-test.txt
index fef0ede7b..8b68e0e93 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -14,7 +14,6 @@ peft
 requests
 ray
 sentence-transformers # required for embedding
-torchvision # required for the image processor of phi3v
 
 # Benchmarking
 aiohttp
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 234547598..a29d50df4 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -22,6 +22,7 @@ assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES)
 def iter_phi3v_configs(model_name: str):
     image_hw_to_feature_size = {
         (1008, 1344): 1921,
+        (2016, 2688): 1933,
     }
 
     for (h, w), f in image_hw_to_feature_size.items():
@@ -75,6 +76,9 @@ if is_cpu():
 # TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
 # Since we use _attn_implementation="eager" for hf_runner, here is
 # numeric difference for longer context and test can't pass
+@pytest.mark.xfail(
+    reason="Inconsistent image processor being used due to lack "
+    "of support for dynamic image token replacement")
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index fa20a7c59..dac832a68 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -13,14 +13,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
+from typing import Dict, Iterable, List, Literal, Optional, Tuple, TypedDict
 
+import numpy as np
 import torch
 import torch.nn as nn
+from PIL import Image
 from transformers import CLIPVisionConfig, PretrainedConfig
 
 from vllm.attention import AttentionMetadata
-from vllm.config import CacheConfig, VisionLanguageConfig
+from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig
+from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
@@ -32,9 +35,11 @@ from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import get_dummy_image_data
+from vllm.multimodal.image import ImagePixelData, get_dummy_image_data
 from vllm.sequence import SamplerOutput
 
+logger = init_logger(__name__)
+
 _KEYS_TO_MODIFY_MAPPING = {
     "model.vision_embed_tokens": "vision_embed_tokens",
 }
@@ -268,7 +273,63 @@ class Phi3VImagePixelInputs(TypedDict):
     """Shape: (batch_size, 2)"""
 
 
-@MULTIMODAL_REGISTRY.register_image_pixel_input()
+# FIXME(Isotr0py): Remove these after dynamic num_img_tokens is supported
+# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
+def calc_padded_size(width, height, padding_unit=336):
+    target_height = int(np.ceil(height / padding_unit) * padding_unit)
+    top_padding = int((target_height - height) / 2)
+    bottom_padding = target_height - height - top_padding
+    padded_width = width
+    padded_height = height + top_padding + bottom_padding
+    return padded_width, padded_height
+
+
+# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
+def calc_hd_transform_size(width, height, hd_num=16):
+    transposed = False
+    if width < height:
+        width, height = height, width
+        transposed = True
+
+    ratio = width / height
+    scale = 1
+    while scale * np.ceil(scale / ratio) <= hd_num:
+        scale += 1
+    scale -= 1
+
+    new_width = int(scale * 336)
+    new_height = int(new_width / ratio)
+
+    padded_width, padded_height = calc_padded_size(new_width, new_height)
+
+    if transposed:
+        padded_width, padded_height = padded_height, padded_width
+
+    return padded_width, padded_height
+
+
+def _image_processor(
+    data: ImagePixelData,
+    model_config: ModelConfig,
+    vlm_config: VisionLanguageConfig,
+) -> Dict[str, torch.Tensor]:
+    image = data.image
+
+    if isinstance(image, Image.Image):
+        # Temporary patch before dynamic number of image tokens is supported
+        _, _, h, w = vlm_config.image_input_shape
+        if (w, h) != calc_hd_transform_size(image.width, image.height):
+            logger.warning(
+                "Dynamic image shape is currently not supported. "
+                "Resizing input image to (%d, %d).", w, h)
+
+            data.image = image.resize((w, h))
+
+    return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \
+            ._default_input_processor(data, model_config, vlm_config)
+
+
+@MULTIMODAL_REGISTRY.register_image_pixel_input(_image_processor)
 @MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
 class Phi3VForCausalLM(VisionLanguageModelBase):
 
-- 
GitLab


From c2462129521a64b62ace77b28641d2e3bec5831c Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 24 Jun 2024 00:37:42 -0700
Subject: [PATCH 135/376] [doc][faq] add warning to download models for every
 nodes (#5783)

---
 docs/source/serving/distributed_serving.rst | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst
index b0c45dbf7..2a7937a91 100644
--- a/docs/source/serving/distributed_serving.rst
+++ b/docs/source/serving/distributed_serving.rst
@@ -35,4 +35,7 @@ To scale vLLM beyond a single machine, install and start a `Ray runtime <https:/
     $ # On worker nodes
     $ ray start --address=<ray-head-address>
 
-After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines.
\ No newline at end of file
+After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines.
+
+.. warning::
+    Please make sure you downloaded the model to all the nodes, or the model is downloaded to some distributed file system that is accessible by all nodes.
-- 
GitLab


From e72dc6cb3507d914eec8dfd0d5c7b9478f6a8ccc Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Mon, 24 Jun 2024 13:26:17 -0400
Subject: [PATCH 136/376] [Doc] Add "Suggest edit" button to doc pages (#5789)

---
 docs/source/conf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index ca26dcec4..af1f22b23 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -66,6 +66,7 @@ html_theme_options = {
     'path_to_docs': 'docs/source',
     'repository_url': 'https://github.com/vllm-project/vllm',
     'use_repository_button': True,
+    'use_edit_page_button': True,
 }
 
 # Add any paths that contain custom static files (such as style sheets) here,
-- 
GitLab


From 1744cc99ba9bdefea8f3f798cf51ed650b81a98e Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Mon, 24 Jun 2024 13:48:55 -0400
Subject: [PATCH 137/376] [Doc] Add Phi-3-medium to list of supported models
 (#5788)

---
 docs/source/models/supported_models.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index f4673dc27..47737ae52 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -129,7 +129,7 @@ Alongside each architecture, we include some popular models that use it.
     - ✅︎
   * - :code:`Phi3ForCausalLM`
     - Phi-3
-    - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, etc.
+    - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, :code:`microsoft/Phi-3-medium-128k-instruct`, etc.
     -
   * - :code:`Phi3SmallForCausalLM`
     - Phi-3-Small
-- 
GitLab


From ba991d5c84adbc0685075af88333c688ddb06011 Mon Sep 17 00:00:00 2001
From: Chang Su <chang.s.su@oracle.com>
Date: Mon, 24 Jun 2024 16:01:19 -0700
Subject: [PATCH 138/376] [Bugfix] Fix FlexibleArgumentParser replaces _ with -
 for actual args (#5795)

---
 vllm/utils.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm/utils.py b/vllm/utils.py
index ce5c377ef..f0c7df5cf 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -822,7 +822,13 @@ class FlexibleArgumentParser(argparse.ArgumentParser):
         processed_args = []
         for arg in args:
             if arg.startswith('--'):
-                processed_args.append('--' + arg[len('--'):].replace('_', '-'))
+                if '=' in arg:
+                    key, value = arg.split('=', 1)
+                    key = '--' + key[len('--'):].replace('_', '-')
+                    processed_args.append(f'{key}={value}')
+                else:
+                    processed_args.append('--' +
+                                          arg[len('--'):].replace('_', '-'))
             else:
                 processed_args.append(arg)
 
-- 
GitLab


From e9de9dd551ac595a9f3825fcd1507deceef4f332 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Mon, 24 Jun 2024 21:09:02 -0700
Subject: [PATCH 139/376] [ci] Remove aws template (#5757)

Signed-off-by: kevin <kevin@anyscale.com>
---
 .buildkite/test-pipeline.yaml   |   7 +-
 .buildkite/test-template-aws.j2 | 145 --------------------------------
 2 files changed, 5 insertions(+), 147 deletions(-)
 delete mode 100644 .buildkite/test-template-aws.j2

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 0b87e6280..19b1bce16 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -1,7 +1,10 @@
 # In this file, you can add more tests to run either by adding a new step or
 # adding a new command to an existing step. See different options here for examples.
-# This script will be feed into Jinja template in `test-template-aws.j2` to generate
-# the final pipeline yaml file.
+
+# This script will be feed into Jinja template in `test-template-aws.j2` at
+# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 
+# to generate the final pipeline yaml file.
+
 
 steps:
 - label: Regression Test
diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2
deleted file mode 100644
index 1a7fb44c2..000000000
--- a/.buildkite/test-template-aws.j2
+++ /dev/null
@@ -1,145 +0,0 @@
-{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
-{% set default_working_dir = "/vllm-workspace/tests" %}
-
-steps:
-  - label: ":docker: build image"
-    agents:
-      queue: cpu_queue
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ."
-      - "docker push {{ docker_image }}"
-    env:
-      DOCKER_BUILDKIT: "1"
-    retry:
-      automatic:
-        - exit_status: -1  # Agent was lost
-          limit: 5
-        - exit_status: -10  # Agent was lost
-          limit: 5
-  - wait
-
-  - group: "AMD Tests"
-    depends_on: ~
-    steps:
-    {% for step in steps %}
-    {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
-      - label: "AMD: {{ step.label }}"
-        agents:
-          queue: amd
-        command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe  }} ; {{ step.command  or (step.commands | join(" ; ")) | safe }}"
-        env:
-          DOCKER_BUILDKIT: "1"
-        priority: 100
-        soft_fail: true
-    {% endif %}
-    {% endfor %}
-
-  - label: "Neuron Test"
-    depends_on: ~
-    agents:
-      queue: neuron
-    command: bash .buildkite/run-neuron-test.sh
-    soft_fail: false
-
-  - label: "Intel CPU Test"
-    depends_on: ~
-    agents:
-      queue: intel-cpu
-    command: bash .buildkite/run-cpu-test.sh
-
-  - label: "Intel GPU Test"
-    depends_on: ~
-    agents:
-      queue: intel-gpu
-    command: bash .buildkite/run-xpu-test.sh
-
-  {% for step in steps %}
-  {% if step.gpu == "a100" %}
-  - label: "{{ step.label }}"
-    agents:
-      queue: a100-queue
-    soft_fail: {{ step.soft_fail or false }}
-    {% if step.parallelism %}
-    parallelism: {{ step.parallelism }}
-    {% endif %}
-    retry:
-      automatic:
-        - exit_status: -1  # Agent was lost
-          limit: 5
-        - exit_status: -10  # Agent was lost
-          limit: 5
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: ci
-          containers:
-          - image: {{ docker_image }}
-            command: ["bash"]
-            args:
-            - '-c'
-            - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
-            resources:
-              limits:
-                nvidia.com/gpu: {{ step.num_gpus or 1 }}
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-  {% else %}
-  - label: "{{ step.label }}"
-    agents:
-      {% if step.label == "Documentation Build" %}
-      queue: small_cpu_queue
-      {% elif step.no_gpu %}
-      queue: cpu_queue
-      {% elif step.num_gpus == 2 or step.num_gpus == 4 %}
-      queue: gpu_4_queue
-      {% else %}
-      queue: gpu_1_queue
-      {% endif %}
-    soft_fail: {{ step.soft_fail or false }}
-    {% if step.parallelism %}
-    parallelism: {{ step.parallelism }}
-    {% endif %}
-    retry:
-      automatic:
-        - exit_status: -1  # Agent was lost
-          limit: 5
-        - exit_status: -10  # Agent was lost
-          limit: 5
-    plugins:
-      - docker#v5.2.0:
-          image: {{ docker_image }}
-          always-pull: true
-          propagate-environment: true
-          {% if not step.no_gpu %}
-          gpus: all
-          {% endif %}
-          {% if step.label == "Benchmarks" %}
-          mount-buildkite-agent: true
-          {% endif %}
-          command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}"]
-          environment:
-            - VLLM_USAGE_SOURCE=ci-test
-            - HF_TOKEN
-            {% if step.label == "Speculative decoding tests" %}
-            - VLLM_ATTENTION_BACKEND=XFORMERS
-            {% endif %}
-          volumes:
-            - /dev/shm:/dev/shm
-  {% endif %}
-  {% endfor %}
-- 
GitLab


From f23871e9eead900d6146961ca894f5bc91f30f5e Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 25 Jun 2024 16:25:03 +0800
Subject: [PATCH 140/376] [Doc] Add notice about breaking changes to VLMs
 (#5818)

---
 docs/source/models/vlm.rst | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 70ac82e20..de55a1a09 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -5,6 +5,9 @@ Using VLMs
 
 vLLM provides experimental support for Vision Language Models (VLMs). This document shows you how to run and serve these models using vLLM.
 
+.. important::
+    We are actively iterating on VLM support. Expect breaking changes to VLM usage and development in upcoming releases without prior deprecation.
+
 Engine Arguments
 ----------------
 
@@ -39,6 +42,10 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
         image_feature_size=576,
     )
 
+.. important::
+    We will remove most of the vision-specific arguments in a future release as they can be inferred from the HuggingFace configuration.
+
+
 To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
 
 * ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
@@ -63,6 +70,9 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptS
 
 A code example can be found in `examples/llava_example.py <https://github.com/vllm-project/vllm/blob/main/examples/llava_example.py>`_.
 
+.. important::
+    We will remove the need to format image tokens in a future release. Afterwards, the input text will follow the same format as that for the original HuggingFace model.
+
 Online OpenAI Vision API Compatible Inference
 ----------------------------------------------
 
@@ -89,6 +99,9 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with
         --image-feature-size 576 \
         --chat-template template_llava.jinja
 
+.. important::
+    We will remove most of the vision-specific arguments in a future release as they can be inferred from the HuggingFace configuration.
+
 To consume the server, you can use the OpenAI client like in the example below:
 
 .. code-block:: python
-- 
GitLab


From 2ce5d6688bae64e467640b05e73af2888e93afcf Mon Sep 17 00:00:00 2001
From: Woo-Yeon Lee <wooyeonlee0@gmail.com>
Date: Tue, 25 Jun 2024 18:56:06 +0900
Subject: [PATCH 141/376]  [Speculative Decoding] Support draft model on
 different tensor-parallel size than target model (#5414)

---
 .buildkite/test-pipeline.yaml                 |   3 +-
 benchmarks/benchmark_latency.py               |   6 +
 .../e2e/test_integration_dist_tp2.py          | 111 +++++++++++++
 ...n_dist.py => test_integration_dist_tp4.py} |  41 +++--
 vllm/config.py                                |  24 ++-
 vllm/distributed/parallel_state.py            |  76 ++++++---
 vllm/engine/arg_utils.py                      |  10 ++
 vllm/spec_decode/multi_step_worker.py         |  11 +-
 vllm/spec_decode/proposer_worker_base.py      |   4 +-
 .../spec_decode/smaller_tp_proposer_worker.py | 149 ++++++++++++++++++
 vllm/spec_decode/spec_decode_worker.py        |  12 +-
 11 files changed, 388 insertions(+), 59 deletions(-)
 create mode 100644 tests/spec_decode/e2e/test_integration_dist_tp2.py
 rename tests/spec_decode/e2e/{test_integration_dist.py => test_integration_dist_tp4.py} (62%)
 create mode 100644 vllm/spec_decode/smaller_tp_proposer_worker.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 19b1bce16..10cfe35d8 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -54,7 +54,7 @@ steps:
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - pytest -v -s spec_decode/e2e/test_integration_dist.py
+  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
 
@@ -71,6 +71,7 @@ steps:
   # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
 
 - label: Engine Test
   mirror_hardwares: [amd]
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index a4cf0632b..f3d00e456 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -25,6 +25,8 @@ def main(args: argparse.Namespace):
         model=args.model,
         speculative_model=args.speculative_model,
         num_speculative_tokens=args.num_speculative_tokens,
+        speculative_draft_tensor_parallel_size=\
+            args.speculative_draft_tensor_parallel_size,
         tokenizer=args.tokenizer,
         quantization=args.quantization,
         tensor_parallel_size=args.tensor_parallel_size,
@@ -127,6 +129,10 @@ if __name__ == '__main__':
     parser.add_argument('--model', type=str, default='facebook/opt-125m')
     parser.add_argument('--speculative-model', type=str, default=None)
     parser.add_argument('--num-speculative-tokens', type=int, default=None)
+    parser.add_argument('--speculative-draft-tensor-parallel-size',
+                        '-spec-draft-tp',
+                        type=int,
+                        default=None)
     parser.add_argument('--tokenizer', type=str, default=None)
     parser.add_argument('--quantization',
                         '-q',
diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py
new file mode 100644
index 000000000..5534b80c0
--- /dev/null
+++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py
@@ -0,0 +1,111 @@
+"""Tests which cover integration of the speculative decoding framework with
+tensor parallelism.
+"""
+
+import pytest
+import torch
+
+from vllm.utils import is_hip
+
+from .conftest import run_greedy_equality_correctness_test
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-68m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+        "tensor_parallel_size": 2,
+
+        # Use AsyncLLM engine, so that the engine runs in its own process.
+        # Otherwise, since vLLM does not follow true SPMD, the test runner
+        # process will have both the engine and the rank0 worker. NCCL is not
+        # cleaned up properly, and its server host thread leaks, causing the
+        # second run of the test to fail with internal NCCL error.
+        "use_async": True,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 3,
+    },
+    {
+        "speculative_model": "[ngram]",
+        "num_speculative_tokens": 5,
+        "ngram_prompt_lookup_max": 3,
+    },
+])
+@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator,
+                              batch_size: int, output_len: int):
+    """Verify greedy equality when tensor parallelism is used.
+    """
+    if is_hip():
+        pytest.skip("hip is not well-supported yet")
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Use a small model for a fast test.
+        # Note this is repeated in the test body; to initialize a tokenizer.
+        "model": "JackFram/llama-68m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+        "tensor_parallel_size": 2,
+
+        # Use AsyncLLM engine, so that the engine runs in its own process.
+        # Otherwise, since vLLM does not follow true SPMD, the test runner
+        # process will have both the engine and the rank0 worker. NCCL is not
+        # cleaned up properly, and its server host thread leaks, causing the
+        # second run of the test to fail with internal NCCL error.
+        "use_async": True,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+        "speculative_draft_tensor_parallel_size": 1,
+    },
+])
+@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize("seed", [1])
+def test_draft_model_tp_lt_target_model_tp2(test_llm_generator,
+                                            baseline_llm_generator,
+                                            batch_size: int):
+    """Verify spec decode works well with smaller tp for draft models.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=32,
+                                         force_output_len=True)
diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py
similarity index 62%
rename from tests/spec_decode/e2e/test_integration_dist.py
rename to tests/spec_decode/e2e/test_integration_dist_tp4.py
index d444ef24c..56cb0147d 100644
--- a/tests/spec_decode/e2e/test_integration_dist.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py
@@ -5,16 +5,16 @@ tensor parallelism.
 import pytest
 import torch
 
-from vllm.utils import is_hip
-
 from .conftest import run_greedy_equality_correctness_test
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.skipif(torch.cuda.device_count() < 4,
+                    reason="Need at least 4 GPUs to run the test.")
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
+        # Use a small model for a fast test.
+        # Note this is repeated in the test body; to initialize a tokenizer.
         "model": "JackFram/llama-68m",
 
         # Skip cuda graph recording for fast test.
@@ -22,7 +22,7 @@ from .conftest import run_greedy_equality_correctness_test
 
         # Required for spec decode.
         "use_v2_block_manager": True,
-        "tensor_parallel_size": 2,
+        "tensor_parallel_size": 4,
 
         # Use AsyncLLM engine, so that the engine runs in its own process.
         # Otherwise, since vLLM does not follow true SPMD, the test runner
@@ -31,35 +31,30 @@ from .conftest import run_greedy_equality_correctness_test
         # second run of the test to fail with internal NCCL error.
         "use_async": True,
     }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [
     {
         "speculative_model": "JackFram/llama-68m",
-        "num_speculative_tokens": 3,
-    },
-    {
-        "speculative_model": "[ngram]",
         "num_speculative_tokens": 5,
-        "ngram_prompt_lookup_max": 3,
     },
 ])
-@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize(
-    "output_len",
+    "test_llm_kwargs",
     [
-        # Use smaller output len for fast test.
-        32,
+        #TODO(wooyeon): add spec_draft_dp=2 case
+        {
+            "speculative_draft_tensor_parallel_size": 1,
+        },
     ])
+@pytest.mark.parametrize("batch_size", [2])
 @pytest.mark.parametrize("seed", [1])
-def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator,
-                              batch_size: int, output_len: int):
-    """Verify greedy equality when tensor parallelism is used.
+def test_draft_model_tp_lt_target_model_tp4(test_llm_generator,
+                                            baseline_llm_generator,
+                                            batch_size: int):
+    """Verify spec decode works well with smaller tp for draft models.
     """
-    if is_hip():
-        pytest.skip("hip is not well-supported yet")
     run_greedy_equality_correctness_test(baseline_llm_generator,
                                          test_llm_generator,
                                          batch_size,
-                                         max_output_len=output_len,
+                                         max_output_len=32,
                                          force_output_len=True)
diff --git a/vllm/config.py b/vllm/config.py
index 8d004902f..0217a2b56 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -797,6 +797,7 @@ class SpeculativeConfig:
         target_parallel_config: ParallelConfig,
         target_dtype: str,
         speculative_model: Optional[str],
+        speculative_draft_tensor_parallel_size: Optional[int],
         num_speculative_tokens: Optional[int],
         speculative_max_model_len: Optional[int],
         enable_chunked_prefill: bool,
@@ -819,6 +820,8 @@ class SpeculativeConfig:
             target_dtype (str): The data type used for the target model.
             speculative_model (Optional[str]): The name of the speculative
                 model, if provided.
+            speculative_draft_tensor_parallel_size (Optional[int]): The degree
+                of the tensor parallelism for the draft model.
             num_speculative_tokens (Optional[int]): The number of speculative
                 tokens, if provided. Will default to the number in the draft
                 model config if present, otherwise is required.
@@ -939,7 +942,8 @@ class SpeculativeConfig:
 
             draft_parallel_config = (
                 SpeculativeConfig.create_draft_parallel_config(
-                    target_parallel_config))
+                    target_parallel_config,
+                    speculative_draft_tensor_parallel_size))
 
         if num_speculative_tokens is None:
             raise ValueError(
@@ -993,16 +997,26 @@ class SpeculativeConfig:
 
     @staticmethod
     def create_draft_parallel_config(
-            target_parallel_config: ParallelConfig) -> ParallelConfig:
+        target_parallel_config: ParallelConfig,
+        speculative_draft_tensor_parallel_size: Optional[int]
+    ) -> ParallelConfig:
         """Create a parallel config for use by the draft worker.
 
-        This is mostly a copy of the target parallel config. In the future the
-        draft worker can have a different parallel strategy, e.g. TP=1.
+        This is mostly a copy of the target parallel config, except the tp_size.
         """
+        if speculative_draft_tensor_parallel_size is None:
+            speculative_draft_tensor_parallel_size = \
+                  target_parallel_config.tensor_parallel_size
+        elif speculative_draft_tensor_parallel_size != 1:
+            # TODO(wooyeon): allow tp values larger than 1
+            raise ValueError(
+                f"{speculative_draft_tensor_parallel_size=} cannot be"
+                f"other value than 1")
+
         draft_parallel_config = ParallelConfig(
             pipeline_parallel_size=target_parallel_config.
             pipeline_parallel_size,
-            tensor_parallel_size=target_parallel_config.tensor_parallel_size,
+            tensor_parallel_size=speculative_draft_tensor_parallel_size,
             distributed_executor_backend=target_parallel_config.
             distributed_executor_backend,
             max_parallel_loading_workers=target_parallel_config.
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 5f1decb37..a7a806b05 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -676,6 +676,28 @@ def get_world_group() -> GroupCoordinator:
     return _WORLD
 
 
+def init_world_group(ranks: List[int], local_rank: int,
+                     backend: str) -> GroupCoordinator:
+    return GroupCoordinator(
+        group_ranks=[ranks],
+        local_rank=local_rank,
+        torch_distributed_backend=backend,
+        use_pynccl=False,
+        use_custom_allreduce=False,
+    )
+
+
+def init_model_parallel_group(group_ranks: List[List[int]], local_rank: int,
+                              backend: str) -> GroupCoordinator:
+    return GroupCoordinator(
+        group_ranks=group_ranks,
+        local_rank=local_rank,
+        torch_distributed_backend=backend,
+        use_pynccl=True,
+        use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE,
+    )
+
+
 _TP: Optional[GroupCoordinator] = None
 
 
@@ -764,13 +786,7 @@ def init_distributed_environment(
     global _WORLD
     if _WORLD is None:
         ranks = list(range(torch.distributed.get_world_size()))
-        _WORLD = GroupCoordinator(
-            group_ranks=[ranks],
-            local_rank=local_rank,
-            torch_distributed_backend=backend,
-            use_pynccl=False,
-            use_custom_allreduce=False,
-        )
+        _WORLD = init_world_group(ranks, local_rank, backend)
     else:
         assert _WORLD.world_size == torch.distributed.get_world_size(), (
             "world group already initialized with a different world size")
@@ -827,13 +843,8 @@ def initialize_model_parallel(
             range(i * tensor_model_parallel_size,
                   (i + 1) * tensor_model_parallel_size))
         group_ranks.append(ranks)
-    _TP = GroupCoordinator(
-        group_ranks=group_ranks,
-        local_rank=get_world_group().local_rank,
-        torch_distributed_backend=backend,
-        use_pynccl=True,
-        use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE,
-    )
+    _TP = init_model_parallel_group(group_ranks,
+                                    get_world_group().local_rank, backend)
 
     # Build the pipeline model-parallel groups.
     num_pipeline_model_parallel_groups: int = (world_size //
@@ -845,13 +856,8 @@ def initialize_model_parallel(
     for i in range(num_pipeline_model_parallel_groups):
         ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
         group_ranks.append(ranks)
-    _PP = GroupCoordinator(
-        group_ranks=group_ranks,
-        local_rank=get_world_group().local_rank,
-        torch_distributed_backend=backend,
-        use_pynccl=True,
-        use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE,
-    )
+    _PP = init_model_parallel_group(group_ranks,
+                                    get_world_group().local_rank, backend)
 
 
 def ensure_model_parallel_initialized(
@@ -887,6 +893,34 @@ def model_parallel_is_initialized():
     return (_TP is not None and _PP is not None)
 
 
+_TP_STATE_PATCHED = False
+
+
+@contextmanager
+def patch_tensor_parallel_group(tp_group: GroupCoordinator):
+    """Patch the tp group temporarily until this function ends.
+
+    This method is for draft workers of speculative decoding to run draft model
+    with different tp degree from that of target model workers.
+
+    Args:
+        tp_group (GroupCoordinator): the tp group coordinator
+    """
+    global _TP_STATE_PATCHED
+    assert not _TP_STATE_PATCHED, "Should not call when it's already patched"
+
+    _TP_STATE_PATCHED = True
+    old_tp_group = get_tp_group()
+    global _TP
+    _TP = tp_group
+    try:
+        yield
+    finally:
+        # restore the original state
+        _TP_STATE_PATCHED = False
+        _TP = old_tp_group
+
+
 def get_tensor_model_parallel_world_size():
     """Return world size for the tensor model parallel group."""
     return get_tp_group().world_size
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index ef3161242..16374098b 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -94,6 +94,7 @@ class EngineArgs:
     guided_decoding_backend: str = 'outlines'
     # Speculative decoding configuration.
     speculative_model: Optional[str] = None
+    speculative_draft_tensor_parallel_size: Optional[int] = None
     num_speculative_tokens: Optional[int] = None
     speculative_max_model_len: Optional[int] = None
     speculative_disable_by_batch_size: Optional[int] = None
@@ -537,6 +538,13 @@ class EngineArgs:
             default=EngineArgs.num_speculative_tokens,
             help='The number of speculative tokens to sample from '
             'the draft model in speculative decoding.')
+        parser.add_argument(
+            '--speculative-draft-tensor-parallel-size',
+            '-spec-draft-tp',
+            type=int,
+            default=EngineArgs.speculative_draft_tensor_parallel_size,
+            help='Number of tensor parallel replicas for '
+            'the draft model in speculative decoding.')
 
         parser.add_argument(
             '--speculative-max-model-len',
@@ -686,6 +694,8 @@ class EngineArgs:
             target_parallel_config=parallel_config,
             target_dtype=self.dtype,
             speculative_model=self.speculative_model,
+            speculative_draft_tensor_parallel_size = \
+                self.speculative_draft_tensor_parallel_size,
             num_speculative_tokens=self.num_speculative_tokens,
             speculative_disable_by_batch_size=self.
             speculative_disable_by_batch_size,
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index 668ceefe6..e469fd7c3 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -6,7 +6,8 @@ import torch
 
 from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData,
                            SequenceGroupMetadata)
-from vllm.spec_decode.interfaces import SpeculativeProposals
+from vllm.spec_decode.interfaces import (SpeculativeProposals,
+                                         SpeculativeProposer)
 from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
 from vllm.spec_decode.top1_proposer import Top1Proposer
 from vllm.worker.worker import Worker
@@ -28,9 +29,9 @@ class MultiStepWorker(Worker, ProposerWorkerBase):
         super().__init__(*args, **kwargs)
 
         # Lazy initialization list.
-        self._proposer: Top1Proposer
+        self._proposer: SpeculativeProposer
 
-    def init_device(self):
+    def init_device(self) -> None:
         super().init_device()
 
         self._proposer = Top1Proposer(
@@ -40,7 +41,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase):
             max_proposal_len=self.max_model_len,
         )
 
-    def set_include_gpu_probs_tensor(self):
+    def set_include_gpu_probs_tensor(self) -> None:
         # Need include_gpu_probs_tensor for multi_step_worker
         self.model_runner.model.sampler.include_gpu_probs_tensor = True
 
@@ -73,7 +74,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase):
         # Run model sample_len times.
         model_outputs: List[SamplerOutput] = []
         for _ in range(sample_len):
-            model_output = super().execute_model(
+            model_output: List[SamplerOutput] = super().execute_model(
                 execute_model_req=copied_execute_model_req)
             assert (len(model_output) == 1
                     ), "composing multistep workers not supported"
diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py
index fd67ceb91..b691659fb 100644
--- a/vllm/spec_decode/proposer_worker_base.py
+++ b/vllm/spec_decode/proposer_worker_base.py
@@ -3,10 +3,10 @@ from typing import List, Optional, Tuple
 
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.spec_decode.interfaces import SpeculativeProposer
-from vllm.worker.worker_base import WorkerBase
+from vllm.worker.worker_base import LoraNotSupportedWorkerBase
 
 
-class ProposerWorkerBase(WorkerBase, SpeculativeProposer):
+class ProposerWorkerBase(LoraNotSupportedWorkerBase, SpeculativeProposer):
     """Interface for proposer workers"""
 
     @abstractmethod
diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py
new file mode 100644
index 000000000..b78e44895
--- /dev/null
+++ b/vllm/spec_decode/smaller_tp_proposer_worker.py
@@ -0,0 +1,149 @@
+from typing import List, Optional, Tuple
+
+import torch
+
+from vllm.distributed.parallel_state import (get_tp_group,
+                                             init_model_parallel_group,
+                                             patch_tensor_parallel_group)
+from vllm.logger import init_logger
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.spec_decode.interfaces import SpeculativeProposals
+from vllm.spec_decode.multi_step_worker import MultiStepWorker
+from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
+
+logger = init_logger(__name__)
+
+
+class SmallerTpProposerWorker(ProposerWorkerBase):
+    """Class which allows a speculative draft model to run with smaller tensor
+    parallel degree than target model.
+    This reduces the communication overhead of small draft models.
+
+    To implement this feature, this class differs behavior based on is_dummy
+    flag, where dummy means worker that does not participate draft generation.
+    Participating workers use a smaller tp group by patching vLLM's tensor
+    parallel group temporarily during forward passes of draft models.
+    """
+
+    @classmethod
+    def maybe_wrap_worker(cls, worker, draft_tensor_parallel_size: int,
+                          target_tensor_parallel_size: int):
+        """Wrap the worker in a SmallerTpProposerWorker if necessary.
+        """
+        if draft_tensor_parallel_size == target_tensor_parallel_size:
+            return worker
+
+        # gpu ranks that will generate draft tokens together
+        draft_ranks = list(range(draft_tensor_parallel_size))
+
+        logger.info("Wrapping {%s} in {%s}", type(worker), cls)
+        return cls(worker, draft_ranks)
+
+    def __init__(self, worker: MultiStepWorker, draft_ranks: List[int]):
+        """Create a SmallerTpProposerWorker.
+
+        Args:
+            worker (MultiStepWorker): an actual worker wrapped with this class
+            draft_ranks (List[int]): if this value is given, only the GPU ranks
+            written in this value participate in draft generation
+        """
+        self._worker = worker
+        self._draft_ranks = draft_ranks
+
+        # init during init_device
+        self._is_dummy = False
+        self._tp_group = None
+
+    def _patch_tensor_parallel_group(self):
+        """Temporarily patch the global tp group state with its own tp group
+        state.
+        """
+        return patch_tensor_parallel_group(self._tp_group)
+
+    def init_device(self) -> None:
+        self._is_dummy = get_tp_group().rank not in self._draft_ranks
+
+        # dummy workers do nothing
+        if self._is_dummy:
+            return
+
+        # creates tp process group containing only a subset of gpu ranks
+        local_rank = get_tp_group().local_rank
+        tp_backend = torch.distributed.get_backend(get_tp_group().device_group)
+        self._tp_group = init_model_parallel_group([self._draft_ranks],
+                                                   local_rank, tp_backend)
+
+        with self._patch_tensor_parallel_group():
+            self._worker.init_device()
+
+    def set_include_gpu_probs_tensor(self) -> None:
+        if self._is_dummy:
+            return
+
+        # Need include_gpu_probs_tensor for multi_step_worker
+        self._worker.set_include_gpu_probs_tensor()
+
+    def load_model(self) -> None:
+        if self._is_dummy:
+            return
+
+        with self._patch_tensor_parallel_group():
+            self._worker.load_model()
+
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        if self._is_dummy:
+            # this case is not used now
+            return -1, -1
+
+        with self._patch_tensor_parallel_group():
+            return self._worker.determine_num_available_blocks()
+
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        if self._is_dummy:
+            return
+
+        with self._patch_tensor_parallel_group():
+            self._worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
+
+    def sampler_output(
+        self,
+        execute_model_req: ExecuteModelRequest,
+        sample_len: int,
+    ) -> Tuple[List[SamplerOutput], bool]:
+        # Do not check _is_dummy, as it's always called by get_spec_proposals
+        return self._worker.sampler_output(execute_model_req, sample_len)
+
+    def get_spec_proposals(
+        self,
+        execute_model_req: ExecuteModelRequest,
+    ) -> SpeculativeProposals:
+        """Produce speculations given an input batch of sequences. The number of
+        speculative tokens per sequence is determined by max_proposal_len.
+        """
+        if self._is_dummy:
+            return SpeculativeProposals(None, None, None)
+
+        with self._patch_tensor_parallel_group():
+            return self._worker.get_spec_proposals(execute_model_req)
+
+    def execute_model(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None
+    ) -> List[SamplerOutput]:
+        if self._is_dummy:
+            return []
+
+        with self._patch_tensor_parallel_group():
+            return self._worker.execute_model(execute_model_req)
+
+    def get_cache_block_size_bytes(self) -> int:
+        if self._is_dummy:
+            # by returning zero, target worker can use the entire kv cache space
+            return 0
+
+        return self._worker.get_cache_block_size_bytes()
+
+    @property
+    def vocab_size(self) -> int:
+        return self._worker.vocab_size
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 58d3461a2..5089e3dd5 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional, Tuple
 
 import torch
 
-from vllm.config import SpeculativeConfig
+from vllm.config import ParallelConfig, SpeculativeConfig
 from vllm.distributed.communication_op import broadcast_tensor_dict
 from vllm.logger import init_logger
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
@@ -18,6 +18,7 @@ from vllm.spec_decode.mlp_speculator_worker import MLPSpeculatorWorker
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.spec_decode.ngram_worker import NGramWorker
 from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
+from vllm.spec_decode.smaller_tp_proposer_worker import SmallerTpProposerWorker
 from vllm.spec_decode.util import (create_sequence_group_output,
                                    get_all_num_logprobs,
                                    get_sampled_token_logprobs, nvtx_range,
@@ -90,7 +91,7 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
     @classmethod
     def create_worker(
         cls,
-        scorer_worker: WorkerBase,
+        scorer_worker: Worker,
         draft_worker_kwargs: Dict[str, Any],
         disable_by_batch_size: Optional[int],
     ) -> "SpecDecodeWorker":
@@ -111,7 +112,14 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
             proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs)
             disable_bonus_tokens = False
         else:
+            draft_parallel_config: ParallelConfig = draft_worker_kwargs[
+                'parallel_config']
+            draft_tp = draft_parallel_config.tensor_parallel_size
+            target_tp = scorer_worker.parallel_config.tensor_parallel_size
+
             proposer_worker = MultiStepWorker(**draft_worker_kwargs)
+            proposer_worker = SmallerTpProposerWorker.maybe_wrap_worker(
+                proposer_worker, draft_tp, target_tp)
 
         logger.info("Configuring SpecDecodeWorker with proposer=%s",
                     type(proposer_worker))
-- 
GitLab


From 7b993143014c95844b380a5b05eebd14ad77b7aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= <jiefu@tencent.com>
Date: Wed, 26 Jun 2024 00:41:36 +0800
Subject: [PATCH 142/376] [Misc] Remove useless code in cpu_worker (#5824)

---
 vllm/worker/cpu_worker.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 3ee394f99..914df0c7d 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -277,7 +277,6 @@ class CPUWorker(LoraNotSupportedWorkerBase):
             assert seq_group_metadata_list is not None
             num_seq_groups: int = len(seq_group_metadata_list)
             assert execute_model_req is not None
-            blocks_to_copy = execute_model_req.blocks_to_copy
             blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy,
                                           device="cpu",
                                           dtype=torch.int64).view(-1, 2)
-- 
GitLab


From 67882dbb44186d781ab6db9eaec08f6616dc86bd Mon Sep 17 00:00:00 2001
From: Antoni Baum <antoni.baum@protonmail.com>
Date: Tue, 25 Jun 2024 10:15:10 -0700
Subject: [PATCH 143/376] [Core] Add fault tolerance for
 `RayTokenizerGroupPool` (#5748)

---
 tests/tokenization/test_tokenizer_group.py    |  99 ++++++++++++++++
 vllm/engine/async_llm_engine.py               |   2 +
 vllm/engine/llm_engine.py                     |   2 +
 .../tokenizer_group/base_tokenizer_group.py   |   4 +
 .../tokenizer_group/ray_tokenizer_group.py    | 112 ++++++++++++++----
 5 files changed, 195 insertions(+), 24 deletions(-)

diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py
index 31571dbff..1b9a59075 100644
--- a/tests/tokenization/test_tokenizer_group.py
+++ b/tests/tokenization/test_tokenizer_group.py
@@ -1,5 +1,7 @@
 import asyncio
 import os
+import sys
+from typing import List, Optional
 from unittest.mock import patch
 
 import pytest
@@ -100,3 +102,100 @@ async def test_tokenizer_group_ray_pool_env_var_propagation(
             max_num_seqs=1,
             max_input_length=None)
         tokenizer_pool.ping()
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("tokenizer_group_type", ["ray"])
+async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
+    """Test that Ray tokenizer pool group can recover from failures and
+    if that's not possible, mark itself as unhealthy."""
+
+    class FailingTokenizerGroup(TokenizerGroup):
+
+        def __init__(self,
+                     *args,
+                     fail_at: Optional[List[int]] = None,
+                     **kwargs):
+            super().__init__(*args, **kwargs)
+            self.i = 0
+            self.fail_at = fail_at or []
+
+        def encode(self, *args, **kwargs):
+            self.i += 1
+            if self.i in self.fail_at:
+                sys.exit(1)
+            return super().encode(*args, **kwargs)
+
+    class FailingRayTokenizerGroupPool(RayTokenizerGroupPool):
+        _worker_cls = FailingTokenizerGroup
+
+    # Fail at first iteration
+    fail_at = [1]
+    tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type)
+    tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config(
+        tokenizer_pool_config,
+        tokenizer_id="gpt2",
+        enable_lora=False,
+        max_num_seqs=1,
+        max_input_length=None,
+        fail_at=fail_at)
+    tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy()
+
+    # Modify fail at to not fail at all (will be re-read when actor is
+    # re-initialized).
+    fail_at[0] = 1000
+
+    # We should recover successfully.
+    await tokenizer_group_pool.encode_async(request_id="1",
+                                            prompt="prompt",
+                                            lora_request=None)
+    await tokenizer_group_pool.encode_async(request_id="1",
+                                            prompt="prompt",
+                                            lora_request=None)
+
+    # Check that we have a new actor
+    assert len(tokenizer_group_pool.tokenizer_actors) == len(tokenizer_actors)
+    assert tokenizer_group_pool.tokenizer_actors != tokenizer_actors
+
+    # Fail at first iteration
+    fail_at = [1]
+    tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config(
+        tokenizer_pool_config,
+        tokenizer_id="gpt2",
+        enable_lora=False,
+        max_num_seqs=1,
+        max_input_length=None,
+        fail_at=fail_at)
+
+    # We should fail after re-initialization.
+    with pytest.raises(RuntimeError):
+        await tokenizer_group_pool.encode_async(request_id="1",
+                                                prompt="prompt",
+                                                lora_request=None)
+
+    # check_health should raise the same thing
+    with pytest.raises(RuntimeError):
+        tokenizer_group_pool.check_health()
+
+    # Ensure that non-ActorDiedErrors are still propagated correctly and do not
+    # cause a re-initialization.
+    fail_at = []
+    tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config(
+        tokenizer_pool_config,
+        tokenizer_id="gpt2",
+        enable_lora=False,
+        max_num_seqs=1,
+        max_input_length=2,
+        fail_at=fail_at)
+    tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy()
+
+    # Prompt too long error
+    with pytest.raises(ValueError):
+        await tokenizer_group_pool.encode_async(request_id="1",
+                                                prompt="prompt" * 100,
+                                                lora_request=None)
+    await tokenizer_group_pool.encode_async(request_id="1",
+                                            prompt="prompt",
+                                            lora_request=None)
+    # Actors should stay the same.
+    assert tokenizer_group_pool.tokenizer_actors == tokenizer_actors
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index df25eb111..7994b873f 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -310,6 +310,8 @@ class _AsyncLLMEngine(LLMEngine):
         )
 
     async def check_health_async(self) -> None:
+        if self.tokenizer:
+            self.tokenizer.check_health()
         self.model_executor.check_health()
 
 
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index f7eae257f..0ad957ef9 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1013,6 +1013,8 @@ class LLMEngine:
         return self.model_executor.pin_lora(lora_id)
 
     def check_health(self) -> None:
+        if self.tokenizer:
+            self.tokenizer.check_health()
         self.model_executor.check_health()
 
     def is_tracing_enabled(self) -> bool:
diff --git a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py
index 3cce96e06..18fbd894f 100644
--- a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py
@@ -53,3 +53,7 @@ class BaseTokenizerGroup(ABC):
     ) -> "PreTrainedTokenizer":
         """Get a tokenizer for a LoRA request."""
         pass
+
+    def check_health(self):
+        """Raise exception if the tokenizer group is unhealthy."""
+        return
diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
index 7c6054168..21ec2b52b 100644
--- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
@@ -2,17 +2,21 @@ import asyncio
 import os
 from typing import List, Optional
 
+from ray.exceptions import ActorDiedError
 from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
 from transformers import PreTrainedTokenizer
 
 from vllm.config import TokenizerPoolConfig
 from vllm.executor.ray_utils import ray
+from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
     BaseTokenizerGroup)
 from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
     TokenizerGroup)
 
+logger = init_logger(__name__)
+
 
 class RayTokenizerGroupPool(BaseTokenizerGroup):
     """A Ray-based pool of TokenizerGroups for async tokenization."""
@@ -46,24 +50,28 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
                  ray_actor_options: dict, **tokenizer_config):
         # Store a local copy of the TokenizerGroup for quick access
         # to underlying HF tokenizers.
+        self._tokenizer_config = {
+            "tokenizer_id": tokenizer_id,
+            "enable_lora": enable_lora,
+            "max_num_seqs": max_num_seqs,
+            "max_input_length": max_input_length,
+            **tokenizer_config
+        }
         self._local_tokenizer_group = self._worker_cls(
-            tokenizer_id=tokenizer_id,
-            enable_lora=enable_lora,
-            max_num_seqs=max_num_seqs,
-            max_input_length=max_input_length,
-            **tokenizer_config,
-        )
-
-        ray_tokenizer_group_cls = ray.remote(
+            **self._tokenizer_config, )
+
+        self._ray_tokenizer_group_cls = ray.remote(
             self._worker_cls).options(**ray_actor_options)
-        self.tokenizer_actors = [
-            ray_tokenizer_group_cls.remote(tokenizer_id, enable_lora,
-                                           max_num_seqs, max_input_length,
-                                           **tokenizer_config)
-            for _ in range(num_actors)
-        ]
+        self.tokenizer_actors = [self._init_actor() for _ in range(num_actors)]
         self._idle_actors: Optional[asyncio.Queue] = None
 
+        # If set, actor is unhealthy. Will reraise on the next
+        # check_health call.
+        self._exception: Optional[ActorDiedError] = None
+
+    def _init_actor(self) -> ray.ObjectRef:
+        return self._ray_tokenizer_group_cls.remote(**self._tokenizer_config)
+
     @property
     def pool_size(self) -> int:
         return len(self.tokenizer_actors)
@@ -78,6 +86,22 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
             for actor in self.tokenizer_actors:
                 self._idle_actors.put_nowait(actor)
 
+    def _finalize_encode(self, actor: ray.ObjectRef,
+                         original_actor: ray.ObjectRef, actor_is_alive: bool):
+        assert self._idle_actors is not None
+        # Cleanup the dead actor.
+        if not actor_is_alive or original_actor is not actor:
+            self.tokenizer_actors.remove(original_actor)
+        if actor_is_alive:
+            # Put the actor back in the queue.
+            # This is done in a finally block to ensure that the actor is
+            # always put back in the queue, even if an exception/cancellation
+            # is raised.
+            self._idle_actors.put_nowait(actor)
+            # Add back the new actor.
+            if original_actor is not actor:
+                self.tokenizer_actors.append(actor)
+
     def encode(self,
                prompt: str,
                request_id: Optional[str] = None,
@@ -88,23 +112,41 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
         The actor is then put back in the queue for future use.
         This is blocking.
         """
+        self.check_health()
         self._ensure_queue_initialized()
         assert self._idle_actors is not None
 
         if self._idle_actors.empty():
             raise RuntimeError("No idle actors available.")
         actor = self._idle_actors.get_nowait()
+        actor_is_alive = True
+        original_actor = actor
         try:
             ret = ray.get(
                 actor.encode.remote(request_id=request_id,
                                     prompt=prompt,
                                     lora_request=lora_request))
+        except ActorDiedError as e:
+            # If the actor is dead, we first try to reinitialize it.
+            logger.warning("%s died with ActorDiedError, reinitializing.",
+                           actor,
+                           exc_info=e)
+            actor = self._init_actor()
+            try:
+                ret = ray.get(
+                    actor.encode.remote(request_id=request_id,
+                                        prompt=prompt,
+                                        lora_request=lora_request))
+            except ActorDiedError as e:
+                logger.error(
+                    "%s died for second time in a row, marking "
+                    "RayTokenizerGroupPool as unhealthy.", actor)
+                actor_is_alive = False
+                if not self._exception:
+                    self._exception = e
+                self.check_health()
         finally:
-            # Put the actor back in the queue.
-            # This is done in a finally block to ensure that the actor is
-            # always put back in the queue, even if an exception/cancellation
-            # is raised.
-            self._idle_actors.put_nowait(actor)
+            self._finalize_encode(actor, original_actor, actor_is_alive)
         return ret
 
     async def encode_async(
@@ -120,20 +162,37 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
         The actor is then put back in the queue for future use.
         This is non-blocking.
         """
+        self.check_health()
         self._ensure_queue_initialized()
         assert self._idle_actors is not None
 
         actor = await self._idle_actors.get()
+        actor_is_alive = True
+        original_actor = actor
         try:
             ret = await actor.encode.remote(request_id=request_id,
                                             prompt=prompt,
                                             lora_request=lora_request)
+        except ActorDiedError as e:
+            # If the actor is dead, we first try to reinitialize it.
+            logger.warning("%s died with ActorDiedError, reinitializing.",
+                           actor,
+                           exc_info=e)
+            actor = self._init_actor()
+            try:
+                ret = await actor.encode.remote(request_id=request_id,
+                                                prompt=prompt,
+                                                lora_request=lora_request)
+            except ActorDiedError as e:
+                logger.error(
+                    "%s died for second time in a row, marking "
+                    "RayTokenizerGroupPool as unhealthy.", actor)
+                actor_is_alive = False
+                if not self._exception:
+                    self._exception = e
+                self.check_health()
         finally:
-            # Put the actor back in the queue.
-            # This is done in a finally block to ensure that the actor is
-            # always put back in the queue, even if an exception/cancellation
-            # is raised.
-            self._idle_actors.put_nowait(actor)
+            self._finalize_encode(actor, original_actor, actor_is_alive)
         return ret
 
     def get_max_input_len(self,
@@ -155,6 +214,11 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
         return await self._local_tokenizer_group.get_lora_tokenizer_async(
             lora_request)
 
+    def check_health(self):
+        if self._exception:
+            raise RuntimeError(
+                "TokenizerGroupPool is unhealthy.") from self._exception
+
 
 def _carry_over_env_vars_to_runtime_env(runtime_env: dict) -> None:
     """Copy over all current process environment variables to the runtime_env.
-- 
GitLab


From c18ebfdd71d16eb18617676b0b1d82ebde0027f0 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 25 Jun 2024 12:10:28 -0700
Subject: [PATCH 144/376] [doc][distributed] add both gloo and nccl tests
 (#5834)

---
 docs/source/getting_started/debugging.rst | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst
index a22bba147..4cd34769e 100644
--- a/docs/source/getting_started/debugging.rst
+++ b/docs/source/getting_started/debugging.rst
@@ -28,8 +28,8 @@ If it crashes, and the error trace shows somewhere around ``self.graph.replay()`
 
 Here are some common issues that can cause hangs:
 
-- **Incorrect network setup**: The vLLM instance cannot get the correct IP address. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``.
-- **Incorrect hardware/driver**: GPU communication cannot be established. You can run the following sanity check script to see if the GPU communication is working correctly.
+- **Incorrect network setup**: The vLLM instance cannot get the correct IP address if you have complicated network config. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. You might also need to set ``export NCCL_SOCKET_IFNAME=your_network_interface`` and ``export GLOO_SOCKET_IFNAME=your_network_interface`` to specify the network interface for the IP address.
+- **Incorrect hardware/driver**: GPU/CPU communication cannot be established. You can run the following sanity check script to see if the GPU/CPU communication is working correctly.
 
 .. code-block:: python
 
@@ -41,7 +41,14 @@ Here are some common issues that can cause hangs:
     dist.all_reduce(data, op=dist.ReduceOp.SUM)
     torch.cuda.synchronize()
     value = data.mean().item()
-    assert value == dist.get_world_size()
+    world_size = dist.get_world_size()
+    assert value == world_size, f"Expected {world_size}, got {value}"
+
+    gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo")
+    cpu_data = torch.FloatTensor([1,] * 128)
+    dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group)
+    value = cpu_data.mean().item()
+    assert value == world_size, f"Expected {world_size}, got {value}"
 
 .. tip::
 
-- 
GitLab


From d9b34baeddc7f48a526dc610429a3c8670b3b339 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Tue, 25 Jun 2024 15:18:03 -0400
Subject: [PATCH 145/376] [CI/Build] Add unit testing for
 FlexibleArgumentParser (#5798)

---
 tests/test_utils.py | 61 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 60 insertions(+), 1 deletion(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 0b674ea6a..8203b5d2f 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -7,7 +7,8 @@ from typing import (TYPE_CHECKING, Any, AsyncIterator, Awaitable, Protocol,
 
 import pytest
 
-from vllm.utils import deprecate_kwargs, get_open_port, merge_async_iterators
+from vllm.utils import (FlexibleArgumentParser, deprecate_kwargs,
+                        get_open_port, merge_async_iterators)
 
 from .utils import error_on_warning
 
@@ -130,3 +131,61 @@ def test_get_open_port():
             with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3:
                 s3.bind(("localhost", get_open_port()))
     os.environ.pop("VLLM_PORT")
+
+
+# Tests for FlexibleArgumentParser
+@pytest.fixture
+def parser():
+    parser = FlexibleArgumentParser()
+    parser.add_argument('--image-input-type',
+                        choices=['pixel_values', 'image_features'])
+    parser.add_argument('--model-name')
+    parser.add_argument('--batch-size', type=int)
+    parser.add_argument('--enable-feature', action='store_true')
+    return parser
+
+
+def test_underscore_to_dash(parser):
+    args = parser.parse_args(['--image_input_type', 'pixel_values'])
+    assert args.image_input_type == 'pixel_values'
+
+
+def test_mixed_usage(parser):
+    args = parser.parse_args([
+        '--image_input_type', 'image_features', '--model-name',
+        'facebook/opt-125m'
+    ])
+    assert args.image_input_type == 'image_features'
+    assert args.model_name == 'facebook/opt-125m'
+
+
+def test_with_equals_sign(parser):
+    args = parser.parse_args(
+        ['--image_input_type=pixel_values', '--model-name=facebook/opt-125m'])
+    assert args.image_input_type == 'pixel_values'
+    assert args.model_name == 'facebook/opt-125m'
+
+
+def test_with_int_value(parser):
+    args = parser.parse_args(['--batch_size', '32'])
+    assert args.batch_size == 32
+    args = parser.parse_args(['--batch-size', '32'])
+    assert args.batch_size == 32
+
+
+def test_with_bool_flag(parser):
+    args = parser.parse_args(['--enable_feature'])
+    assert args.enable_feature is True
+    args = parser.parse_args(['--enable-feature'])
+    assert args.enable_feature is True
+
+
+def test_invalid_choice(parser):
+    with pytest.raises(SystemExit):
+        parser.parse_args(['--image_input_type', 'invalid_choice'])
+
+
+def test_missing_required_argument(parser):
+    parser.add_argument('--required-arg', required=True)
+    with pytest.raises(SystemExit):
+        parser.parse_args([])
-- 
GitLab


From dd248f76756adba4a1637b882e79ab639f957feb Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Tue, 25 Jun 2024 15:23:35 -0400
Subject: [PATCH 146/376] [Misc] Update `w4a16` `compressed-tensors` support to
 include `w8a16` (#5794)

---
 tests/quantization/test_compressed_tensors.py | 23 +++++++--------
 .../compressed_tensors/compressed_tensors.py  | 28 +++++++++++--------
 .../compressed_tensors/schemes/__init__.py    |  5 ++--
 .../schemes/compressed_tensors_w4a16_24.py    |  1 +
 ...s_w4a16.py => compressed_tensors_wNa16.py} |  5 ++--
 5 files changed, 36 insertions(+), 26 deletions(-)
 rename vllm/model_executor/layers/quantization/compressed_tensors/schemes/{compressed_tensors_w4a16.py => compressed_tensors_wNa16.py} (98%)

diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index aaa366335..6eb7ff72f 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -8,9 +8,9 @@ import torch
 
 from vllm import SamplingParams
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
-    CompressedTensorsLinearMethod, CompressedTensorsW4A16,
-    CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8DynamicToken,
-    CompressedTensorsW8A8StaticTensor)
+    CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
+    CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor,
+    CompressedTensorsWNA16)
 
 
 @pytest.mark.parametrize("model_args", [
@@ -74,26 +74,27 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args):
         assert qkv_proj.weight.dtype is torch.int8
 
 
-@pytest.mark.parametrize("w4a16_args", [
-    ("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None),
-    ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128),
-])
-def test_compressed_tensors_w4a16(vllm_runner, w4a16_args):
-    model, strategy, group = w4a16_args
+@pytest.mark.parametrize(
+    "wNa16_args",
+    [("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8),
+     ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8),
+     ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4)])
+def test_compressed_tensors_w4a16(vllm_runner, wNa16_args):
+    model, strategy, group, pack_factor = wNa16_args
     with vllm_runner(model) as llm:
         model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
         layer = model.model.layers[0]
 
         qkv_proj = layer.self_attn.qkv_proj
         assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
-        assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16)
+        assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16)
 
         assert qkv_proj.scheme.strategy == strategy
         assert qkv_proj.scheme.group_size == group
 
         assert qkv_proj.weight_packed.dtype is torch.int32
         assert qkv_proj.weight_scale.dtype is torch.float16
-        assert qkv_proj.weight_packed.pack_factor == 8
+        assert qkv_proj.weight_packed.pack_factor == pack_factor
 
 
 def test_compressed_tensors_w4a16_marlin24(vllm_runner):
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 44dd024af..c69e2f3bc 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -7,9 +7,10 @@ from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    CompressedTensorsScheme, CompressedTensorsW4A16,
-    CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8DynamicToken,
-    CompressedTensorsW8A8StaticTensor)
+    W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS,
+    CompressedTensorsScheme, CompressedTensorsW4A16Sparse24,
+    CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor,
+    CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     CompressionFormat, QuantizationArgs, QuantizationStrategy,
     find_first_name_or_class_match)
@@ -108,26 +109,31 @@ class CompressedTensorsConfig(QuantizationConfig):
 
         return is_8_bits and is_token and is_symmetric and is_dynamic
 
-    def _is_w4a16(self, weight_quant: BaseModel,
-                  input_quant: BaseModel) -> bool:
+    def _is_wNa16_group_channel(self, weight_quant: BaseModel,
+                                input_quant: BaseModel) -> bool:
         input_quant_none = input_quant is None
-        is_4_bits = weight_quant.num_bits == 4
         is_symmetric = weight_quant.symmetric
+        is_channel_group = (
+            weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+            or weight_quant.strategy == QuantizationStrategy.GROUP.value)
         is_static = not weight_quant.dynamic
 
-        return is_4_bits and input_quant_none and is_symmetric and is_static
+        return (is_channel_group and input_quant_none and is_symmetric
+                and is_static)
 
     def _get_schema(self, weight_quant: BaseModel,
                     input_quant: BaseModel) -> "CompressedTensorsScheme":
 
-        if self._is_w4a16(weight_quant, input_quant):
-            if self.quant_format == CompressionFormat.marlin_24.value:
+        if self._is_wNa16_group_channel(weight_quant, input_quant):
+            if (self.quant_format == CompressionFormat.marlin_24.value
+                    and weight_quant.num_bits in W4A16SPARSE24_SUPPORTED_BITS):
                 return CompressedTensorsW4A16Sparse24(
                     strategy=weight_quant.strategy,
                     num_bits=weight_quant.num_bits,
                     group_size=weight_quant.group_size)
-            if self.quant_format == CompressionFormat.pack_quantized.value:
-                return CompressedTensorsW4A16(
+            if (self.quant_format == CompressionFormat.pack_quantized.value
+                    and weight_quant.num_bits in WNA16_SUPPORTED_BITS):
+                return CompressedTensorsWNA16(
                     num_bits=weight_quant.num_bits,
                     strategy=weight_quant.strategy,
                     group_size=weight_quant.group_size)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
index 3c95aa11f..f6d20ce2c 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -1,10 +1,11 @@
 from .compressed_tensors_scheme import CompressedTensorsScheme  # noqa: F401
 from .compressed_tensors_unquantized import (  # noqa: F401
     CompressedTensorsUnquantized)
-from .compressed_tensors_w4a16 import CompressedTensorsW4A16  # noqa: F401
 from .compressed_tensors_w4a16_24 import (  # noqa: F401
-    CompressedTensorsW4A16Sparse24)
+    W4A16SPARSE24_SUPPORTED_BITS, CompressedTensorsW4A16Sparse24)
 from .compressed_tensors_w8a8_dynamictoken import (  # noqa: F401, E501
     CompressedTensorsW8A8DynamicToken)
 from .compressed_tensors_w8a8_statictensor import (  # noqa: F401, E501
     CompressedTensorsW8A8StaticTensor)
+from .compressed_tensors_wNa16 import WNA16_SUPPORTED_BITS  # noqa: F401
+from .compressed_tensors_wNa16 import CompressedTensorsWNA16  # noqa: F401
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
index d7e04ddb8..607029c81 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
@@ -11,6 +11,7 @@ from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
 from vllm.model_executor.utils import set_weight_attrs
 
 __all__ = ["CompressedTensorsW4A16Sparse24"]
+W4A16SPARSE24_SUPPORTED_BITS = [4]
 
 
 class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme):
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
similarity index 98%
rename from vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py
rename to vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index 373458cff..7707ea6ee 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -11,10 +11,11 @@ from vllm.model_executor.layers.quantization.gptq_marlin import (
     marlin_permute_scales)
 from vllm.model_executor.utils import set_weight_attrs
 
-__all__ = ["CompressedTensorsW4A16"]
+__all__ = ["CompressedTensorsWNA16"]
+WNA16_SUPPORTED_BITS = [4, 8]
 
 
-class CompressedTensorsW4A16(CompressedTensorsScheme):
+class CompressedTensorsWNA16(CompressedTensorsScheme):
 
     def __init__(self,
                  strategy: str,
-- 
GitLab


From bc34937d68e9715d8416457539fb528301cf6269 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Tue, 25 Jun 2024 15:25:52 -0700
Subject: [PATCH 147/376] [Hardware][TPU] Refactor TPU backend (#5831)

---
 vllm/executor/tpu_executor.py   | 58 +++++++++++++++++++++------------
 vllm/worker/tpu_model_runner.py |  4 +++
 vllm/worker/tpu_worker.py       | 35 +++++++++++++-------
 3 files changed, 65 insertions(+), 32 deletions(-)

diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py
index 5ed00e137..7fe5349c9 100644
--- a/vllm/executor/tpu_executor.py
+++ b/vllm/executor/tpu_executor.py
@@ -1,4 +1,4 @@
-from typing import List, Set, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple
 
 import torch
 
@@ -26,29 +26,45 @@ class TPUExecutor(ExecutorBase):
             self.model_config.dtype = torch.bfloat16
 
         # Instantiate the worker and load the model to the device.
-        self._init_worker()
-
-    def _init_worker(self):
-        from vllm.worker.tpu_worker import TPUWorker
+        self.driver_worker = self._create_worker()
+        self.driver_worker.init_device()
+        self.driver_worker.load_model()
 
-        assert self.parallel_config.world_size == 1, (
-            "TPUExecutor currently only supports a single TPU chip.")
-        distributed_init_method = get_distributed_init_method(
-            get_ip(), get_open_port())
-        self.driver_worker = TPUWorker(
-            self.model_config,
-            self.parallel_config,
-            self.scheduler_config,
-            self.device_config,
-            self.cache_config,
-            self.load_config,
-            self.vision_language_config,
-            local_rank=0,
-            rank=0,
+    def _get_worker_kwargs(
+        self,
+        local_rank: int = 0,
+        rank: int = 0,
+        distributed_init_method: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        """Return worker init args for a given rank."""
+        if distributed_init_method is None:
+            distributed_init_method = get_distributed_init_method(
+                get_ip(), get_open_port())
+        return dict(
+            model_config=self.model_config,
+            parallel_config=self.parallel_config,
+            scheduler_config=self.scheduler_config,
+            device_config=self.device_config,
+            cache_config=self.cache_config,
+            load_config=self.load_config,
+            local_rank=local_rank,
+            rank=rank,
             distributed_init_method=distributed_init_method,
+            vision_language_config=self.vision_language_config,
+            is_driver_worker=rank == 0,
         )
-        self.driver_worker.init_device()
-        self.driver_worker.load_model()
+
+    def _create_worker(
+        self,
+        local_rank: int = 0,
+        rank: int = 0,
+        distributed_init_method: Optional[str] = None,
+    ):
+        from vllm.worker.tpu_worker import TPUWorker
+
+        worker = TPUWorker(**self._get_worker_kwargs(local_rank, rank,
+                                                     distributed_init_method))
+        return worker
 
     def initialize_cache(
         self,
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index 5003d3b0c..2d8fffe5a 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -33,6 +33,7 @@ class TPUModelRunner:
         cache_config: CacheConfig,
         load_config: LoadConfig,
         vision_language_config: Optional[VisionLanguageConfig] = None,
+        is_driver_worker: bool = False,
     ):
         self.model_config = model_config
         self.parallel_config = parallel_config
@@ -41,6 +42,7 @@ class TPUModelRunner:
         self.cache_config = cache_config
         self.load_config = load_config
         self.vision_language_config = vision_language_config
+        self.is_driver_worker = is_driver_worker
 
         self.block_size = self.cache_config.block_size
         self.max_num_blocks_per_seq = (self.model_config.max_model_len //
@@ -373,6 +375,8 @@ class TPUModelRunner:
         inputs = self.prepare_inputs(seq_group_metadata_list)
         next_token_ids = self.model(inputs[0], inputs[1], kv_caches,
                                     *inputs[2:])
+        if not self.is_driver_worker:
+            return []
         next_token_ids = next_token_ids.cpu().tolist()
 
         i = 0
diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index 04576015d..828bb89d7 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -34,6 +34,7 @@ class TPUWorker(LoraNotSupportedWorkerBase):
         local_rank: int,
         rank: int,
         distributed_init_method: str,
+        is_driver_worker: bool,
     ) -> None:
         self.model_config = model_config
         self.parallel_config = parallel_config
@@ -45,6 +46,7 @@ class TPUWorker(LoraNotSupportedWorkerBase):
         self.local_rank = local_rank
         self.rank = rank
         self.distributed_init_method = distributed_init_method
+        self.is_driver_worker = is_driver_worker
 
         assert self.device_config.device_type == "tpu"
         if self.cache_config.cache_dtype == "auto":
@@ -53,10 +55,14 @@ class TPUWorker(LoraNotSupportedWorkerBase):
             self.cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
                 self.cache_config.cache_dtype]
 
-        self.model_runner = TPUModelRunner(model_config, parallel_config,
-                                           scheduler_config, device_config,
-                                           cache_config, load_config,
-                                           vision_language_config)
+        self.model_runner = TPUModelRunner(model_config,
+                                           parallel_config,
+                                           scheduler_config,
+                                           device_config,
+                                           cache_config,
+                                           load_config,
+                                           vision_language_config,
+                                           is_driver_worker=is_driver_worker)
 
     def init_device(self) -> None:
         os.environ["PJRT_DEVICE"] = "TPU"
@@ -175,16 +181,13 @@ class TPUWorker(LoraNotSupportedWorkerBase):
 
     def execute_model(
         self,
-        execute_model_req: Optional[ExecuteModelRequest] = None
+        execute_model_req: Optional[ExecuteModelRequest] = None,
     ) -> List[SamplerOutput]:
-        if execute_model_req is None:
-            return []
-
-        seq_group_metadata_list = execute_model_req.seq_group_metadata_list
-        num_seq_groups = len(seq_group_metadata_list)
-        if num_seq_groups == 0:
+        if not self.is_driver_worker:
+            self._execute_model_non_driver()
             return []
 
+        assert execute_model_req is not None
         # Currently, TPUWorker does not support swapping.
         # TODO(woosuk): Support block copying.
         assert len(execute_model_req.blocks_to_swap_in) == 0, (
@@ -193,6 +196,16 @@ class TPUWorker(LoraNotSupportedWorkerBase):
             "Swapping is not supported for the TPU backend.")
         assert len(execute_model_req.blocks_to_copy) == 0
 
+        seq_group_metadata_list = execute_model_req.seq_group_metadata_list
+        assert len(seq_group_metadata_list) > 0
         output = self.model_runner.execute_model(seq_group_metadata_list,
                                                  self.tpu_cache)
         return [output]
+
+    def start_worker_execution_loop(self) -> None:
+        while self._execute_model_non_driver():
+            pass
+
+    def _execute_model_non_driver(self) -> bool:
+        self.model_runner.execute_model(None, self.tpu_cache)
+        return True
-- 
GitLab


From dd793d1de59b5efad25f4794b68cb935824c7a11 Mon Sep 17 00:00:00 2001
From: Matt Wong <156021403+mawong-amd@users.noreply.github.com>
Date: Tue, 25 Jun 2024 17:56:15 -0500
Subject: [PATCH 148/376] [Hardware][AMD][CI/Build][Doc] Upgrade to ROCm 6.1,
 Dockerfile improvements, test fixes (#5422)

---
 CMakeLists.txt                                |  20 +-
 Dockerfile.rocm                               | 209 ++++++++++++------
 cmake/utils.cmake                             |  20 +-
 .../getting_started/amd-installation.rst      |   6 +-
 tests/async_engine/test_openapi_server_ray.py |   4 +-
 tests/distributed/test_utils.py               |  17 +-
 tests/entrypoints/test_openai_embedding.py    |   4 +-
 tests/entrypoints/test_openai_server.py       |   4 +-
 tests/entrypoints/test_openai_vision.py       |   4 +-
 tests/utils.py                                |  38 +++-
 vllm/config.py                                |  10 +-
 .../custom_all_reduce_utils.py                |  11 +-
 vllm/executor/multiproc_gpu_executor.py       |   8 +-
 vllm/utils.py                                 |  16 +-
 vllm/worker/worker_base.py                    |  10 +-
 15 files changed, 259 insertions(+), 122 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index aa15b632c..801429096 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,8 +32,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
 # versions are derived from Dockerfile.rocm
 #
 set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
-set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
-set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.4.0")
 
 #
 # Try to find python package with an executable that exactly matches
@@ -98,18 +97,11 @@ elseif(HIP_FOUND)
   # .hip extension automatically, HIP must be enabled explicitly.
   enable_language(HIP)
 
-  # ROCm 5.x
-  if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND
-      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X})
-    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} "
-      "expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.")
-  endif()
-
-  # ROCm 6.x
-  if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND
-      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X})
-    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} "
-      "expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.")
+  # ROCm 5.X and 6.X
+  if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
+      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
+    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM} "
+      "expected for ROCm build, saw ${Torch_VERSION} instead.")
   endif()
 else()
   message(FATAL_ERROR "Can't find CUDA or HIP installation.")
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 6bda69685..652f04adf 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -1,34 +1,35 @@
-# default base image
-ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
-
-FROM $BASE_IMAGE
-
-ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
-
-RUN echo "Base image is $BASE_IMAGE"
-
-ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \
-    ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
-
+# Default ROCm 6.1 base image
+ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
+
+# Tested and supported base rocm/pytorch images
+ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1" \
+    ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \
+    ROCM_6_1_BASE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
+
+# Default ROCm ARCHes to build vLLM for.
+ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
+
+# Whether to build CK-based flash-attention
+# If 0, will not build flash attention
+# This is useful for gfx target where flash-attention is not supported
+# (i.e. those that do not appear in `FA_GFX_ARCHS`)
+# Triton FA is used by default on ROCm now so this is unnecessary.
+ARG BUILD_FA="1"
 ARG FA_GFX_ARCHS="gfx90a;gfx942"
-RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
-
 ARG FA_BRANCH="ae7928c"
-RUN echo "FA_BRANCH is $FA_BRANCH"
 
-# whether to build flash-attention
-# if 0, will not build flash attention
-# this is useful for gfx target where flash-attention is not supported
-# In that case, we need to use the python reference attention implementation in vllm
-ARG BUILD_FA="1"
-
-# whether to build triton on rocm
+# Whether to build triton on rocm
 ARG BUILD_TRITON="1"
+ARG TRITON_BRANCH="0ef1848"
 
-# Install some basic utilities
-RUN apt-get update && apt-get install python3 python3-pip -y
+### Base image build stage
+FROM $BASE_IMAGE AS base
+
+# Import arg(s) defined before this build stage
+ARG PYTORCH_ROCM_ARCH
 
 # Install some basic utilities
+RUN apt-get update && apt-get install python3 python3-pip -y
 RUN apt-get update && apt-get install -y \
     curl \
     ca-certificates \
@@ -39,79 +40,159 @@ RUN apt-get update && apt-get install -y \
     build-essential \
     wget \
     unzip \
-    nvidia-cuda-toolkit \
     tmux \
     ccache \
  && rm -rf /var/lib/apt/lists/*
 
-### Mount Point ###
-# When launching the container, mount the code directory to /app
+# When launching the container, mount the code directory to /vllm-workspace
 ARG APP_MOUNT=/vllm-workspace
-VOLUME [ ${APP_MOUNT} ]
 WORKDIR ${APP_MOUNT}
 
-RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
+RUN pip install --upgrade pip
+# Remove sccache so it doesn't interfere with ccache
+# TODO: implement sccache support across components
+RUN apt-get purge -y sccache; pip uninstall -y sccache; rm -f "$(which sccache)"
+# Install torch == 2.4.0 on ROCm
+RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
+        *"rocm-5.7"*) \
+            pip uninstall -y torch \
+            && pip install --no-cache-dir --pre torch==2.4.0.dev20240612 \
+               --index-url https://download.pytorch.org/whl/nightly/rocm5.7;; \
+        *"rocm-6.0"*) \
+            pip uninstall -y torch \
+            && pip install --no-cache-dir --pre torch==2.4.0.dev20240612 \
+               --index-url https://download.pytorch.org/whl/nightly/rocm6.0;; \
+        *"rocm-6.1"*) \
+            pip uninstall -y torch \
+            && pip install --no-cache-dir --pre torch==2.4.0.dev20240612 \
+               --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
+        *) ;; esac
 
 ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
 ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
 ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
 
-# Install ROCm flash-attention
-RUN if [ "$BUILD_FA" = "1" ]; then \
-    mkdir libs \
+ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
+ENV CCACHE_DIR=/root/.cache/ccache
+
+
+### AMD-SMI build stage
+FROM base AS build_amdsmi
+# Build amdsmi wheel always
+RUN cd /opt/rocm/share/amd_smi \
+    && pip wheel . --wheel-dir=/install
+
+
+### Flash-Attention wheel build stage
+FROM base AS build_fa
+ARG BUILD_FA
+ARG FA_GFX_ARCHS
+ARG FA_BRANCH
+# Build ROCm flash-attention wheel if `BUILD_FA = 1`
+RUN --mount=type=cache,target=${CCACHE_DIR} \
+    if [ "$BUILD_FA" = "1" ]; then \
+    mkdir -p libs \
     && cd libs \
     && git clone https://github.com/ROCm/flash-attention.git \
     && cd flash-attention \
-    && git checkout ${FA_BRANCH} \
+    && git checkout "${FA_BRANCH}" \
     && git submodule update --init \
-    && export GPU_ARCHS=${FA_GFX_ARCHS} \
-    && if [ "$BASE_IMAGE" = "$ROCm_5_7_BASE" ]; then \
-        patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \
-    && python3 setup.py install \
-    && cd ..; \
+    && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
+        *"rocm-5.7"*) \
+            export VLLM_TORCH_PATH="$(python3 -c 'import torch; print(torch.__path__[0])')" \
+            && patch "${VLLM_TORCH_PATH}"/utils/hipify/hipify_python.py hipify_patch.patch;; \
+        *) ;; esac \
+    && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
+    # Create an empty directory otherwise as later build stages expect one
+    else mkdir -p /install; \
     fi
 
-# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
-# Manually removed it so that later steps of numpy upgrade can continue
-RUN if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \
-    rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi
 
-# build triton
-RUN if [ "$BUILD_TRITON" = "1" ]; then \
+### Triton wheel build stage
+FROM base AS build_triton
+ARG BUILD_TRITON
+ARG TRITON_BRANCH
+# Build triton wheel if `BUILD_TRITON = 1`
+RUN --mount=type=cache,target=${CCACHE_DIR} \
+    if [ "$BUILD_TRITON" = "1" ]; then \
     mkdir -p libs \
     && cd libs \
-    && pip uninstall -y triton \
-    && git clone https://github.com/ROCm/triton.git \
-    && cd triton/python \
-    && pip3 install . \
-    && cd ../..; \
+    && git clone https://github.com/OpenAI/triton.git \
+    && cd triton \
+    && git checkout "${TRITON_BRANCH}" \
+    && cd python \
+    && python3 setup.py bdist_wheel --dist-dir=/install; \
+    # Create an empty directory otherwise as later build stages expect one
+    else mkdir -p /install; \
     fi
 
-WORKDIR /vllm-workspace
+
+### Final vLLM build stage
+FROM base AS final
+# Import the vLLM development directory from the build context
 COPY . .
 
-#RUN python3 -m pip install pynvml # to be removed eventually
-RUN python3 -m pip install --upgrade pip numba
+# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
+# Manually remove it so that later steps of numpy upgrade can continue
+RUN case "$(which python3)" in \
+        *"/opt/conda/envs/py_3.9"*) \
+            rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
+        *) ;; esac
+
+# Package upgrades for useful functionality or to avoid dependency issues
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install --upgrade numba scipy huggingface-hub[cli]
 
-# make sure punica kernels are built (for LoRA)
+# Make sure punica kernels are built (for LoRA)
 ENV VLLM_INSTALL_PUNICA_KERNELS=1
 # Workaround for ray >= 2.10.0
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
+# Silences the HF Tokenizers warning
+ENV TOKENIZERS_PARALLELISM=false
 
-ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so
-
-ENV CCACHE_DIR=/root/.cache/ccache
-RUN --mount=type=cache,target=/root/.cache/ccache \
+RUN --mount=type=cache,target=${CCACHE_DIR} \
     --mount=type=cache,target=/root/.cache/pip \
     pip install -U -r requirements-rocm.txt \
-    && if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \
-       patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch; fi \
-    && python3 setup.py install \
-    && export VLLM_PYTHON_VERSION=$(python -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))") \
-    && cp build/lib.linux-x86_64-cpython-${VLLM_PYTHON_VERSION}/vllm/*.so vllm/ \
-    && cd ..
+    && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
+        *"rocm-6.0"*) \
+            patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \
+        *"rocm-6.1"*) \
+            # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
+            wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P rocm_patch \
+            && cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6 \
+            # Prevent interference if torch bundles its own HIP runtime
+            && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
+        *) ;; esac \
+    && python3 setup.py clean --all \
+    && python3 setup.py develop
+
+# Copy amdsmi wheel into final image
+RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
+    mkdir -p libs \
+    && cp /install/*.whl libs \
+    # Preemptively uninstall to avoid same-version no-installs
+    && pip uninstall -y amdsmi;
 
+# Copy triton wheel(s) into final image if they were built
+RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
+    mkdir -p libs \
+    && if ls /install/*.whl; then \
+        cp /install/*.whl libs \
+        # Preemptively uninstall to avoid same-version no-installs
+        && pip uninstall -y triton; fi
+
+# Copy flash-attn wheel(s) into final image if they were built
+RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
+    mkdir -p libs \
+    && if ls /install/*.whl; then \
+        cp /install/*.whl libs \
+        # Preemptively uninstall to avoid same-version no-installs
+        && pip uninstall -y flash-attn; fi
+
+# Install wheels that were built to the final image
+RUN --mount=type=cache,target=/root/.cache/pip \
+    if ls libs/*.whl; then \
+    pip install libs/*.whl; fi
 
 CMD ["/bin/bash"]
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 071e16336..4869cad54 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -147,19 +147,23 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
   if (${GPU_LANG} STREQUAL "HIP")
     #
     # `GPU_ARCHES` controls the `--offload-arch` flags.
-    # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled
-    # via the `PYTORCH_ROCM_ARCH` env variable.
     #
-
+    # If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list,
+    # if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling
+    # "rocm_agent_enumerator" in "enable_language(HIP)"
+    # (in file Modules/CMakeDetermineHIPCompiler.cmake)
+    #
+    if(DEFINED ENV{PYTORCH_ROCM_ARCH})
+      set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH})
+    else()
+      set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES})
+    endif()
     #
     # Find the intersection of the supported + detected architectures to
     # set the module architecture flags.
     #
-
-    set(VLLM_ROCM_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")
-
     set(${GPU_ARCHES})
-    foreach (_ARCH ${VLLM_ROCM_SUPPORTED_ARCHS})
+    foreach (_ARCH ${HIP_ARCHITECTURES})
       if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
         list(APPEND ${GPU_ARCHES} ${_ARCH})
       endif()
@@ -167,7 +171,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
 
     if(NOT ${GPU_ARCHES})
       message(FATAL_ERROR
-        "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is"
+        "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is"
         " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
     endif()
 
diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst
index 61fcd45a2..cc41d4729 100644
--- a/docs/source/getting_started/amd-installation.rst
+++ b/docs/source/getting_started/amd-installation.rst
@@ -88,7 +88,7 @@ Option 2: Build from source
 - `Pytorch <https://pytorch.org/>`_
 - `hipBLAS <https://rocm.docs.amd.com/projects/hipBLAS/en/latest/install.html>`_
 
-For installing PyTorch, you can start from a fresh docker image, e.g, `rocm6.0.2_ubuntu22.04_py3.10_pytorch_2.1.2`, `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`, `rocm/pytorch-nightly`.
+For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging`, `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`, `rocm/pytorch-nightly`.
 
 Alternatively, you can install pytorch using pytorch wheels. You can check Pytorch installation guild in Pytorch `Getting Started <https://pytorch.org/get-started/locally/>`_
 
@@ -126,12 +126,12 @@ Install ROCm's flash attention (v2.0.4) following the instructions from `ROCm/fl
 
     $ cd vllm
     $ pip install -U -r requirements-rocm.txt
-    $ python setup.py install # This may take 5-10 minutes. Currently, `pip install .`` does not work for ROCm installation
+    $ python setup.py develop # This may take 5-10 minutes. Currently, `pip install .`` does not work for ROCm installation
 
 
 .. tip::
 
     - You may need to turn on the ``--enforce-eager`` flag if you experience process hang when running the `benchmark_thoughput.py` script to test your installation.
     - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
-    - To use CK flash-attention, please use this flag ``export VLLM_USE_FLASH_ATTN_TRITON=0`` to turn off triton flash attention. 
+    - To use CK flash-attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention. 
     - The ROCm version of pytorch, ideally, should match the ROCm driver version.
diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py
index cc05d79e5..332937b87 100644
--- a/tests/async_engine/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server_ray.py
@@ -4,7 +4,7 @@ import pytest
 # and debugging.
 import ray
 
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "facebook/opt-125m"
@@ -12,7 +12,7 @@ MODEL_NAME = "facebook/opt-125m"
 
 @pytest.fixture(scope="module")
 def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
+    ray.init()
     yield
     ray.shutdown()
 
diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py
index 49d11daca..9ff11b0d2 100644
--- a/tests/distributed/test_utils.py
+++ b/tests/distributed/test_utils.py
@@ -1,8 +1,8 @@
-import os
-
 import ray
 
-from vllm.utils import cuda_device_count_stateless
+import vllm.envs as envs
+from vllm.utils import (cuda_device_count_stateless, is_hip,
+                        update_environment_variables)
 
 
 @ray.remote
@@ -12,16 +12,21 @@ class _CUDADeviceCountStatelessTestActor:
         return cuda_device_count_stateless()
 
     def set_cuda_visible_devices(self, cuda_visible_devices: str):
-        os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
+        update_environment_variables(
+            {"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
 
     def get_cuda_visible_devices(self):
-        return os.environ["CUDA_VISIBLE_DEVICES"]
+        return envs.CUDA_VISIBLE_DEVICES
 
 
 def test_cuda_device_count_stateless():
     """Test that cuda_device_count_stateless changes return value if
     CUDA_VISIBLE_DEVICES is changed."""
-
+    if is_hip():
+        # Set HIP_VISIBLE_DEVICES == CUDA_VISIBLE_DEVICES. Conversion
+        # is handled by `update_environment_variables`
+        update_environment_variables(
+            {"CUDA_VISIBLE_DEVICES": envs.CUDA_VISIBLE_DEVICES})
     actor = _CUDADeviceCountStatelessTestActor.options(  # type: ignore
         num_gpus=2).remote()
     assert sorted(ray.get(
diff --git a/tests/entrypoints/test_openai_embedding.py b/tests/entrypoints/test_openai_embedding.py
index 2496d2ac3..45f701733 100644
--- a/tests/entrypoints/test_openai_embedding.py
+++ b/tests/entrypoints/test_openai_embedding.py
@@ -2,7 +2,7 @@ import openai
 import pytest
 import ray
 
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer
 
 EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
 
@@ -11,7 +11,7 @@ pytestmark = pytest.mark.openai
 
 @pytest.fixture(scope="module")
 def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
+    ray.init()
     yield
     ray.shutdown()
 
diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index c22a675ff..5196d8181 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -16,7 +16,7 @@ from openai import BadRequestError
 
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -81,7 +81,7 @@ def zephyr_lora_files():
 
 @pytest.fixture(scope="module")
 def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
+    ray.init()
     yield
     ray.shutdown()
 
diff --git a/tests/entrypoints/test_openai_vision.py b/tests/entrypoints/test_openai_vision.py
index 03dc5d116..0e8d88b76 100644
--- a/tests/entrypoints/test_openai_vision.py
+++ b/tests/entrypoints/test_openai_vision.py
@@ -8,7 +8,7 @@ import ray
 
 from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64
 
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer
 
 MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
 LLAVA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent /
@@ -27,7 +27,7 @@ pytestmark = pytest.mark.openai
 
 @pytest.fixture(scope="module")
 def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
+    ray.init()
     yield
     ray.shutdown()
 
diff --git a/tests/utils.py b/tests/utils.py
index 174efca4a..2a5f82b91 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -15,9 +15,30 @@ from vllm.distributed import (ensure_model_parallel_initialized,
 from vllm.entrypoints.openai.cli_args import make_arg_parser
 from vllm.utils import get_open_port, is_hip
 
-if (not is_hip()):
+if is_hip():
+    from amdsmi import (amdsmi_get_gpu_vram_usage,
+                        amdsmi_get_processor_handles, amdsmi_init,
+                        amdsmi_shut_down)
+
+    @contextmanager
+    def _nvml():
+        try:
+            amdsmi_init()
+            yield
+        finally:
+            amdsmi_shut_down()
+else:
     from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
-                        nvmlInit)
+                        nvmlInit, nvmlShutdown)
+
+    @contextmanager
+    def _nvml():
+        try:
+            nvmlInit()
+            yield
+        finally:
+            nvmlShutdown()
+
 
 # Path to root of repository so that utilities can be imported by ray workers
 VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))
@@ -160,20 +181,25 @@ def error_on_warning():
         yield
 
 
+@_nvml()
 def wait_for_gpu_memory_to_clear(devices: List[int],
                                  threshold_bytes: int,
                                  timeout_s: float = 120) -> None:
     # Use nvml instead of pytorch to reduce measurement error from torch cuda
     # context.
-    nvmlInit()
     start_time = time.time()
     while True:
         output: Dict[int, str] = {}
         output_raw: Dict[int, float] = {}
         for device in devices:
-            dev_handle = nvmlDeviceGetHandleByIndex(device)
-            mem_info = nvmlDeviceGetMemoryInfo(dev_handle)
-            gb_used = mem_info.used / 2**30
+            if is_hip():
+                dev_handle = amdsmi_get_processor_handles()[device]
+                mem_info = amdsmi_get_gpu_vram_usage(dev_handle)
+                gb_used = mem_info["vram_used"] / 2**10
+            else:
+                dev_handle = nvmlDeviceGetHandleByIndex(device)
+                mem_info = nvmlDeviceGetMemoryInfo(dev_handle)
+                gb_used = mem_info.used / 2**30
             output_raw[device] = gb_used
             output[device] = f'{gb_used:.02f}'
 
diff --git a/vllm/config.py b/vllm/config.py
index 0217a2b56..0c4d770e4 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -7,13 +7,15 @@ from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Tuple,
 import torch
 from transformers import PretrainedConfig, PreTrainedTokenizerBase
 
+import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.model_executor.models import ModelRegistry
 from vllm.tracing import is_otel_installed
 from vllm.transformers_utils.config import get_config, get_hf_text_config
 from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu,
-                        is_hip, is_neuron, is_tpu, is_xpu)
+                        is_hip, is_neuron, is_tpu, is_xpu,
+                        update_environment_variables)
 
 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
@@ -634,6 +636,12 @@ class ParallelConfig:
             self.distributed_executor_backend = backend
             logger.info("Defaulting to use %s for distributed inference",
                         backend)
+        # If CUDA_VISIBLE_DEVICES is set on ROCm prior to vLLM init,
+        # propagate changes to HIP_VISIBLE_DEVICES (conversion handled by
+        # the update_environment_variables function)
+        if is_hip() and envs.CUDA_VISIBLE_DEVICES:
+            update_environment_variables(
+                {"CUDA_VISIBLE_DEVICES": envs.CUDA_VISIBLE_DEVICES})
 
         self._verify_args()
 
diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index d3e41fa71..6f1aaed98 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -13,7 +13,8 @@ import torch.multiprocessing as mp
 import vllm.envs as envs
 from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
 from vllm.logger import init_logger
-from vllm.utils import cuda_device_count_stateless
+from vllm.utils import (cuda_device_count_stateless,
+                        update_environment_variables)
 
 logger = init_logger(__name__)
 
@@ -24,7 +25,8 @@ def producer(batch_src: Sequence[int],
              result_queue,
              cuda_visible_devices: Optional[str] = None):
     if cuda_visible_devices is not None:
-        os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
+        update_environment_variables(
+            {"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
 
     lib = CudaRTLibrary()
     for i in batch_src:
@@ -56,7 +58,8 @@ def consumer(batch_tgt: Sequence[int],
              result_queue,
              cuda_visible_devices: Optional[str] = None):
     if cuda_visible_devices is not None:
-        os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
+        update_environment_variables(
+            {"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
 
     lib = CudaRTLibrary()
     for j in batch_tgt:
@@ -123,7 +126,7 @@ def can_actually_p2p(
     processes for testing all pairs of GPUs in batch. The trick is to reset
     the device after each test (which is not available in PyTorch).
     """  # noqa
-    cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
+    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
     # pass the CUDA_VISIBLE_DEVICES to the child process
     # to make sure they see the same set of GPUs
 
diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py
index e63e5a3a0..a5b1d27f2 100644
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/multiproc_gpu_executor.py
@@ -11,7 +11,8 @@ from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.utils import (cuda_device_count_stateless,
                         get_distributed_init_method, get_open_port,
-                        get_vllm_instance_id, make_async)
+                        get_vllm_instance_id, make_async,
+                        update_environment_variables)
 
 logger = init_logger(__name__)
 
@@ -25,8 +26,9 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
 
         # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
         if "CUDA_VISIBLE_DEVICES" not in os.environ:
-            os.environ["CUDA_VISIBLE_DEVICES"] = (",".join(
-                map(str, range(world_size))))
+            update_environment_variables({
+                "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
+            })
 
         # Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers
         os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id()
diff --git a/vllm/utils.py b/vllm/utils.py
index f0c7df5cf..92abdb3fb 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -376,6 +376,10 @@ def get_open_port() -> int:
 
 
 def update_environment_variables(envs: Dict[str, str]):
+    if is_hip() and "CUDA_VISIBLE_DEVICES" in envs:
+        # Propagate changes to CUDA_VISIBLE_DEVICES to
+        # ROCm's HIP_VISIBLE_DEVICES as well
+        envs["HIP_VISIBLE_DEVICES"] = envs["CUDA_VISIBLE_DEVICES"]
     for k, v in envs.items():
         if k in os.environ and os.environ[k] != v:
             logger.warning(
@@ -779,9 +783,14 @@ def _cuda_device_count_stateless(
 
     if not torch.cuda._is_compiled():
         return 0
-    # bypass _device_count_nvml() if rocm (not supported)
-    nvml_count = -1 if torch.version.hip else torch.cuda._device_count_nvml()
-    r = torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count
+    if is_hip():
+        # ROCm uses amdsmi instead of nvml for stateless device count
+        # This requires a sufficiently modern version of Torch 2.4.0
+        raw_count = torch.cuda._device_count_amdsmi() if (hasattr(
+            torch.cuda, "_device_count_amdsmi")) else -1
+    else:
+        raw_count = torch.cuda._device_count_nvml()
+    r = torch._C._cuda_getDeviceCount() if raw_count < 0 else raw_count
     return r
 
 
@@ -795,7 +804,6 @@ def cuda_device_count_stateless() -> int:
 
     # This can be removed and simply replaced with torch.cuda.get_device_count
     # after https://github.com/pytorch/pytorch/pull/122815 is released.
-
     return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
 
 
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index dc09718de..99482aa93 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -6,7 +6,7 @@ from typing import Dict, List, Optional, Set, Tuple
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
-from vllm.utils import (enable_trace_function_call_for_thread,
+from vllm.utils import (enable_trace_function_call_for_thread, is_hip,
                         update_environment_variables)
 
 logger = init_logger(__name__)
@@ -125,6 +125,14 @@ class WorkerWrapperBase:
             # overwriting CUDA_VISIBLE_DEVICES is desired behavior
             # suppress the warning in `update_environment_variables`
             del os.environ[key]
+            if is_hip():
+                hip_env_var = "HIP_VISIBLE_DEVICES"
+                if hip_env_var in os.environ:
+                    logger.warning(
+                        "Ignoring pre-set environment variable `%s=%s` as "
+                        "%s has also been set, which takes precedence.",
+                        hip_env_var, os.environ[hip_env_var], key)
+                os.environ.pop(hip_env_var, None)
         update_environment_variables(envs)
 
     def init_worker(self, *args, **kwargs):
-- 
GitLab


From f178e56c68d97e3a29a8a885a09dd61f8d534732 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Tue, 25 Jun 2024 16:58:23 -0700
Subject: [PATCH 149/376] [Hardware][TPU] Raise errors for unsupported sampling
 params (#5850)

---
 vllm/worker/tpu_model_runner.py | 63 +++++++++++++++++++++++----------
 1 file changed, 44 insertions(+), 19 deletions(-)

diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index 2d8fffe5a..2c70c1f91 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -20,6 +20,8 @@ from vllm.utils import make_tensor_with_pad
 logger = init_logger(__name__)
 
 _PAD_SLOT_ID = 0  # FIXME(woosuk)
+# FIXME(woosuk): Temporarily disabled top-p sampling since it's too slow.
+_ENABLE_TOP_P = False
 
 
 class TPUModelRunner:
@@ -339,9 +341,34 @@ class TPUModelRunner:
             assert seq_group_metadata.sampling_params is not None
             sampling_params = seq_group_metadata.sampling_params
 
+            # NOTE(woosuk): Here we mimic argmax sampling by applying a very
+            # low temperature. This is not accurate.
             t.append(sampling_params.temperature
                      if sampling_params.temperature >= 1e-5 else 1e-5)
+            if sampling_params.top_p != 1 and not _ENABLE_TOP_P:
+                raise NotImplementedError(
+                    "Top-p sampling is currently disabled for the TPU backend "
+                    "due to performance issues.")
             p.append(sampling_params.top_p)
+            if sampling_params.top_k != -1:
+                raise NotImplementedError(
+                    "Top-k sampling is currently disabled for the TPU backend "
+                    "due to performance issues.")
+            if sampling_params.best_of > 1:
+                raise NotImplementedError(
+                    "best_of > 1 is not currently supported by the TPU "
+                    "backend.")
+            if sampling_params.use_beam_search:
+                raise NotImplementedError(
+                    "Beam search is not supported by the TPU backend.")
+            if sampling_params.logprobs is not None:
+                raise NotImplementedError(
+                    "logprobs is not currently supported by the TPU backend.")
+            if sampling_params.prompt_logprobs is not None:
+                raise NotImplementedError(
+                    "prompt_logprobs is not currently supported by the TPU "
+                    "backend.")
+
         num_paddings = padded_batch_size - len(seq_group_metadata_list)
         t += [1.0] * num_paddings
         p += [1.0] * num_paddings
@@ -350,35 +377,32 @@ class TPUModelRunner:
         p = torch.tensor(p, dtype=torch.float32, device=self.device)
         return t, p
 
-    def prepare_inputs(
+    def _execute_model(
         self,
-        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
-    ):
-        assert seq_group_metadata_list is not None
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
+    ) -> List[CompletionSequenceGroupOutput]:
+        # Prepare inputs.
         assert len(seq_group_metadata_list) > 0
         # NOTE: We assume that all sequences in the group are all prompts or
         # all decodes.
-        if seq_group_metadata_list[0].is_prompt:
+        is_prompt = seq_group_metadata_list[0].is_prompt
+        if is_prompt:
             inputs = self._prepare_prompt(seq_group_metadata_list)
         else:
             inputs = self._prepare_decode(seq_group_metadata_list)
         padded_batch_size = inputs[0].shape[0]
-        sample_inputs = self._prepare_sample(seq_group_metadata_list,
-                                             padded_batch_size)
-        return inputs + sample_inputs
+        t, p = self._prepare_sample(seq_group_metadata_list, padded_batch_size)
 
-    def _execute_model(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
-    ) -> List[CompletionSequenceGroupOutput]:
-        inputs = self.prepare_inputs(seq_group_metadata_list)
+        # Execute the model.
         next_token_ids = self.model(inputs[0], inputs[1], kv_caches,
-                                    *inputs[2:])
-        if not self.is_driver_worker:
-            return []
+                                    *inputs[2:], t, p)
+        # Retrieve the outputs to CPU.
         next_token_ids = next_token_ids.cpu().tolist()
 
+        # NOTE(woosuk): Minimal code to construct the sampler outputs.
+        # The TPU backend does not reuse the sampler, since the TPU backend
+        # does not support the advanced sampling parameters such as logprobs.
         i = 0
         sampler_outputs = []
         for seq_group_metadata in seq_group_metadata_list:
@@ -400,6 +424,7 @@ class TPUModelRunner:
         kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
     ) -> SamplerOutput:
         assert seq_group_metadata_list is not None
+        assert len(seq_group_metadata_list) > 0
         if seq_group_metadata_list[0].is_prompt:
             # NOTE(woosuk): To reduce the compilation time, we only compile the
             # prefill inputs with batch size 1. Because the scheduler is not
@@ -492,8 +517,8 @@ class ModelWrapper(nn.Module):
         logits = self.model.compute_logits(hidden_states, sampling_metadata)
 
         logits = logits / t.unsqueeze(dim=1)
-        # FIXME(woosuk): Disabled top-p sampling since it's too slow.
-        # logits = _apply_top_p(logits, p.unsqueeze(dim=1))
+        if _ENABLE_TOP_P:
+            logits = _apply_top_p(logits, p.unsqueeze(dim=1))
         probs = torch.softmax(logits, dim=-1, dtype=torch.float32)
         # FIXME(woosuk): best_of > 1 is not supported.
         next_token_ids = torch.multinomial(probs, num_samples=1).squeeze(dim=1)
-- 
GitLab


From c2a8ac75e03aec19dad397a8e64377d37c67239a Mon Sep 17 00:00:00 2001
From: Thomas Parnell <tpa@zurich.ibm.com>
Date: Wed, 26 Jun 2024 01:04:08 +0100
Subject: [PATCH 150/376] [CI/Build] Add E2E tests for MLPSpeculator (#5791)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
---
 tests/spec_decode/e2e/test_mlp_correctness.py | 216 ++++++++++++++++++
 1 file changed, 216 insertions(+)
 create mode 100644 tests/spec_decode/e2e/test_mlp_correctness.py

diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py
new file mode 100644
index 000000000..9a9f2acbb
--- /dev/null
+++ b/tests/spec_decode/e2e/test_mlp_correctness.py
@@ -0,0 +1,216 @@
+"""This docstring details important information on the testing methodology.
+
+Most of the tests rely on "greedy equality", where we expect the output of
+speculative decoding on a sequence to exactly match the output of normal non-
+speculative decoding.
+
+Since speculative decoding with rejection sampling guarantees that the output
+distribution matches the target model's output distribution (up to hardware
+numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
+equality.
+
+However, we still need to verify below scenario could be passed:
+    * Batch size 1 greedy equality
+    * Batch size >1 greedy equality
+    * Test greedy equality under preemption
+    * Test greedy equality under various number of speculative tokens.
+
+With those tests, we can say at least, MLPSpeculator would not break the
+correctess for the target model outputs.
+"""
+
+import pytest
+
+from .conftest import run_greedy_equality_correctness_test
+
+# main model
+MAIN_MODEL = "ibm-granite/granite-3b-code-instruct"
+
+# speculative model
+SPEC_MODEL = "ibm-granite/granite-3b-code-instruct-accelerator"
+
+# max. number of speculative tokens: this corresponds to
+# n_predict in the config.json of the speculator model.
+MAX_SPEC_TOKENS = 5
+
+# precision
+PRECISION = "float16"
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": SPEC_MODEL,
+    },
+])
+@pytest.mark.parametrize("output_len", [
+    128,
+])
+@pytest.mark.parametrize("batch_size", [1, 32])
+@pytest.mark.parametrize("seed", [1])
+def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
+                                    batch_size: int, output_len: int):
+    """Verify greedy equality with different batch size."""
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "block_size": 8,
+        # 2 for small prompt, 256//8 for generated.
+        "num_gpu_blocks_override": 2 + 256 // 8,
+        "max_model_len": (2 + 256 // 8) * 8,
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": SPEC_MODEL,
+    },
+])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use small output len for fast test.
+        128,
+    ])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seed", [1])
+def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
+                                                    test_llm_generator,
+                                                    batch_size: int,
+                                                    output_len: int):
+    """Verify greedy equality, even when some sequences are preempted mid-
+    generation.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize(
+    "test_llm_kwargs",
+    [
+        {
+            "speculative_model": SPEC_MODEL,
+            "num_speculative_tokens": k,
+        }
+        # Try a range of num. speculative tokens
+        for k in range(1, 1 + MAX_SPEC_TOKENS)
+    ])
+@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_mlp_different_k(baseline_llm_generator, test_llm_generator,
+                         batch_size: int, output_len: int):
+    """Verify that mlp speculative decoding produces exact equality
+    to without spec decode with different values of num_speculative_tokens.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs",
+                         [{
+                             "speculative_model": SPEC_MODEL,
+                             "speculative_disable_by_batch_size": 4
+                         }])
+@pytest.mark.parametrize("batch_size", [1, 5])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_mlp_disable_queue(baseline_llm_generator, test_llm_generator,
+                           batch_size: int, output_len: int):
+    """Verify that mlp speculative decoding produces exact equality
+    to without spec decode when speculation is disabled for large
+    batch sizes.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
-- 
GitLab


From 82079729ccd0830ce77fcc5fd7ea2be3bf81ccaf Mon Sep 17 00:00:00 2001
From: aws-patlange <90803007+aws-patlange@users.noreply.github.com>
Date: Tue, 25 Jun 2024 19:52:10 -0700
Subject: [PATCH 151/376] [Bugfix] Fix assertion in NeuronExecutor (#5841)

---
 vllm/executor/neuron_executor.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
index c5e2fb0f6..1a3329749 100644
--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -48,9 +48,9 @@ class NeuronExecutor(ExecutorBase):
     def execute_model(
             self,
             execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
-        assert (execute_model_req.blocks_to_swap_in == {}
-                and execute_model_req.blocks_to_swap_out == {}
-                and execute_model_req.blocks_to_copy == {}), (
+        assert (not execute_model_req.blocks_to_swap_in
+                and not execute_model_req.blocks_to_swap_out
+                and not execute_model_req.blocks_to_copy), (
                     "Cache operations are not supported for Neuron backend.")
         assert execute_model_req.num_lookahead_slots == 0, (
             "lookahead not supported for Neuron backend.")
-- 
GitLab


From dda4811591fdb90d263bc9b8ac522436369aef13 Mon Sep 17 00:00:00 2001
From: Stephanie Wang <swang@cs.berkeley.edu>
Date: Tue, 25 Jun 2024 20:30:03 -0700
Subject: [PATCH 152/376] [Core] Refactor Worker and ModelRunner to consolidate
 control plane communication (#5408)

Signed-off-by: Stephanie Wang <swang@cs.berkeley.edu>
Signed-off-by: Stephanie <swang@anyscale.com>
Co-authored-by: Stephanie <swang@anyscale.com>
---
 tests/worker/test_model_input.py            | 152 ++++++++
 tests/worker/test_model_runner.py           |  57 +--
 vllm/attention/backends/abstract.py         |   6 +-
 vllm/attention/backends/blocksparse_attn.py |   4 +-
 vllm/attention/backends/flash_attn.py       |   4 +-
 vllm/attention/backends/flashinfer.py       |   4 +-
 vllm/attention/backends/ipex_attn.py        |   4 +-
 vllm/attention/backends/pallas.py           |   4 +-
 vllm/attention/backends/rocm_flash_attn.py  |   4 +-
 vllm/attention/backends/torch_sdpa.py       |   4 +-
 vllm/attention/backends/xformers.py         |   4 +-
 vllm/executor/distributed_gpu_executor.py   |  16 +-
 vllm/executor/executor_base.py              |   4 +-
 vllm/executor/gpu_executor.py               |   2 +-
 vllm/executor/multiproc_gpu_executor.py     |   8 +-
 vllm/executor/neuron_executor.py            |   3 +-
 vllm/executor/ray_gpu_executor.py           |   5 +-
 vllm/sequence.py                            |   3 +-
 vllm/spec_decode/mlp_speculator_worker.py   |   3 +-
 vllm/worker/cpu_model_runner.py             | 161 +++++----
 vllm/worker/cpu_worker.py                   |  85 ++---
 vllm/worker/embedding_model_runner.py       | 129 +++----
 vllm/worker/model_runner.py                 | 367 +++++++++++---------
 vllm/worker/model_runner_base.py            | 157 +++++++++
 vllm/worker/neuron_model_runner.py          |  64 +++-
 vllm/worker/neuron_worker.py                |  39 +--
 vllm/worker/worker.py                       | 129 ++-----
 vllm/worker/worker_base.py                  | 170 ++++++++-
 vllm/worker/xpu_model_runner.py             |  91 ++++-
 29 files changed, 1108 insertions(+), 575 deletions(-)
 create mode 100644 tests/worker/test_model_input.py
 create mode 100644 vllm/worker/model_runner_base.py

diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py
new file mode 100644
index 000000000..ae818ee36
--- /dev/null
+++ b/tests/worker/test_model_input.py
@@ -0,0 +1,152 @@
+import dataclasses
+from typing import List, Tuple, Type
+
+import torch
+
+from vllm.attention import AttentionMetadata
+from vllm.attention.backends.abstract import AttentionBackend
+from vllm.model_executor import SamplingMetadata
+from vllm.model_executor.pooling_metadata import PoolingMetadata
+from vllm.worker.embedding_model_runner import (
+    ModelInputForGPUWithPoolingMetadata)
+from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
+
+
+class MockAttentionBackend(AttentionBackend):
+
+    @staticmethod
+    def get_name() -> str:
+        raise NotImplementedError
+
+    @staticmethod
+    def get_impl_cls():
+        raise NotImplementedError
+
+    @staticmethod
+    def get_metadata_cls() -> Type["AttentionMetadata"]:
+        return AttentionMetadata
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> Tuple[int, ...]:
+        raise NotImplementedError
+
+    @staticmethod
+    def swap_blocks(
+        src_kv_cache: torch.Tensor,
+        dst_kv_cache: torch.Tensor,
+        src_to_dst: torch.Tensor,
+    ) -> None:
+        pass
+
+    @staticmethod
+    def copy_blocks(
+        kv_caches: List[torch.Tensor],
+        src_to_dists: torch.Tensor,
+    ) -> None:
+        pass
+
+
+def test_model_runner_input():
+    sampling_metadata = SamplingMetadata(
+        ["seq_group"],
+        "selected_token_indices",
+        "categorized_sample_indices",
+        "num_prompts",
+    )
+    attn_metadata = AttentionMetadata(
+        num_prefills=1,
+        num_prefill_tokens=2,
+        num_decode_tokens=3,
+        slot_mapping=torch.zeros(1),
+    )
+    model_input = ModelInputForGPUWithSamplingMetadata(
+        input_tokens=torch.ones(10),
+        input_positions=torch.ones(10),
+        sampling_metadata=sampling_metadata,
+        attn_metadata=attn_metadata)
+
+    assert isinstance(model_input, ModelInputForGPUWithSamplingMetadata)
+
+    # Test round trip serialization.
+    tensor_dict = model_input.as_broadcastable_tensor_dict()
+    attn_backend = MockAttentionBackend()
+    received_model_input = (
+        ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict(
+            tensor_dict, attn_backend=attn_backend))
+    # Check that received copy has correct values.
+    assert isinstance(received_model_input,
+                      ModelInputForGPUWithSamplingMetadata)
+    assert received_model_input.input_tokens is not None
+    assert (
+        received_model_input.input_tokens == model_input.input_tokens).all()
+    assert received_model_input.input_positions is not None
+    assert (received_model_input.input_positions == model_input.input_positions
+            ).all()
+    assert received_model_input.multi_modal_kwargs is None
+    assert (received_model_input.multi_modal_kwargs ==
+            model_input.multi_modal_kwargs)
+    assert received_model_input.lora_requests is None
+    assert received_model_input.lora_requests == model_input.lora_requests
+    assert received_model_input.lora_mapping is None
+    assert received_model_input.lora_mapping == model_input.lora_mapping
+    for field in dataclasses.fields(AttentionMetadata):
+        assert getattr(received_model_input.attn_metadata, field.name,
+                       None) == getattr(attn_metadata, field.name, None)
+    # For sampling metadata, only selected_token_indices is copied.
+    assert (received_model_input.sampling_metadata.selected_token_indices ==
+            sampling_metadata.selected_token_indices)
+    assert received_model_input.sampling_metadata.seq_groups is None
+
+
+def test_embedding_model_runner_input():
+    pooling_metadata = PoolingMetadata(
+        seq_groups=[[0]],
+        seq_data={},
+        prompt_lens=[1],
+    )
+    attn_metadata = AttentionMetadata(
+        num_prefills=1,
+        num_prefill_tokens=2,
+        num_decode_tokens=3,
+        slot_mapping=torch.zeros(1),
+    )
+    model_input = ModelInputForGPUWithPoolingMetadata(
+        input_tokens=torch.ones(10),
+        input_positions=torch.ones(10),
+        pooling_metadata=pooling_metadata,
+        attn_metadata=attn_metadata)
+
+    assert isinstance(model_input, ModelInputForGPUWithPoolingMetadata)
+
+    # Test round trip serialization.
+    tensor_dict = model_input.as_broadcastable_tensor_dict()
+    attn_backend = MockAttentionBackend()
+    received_model_input = (
+        ModelInputForGPUWithPoolingMetadata.from_broadcasted_tensor_dict(
+            tensor_dict, attn_backend=attn_backend))
+    # Check that received copy has correct values.
+    assert isinstance(received_model_input,
+                      ModelInputForGPUWithPoolingMetadata)
+    assert received_model_input.input_tokens is not None
+    assert (
+        received_model_input.input_tokens == model_input.input_tokens).all()
+    assert received_model_input.input_positions is not None
+    assert (received_model_input.input_positions == model_input.input_positions
+            ).all()
+    assert received_model_input.multi_modal_kwargs is None
+    assert (received_model_input.multi_modal_kwargs ==
+            model_input.multi_modal_kwargs)
+    assert received_model_input.lora_requests is None
+    assert received_model_input.lora_requests == model_input.lora_requests
+    assert received_model_input.lora_mapping is None
+    assert received_model_input.lora_mapping == model_input.lora_mapping
+    for field in dataclasses.fields(AttentionMetadata):
+        assert getattr(received_model_input.attn_metadata, field.name,
+                       None) == getattr(attn_metadata, field.name, None)
+    # Pooling metadata is not broadcast.
+    assert received_model_input.pooling_metadata is None
diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py
index dd0d3bf50..e1775790c 100644
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -61,12 +61,13 @@ def test_prepare_prompt(batch_size):
         expected_selected_token_indices.append(selected_token_start_idx +
                                                seq_len - 1)
         selected_token_start_idx += seq_len
-    model_input = model_runner._prepare_model_input(seq_group_metadata_list)
+    model_input = model_runner._prepare_model_input_tensors(
+        seq_group_metadata_list)
     input_tokens = model_input.input_tokens
     input_positions = model_input.input_positions
     attn_metadata = model_input.attn_metadata
     return_seq_lens = model_input.seq_lens
-    slot_mapping = model_input.slot_mapping
+    slot_mapping = attn_metadata.slot_mapping
     assert return_seq_lens == seq_lens
     assert len(slot_mapping) == len(input_tokens)
 
@@ -174,10 +175,11 @@ def test_prepare_decode_cuda_graph(batch_size):
         assert seq_group_metadata.token_chunk_size == 1
         seq_group_metadata_list.append(seq_group_metadata)
 
-    model_input = model_runner._prepare_model_input(seq_group_metadata_list)
+    model_input = model_runner._prepare_model_input_tensors(
+        seq_group_metadata_list)
     input_tokens, input_positions, attn_metadata, slot_mapping = (
         model_input.input_tokens, model_input.input_positions,
-        model_input.attn_metadata, model_input.slot_mapping)
+        model_input.attn_metadata, model_input.attn_metadata.slot_mapping)
     assert len(slot_mapping) == len(input_tokens)
 
     expected_bs = _get_graph_batch_size(len(seq_group_metadata_list))
@@ -259,32 +261,29 @@ def test_empty_seq_group():
         enforce_eager=False,
     )
     seq_group_metadata_list: List[SequenceGroupMetadata] = []
-    model_input = model_runner._prepare_model_input(seq_group_metadata_list)
-    input_tokens, input_positions, attn_metadata, slot_mapping = (
+    model_input = model_runner._prepare_model_input_tensors(
+        seq_group_metadata_list)
+    input_tokens, input_positions, attn_metadata = (
         model_input.input_tokens,
         model_input.input_positions,
         model_input.attn_metadata,
-        model_input.slot_mapping,
     )
-    assert len(input_tokens) == 0
-    assert len(input_positions) == 0
+    assert input_tokens is None
+    assert input_positions is None
     assert attn_metadata is None
-    assert len(slot_mapping) == 0
-
-    model_input = model_runner._prepare_model_input(seq_group_metadata_list)
-    (input_tokens, input_positions, attn_metadata, slot_mapping,
-     return_seq_lens) = (
-         model_input.input_tokens,
-         model_input.input_positions,
-         model_input.attn_metadata,
-         model_input.slot_mapping,
-         model_input.seq_lens,
-     )
-    assert len(input_tokens) == 0
-    assert len(input_positions) == 0
+
+    model_input = model_runner._prepare_model_input_tensors(
+        seq_group_metadata_list)
+    (input_tokens, input_positions, attn_metadata, return_seq_lens) = (
+        model_input.input_tokens,
+        model_input.input_positions,
+        model_input.attn_metadata,
+        model_input.seq_lens,
+    )
+    assert input_tokens is None
+    assert input_positions is None
     assert attn_metadata is None
-    assert len(slot_mapping) == 0
-    assert len(return_seq_lens) == 0
+    assert return_seq_lens is None
 
 
 @pytest.fixture
@@ -353,8 +352,12 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init):
         seq_group_metadata_list.append(seq_group_metadata)
         decode_metadata_list.append(seq_group_metadata)
 
-    (input_tokens, input_positions, attn_metadata, _, _, _,
-     _) = model_runner.prepare_input_tensors(seq_group_metadata_list)
+    model_input = model_runner.prepare_model_input(seq_group_metadata_list)
+    (input_tokens, input_positions, attn_metadata) = (
+        model_input.input_tokens,
+        model_input.input_positions,
+        model_input.attn_metadata,
+    )
 
     prefill_meta_actual = attn_metadata.prefill_metadata
     decode_meta_actual = attn_metadata.decode_metadata
@@ -367,7 +370,7 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init):
 
     # Verify attn metadata is consistent. We don't need to test individual
     # values here because they are tested above.
-    attn_metadata = model_runner._prepare_model_input(
+    attn_metadata = model_runner._prepare_model_input_tensors(
         seq_group_metadata_list).attn_metadata
 
     for attr_expected, attr_actual in zip(vars(attn_metadata.prefill_metadata),
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index 6396103bf..40768532f 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -21,9 +21,13 @@ class AttentionBackend(ABC):
 
     @staticmethod
     @abstractmethod
-    def make_metadata(*args, **kwargs) -> "AttentionMetadata":
+    def get_metadata_cls() -> Type["AttentionMetadata"]:
         raise NotImplementedError
 
+    @classmethod
+    def make_metadata(cls, *args, **kwargs) -> "AttentionMetadata":
+        return cls.get_metadata_cls()(*args, **kwargs)
+
     @staticmethod
     @abstractmethod
     def get_kv_cache_shape(
diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py
index dce2b8361..7b4578fcd 100644
--- a/vllm/attention/backends/blocksparse_attn.py
+++ b/vllm/attention/backends/blocksparse_attn.py
@@ -90,8 +90,8 @@ class BlocksparseFlashAttentionBackend(AttentionBackend):
         return BlocksparseFlashAttentionImpl
 
     @staticmethod
-    def make_metadata(*args, **kwargs) -> "BlocksparseFlashAttentionMetadata":
-        return BlocksparseFlashAttentionMetadata(*args, **kwargs)
+    def get_metadata_cls() -> Type["AttentionMetadata"]:
+        return BlocksparseFlashAttentionMetadata
 
     @staticmethod
     def get_kv_cache_shape(
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 1c48e2a0b..8cb5c3101 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -25,8 +25,8 @@ class FlashAttentionBackend(AttentionBackend):
         return FlashAttentionImpl
 
     @staticmethod
-    def make_metadata(*args, **kwargs) -> "FlashAttentionMetadata":
-        return FlashAttentionMetadata(*args, **kwargs)
+    def get_metadata_cls() -> Type["AttentionMetadata"]:
+        return FlashAttentionMetadata
 
     @staticmethod
     def get_kv_cache_shape(
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 7b7959d25..535d30b55 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -22,8 +22,8 @@ class FlashInferBackend(AttentionBackend):
         return FlashInferImpl
 
     @staticmethod
-    def make_metadata(*args, **kwargs) -> "FlashInferMetadata":
-        return FlashInferMetadata(*args, **kwargs)
+    def get_metadata_cls() -> Type["AttentionMetadata"]:
+        return FlashInferMetadata
 
     @staticmethod
     def get_kv_cache_shape(
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index f09b24f2a..5114bfa6e 100644
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
@@ -25,8 +25,8 @@ class IpexAttnBackend(AttentionBackend):
         return IpexAttnBackendImpl
 
     @staticmethod
-    def make_metadata(*args, **kwargs) -> "IpexAttnMetadata":
-        return IpexAttnMetadata(*args, **kwargs)
+    def get_metadata_cls() -> Type["IpexAttnMetadata"]:
+        return IpexAttnMetadata
 
     @staticmethod
     def get_kv_cache_shape(
diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index b203c5ec5..62b4a144f 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -16,8 +16,8 @@ class PallasAttentionBackend(AttentionBackend):
         return PallasAttentionBackendImpl
 
     @staticmethod
-    def make_metadata(*args, **kwargs) -> "PallasMetadata":
-        return PallasMetadata(*args, **kwargs)
+    def get_metadata_cls() -> Type["PallasMetadata"]:
+        return PallasMetadata
 
     @staticmethod
     def get_kv_cache_shape(
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 9294068c6..81fabdbdf 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -25,8 +25,8 @@ class ROCmFlashAttentionBackend(AttentionBackend):
         return ROCmFlashAttentionImpl
 
     @staticmethod
-    def make_metadata(*args, **kwargs) -> "ROCmFlashAttentionMetadata":
-        return ROCmFlashAttentionMetadata(*args, **kwargs)
+    def get_metadata_cls() -> Type["AttentionMetadata"]:
+        return ROCmFlashAttentionMetadata
 
     @staticmethod
     def get_kv_cache_shape(
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index c01e0a0a3..63f8466da 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -31,8 +31,8 @@ class TorchSDPABackend(AttentionBackend):
         return TorchSDPABackendImpl
 
     @staticmethod
-    def make_metadata(*args, **kwargs) -> "TorchSDPAMetadata":
-        return TorchSDPAMetadata(*args, **kwargs)
+    def get_metadata_cls() -> Type["AttentionMetadata"]:
+        return TorchSDPAMetadata
 
     @staticmethod
     def get_kv_cache_shape(
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 0fecd9f6e..ff449c3ff 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -28,8 +28,8 @@ class XFormersBackend(AttentionBackend):
         return XFormersImpl
 
     @staticmethod
-    def make_metadata(*args, **kwargs) -> "XFormersMetadata":
-        return XFormersMetadata(*args, **kwargs)
+    def get_metadata_cls() -> Type["AttentionMetadata"]:
+        return XFormersMetadata
 
     @staticmethod
     def get_kv_cache_shape(
diff --git a/vllm/executor/distributed_gpu_executor.py b/vllm/executor/distributed_gpu_executor.py
index 235b5bc47..d8693e636 100644
--- a/vllm/executor/distributed_gpu_executor.py
+++ b/vllm/executor/distributed_gpu_executor.py
@@ -64,8 +64,8 @@ class DistributedGPUExecutor(GPUExecutor):
                           num_cpu_blocks=num_cpu_blocks)
 
     def execute_model(
-            self,
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        self, execute_model_req: ExecuteModelRequest
+    ) -> Optional[List[SamplerOutput]]:
         if self.parallel_worker_tasks is None:
             self.parallel_worker_tasks = self._run_workers(
                 "start_worker_execution_loop",
@@ -79,7 +79,7 @@ class DistributedGPUExecutor(GPUExecutor):
         if self.parallel_worker_tasks is None:
             return
 
-        self._driver_execute_model()
+        self._driver_execute_model(execute_model_req=None)
         parallel_worker_tasks = self.parallel_worker_tasks
         self.parallel_worker_tasks = None
         # Ensure that workers exit model loop cleanly
@@ -123,13 +123,13 @@ class DistributedGPUExecutor(GPUExecutor):
 
     @abstractmethod
     def _driver_execute_model(
-        self,
-        execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> List[SamplerOutput]:
+        self, execute_model_req: Optional[ExecuteModelRequest]
+    ) -> Optional[List[SamplerOutput]]:
         """Run execute_model in the driver worker.
 
-        Passing None will cause the driver to stop the model execution
-        loop running in each of the remote workers.
+        Passing None will cause the driver to stop the model execution loop
+        running in each of the remote workers. In this case, this method
+        returns None. Otherwise, this method returns the model output.
         """
         raise NotImplementedError
 
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 7c2520b5a..d7c19622e 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -69,8 +69,8 @@ class ExecutorBase(ABC):
 
     @abstractmethod
     def execute_model(
-            self,
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        self, execute_model_req: ExecuteModelRequest
+    ) -> Optional[List[SamplerOutput]]:
         """Executes at least one model step on the given sequences."""
         raise NotImplementedError
 
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index 0a654200e..5522b5322 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -87,7 +87,7 @@ class GPUExecutor(ExecutorBase):
 
     def execute_model(
         self, execute_model_req: ExecuteModelRequest
-    ) -> List[Union[SamplerOutput, PoolerOutput]]:
+    ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
         output = self.driver_worker.execute_model(execute_model_req)
         return output
 
diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py
index a5b1d27f2..6aebb4702 100644
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/multiproc_gpu_executor.py
@@ -78,16 +78,14 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
             worker_monitor.close()
 
     def _driver_execute_model(
-        self,
-        execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> List[SamplerOutput]:
+        self, execute_model_req: Optional[ExecuteModelRequest]
+    ) -> Optional[List[SamplerOutput]]:
         """Run execute_model in the driver worker.
 
         Passing None will cause the driver to stop the model execution
         loop running in each of the remote workers.
         """
-        return self.driver_worker.execute_model(
-            execute_model_req=execute_model_req)
+        return self.driver_worker.execute_model(execute_model_req)
 
     def _run_workers(
         self,
diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
index 1a3329749..53107dada 100644
--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -55,8 +55,7 @@ class NeuronExecutor(ExecutorBase):
         assert execute_model_req.num_lookahead_slots == 0, (
             "lookahead not supported for Neuron backend.")
 
-        output = self.driver_worker.execute_model(
-            execute_model_req.seq_group_metadata_list)
+        output = self.driver_worker.execute_model(execute_model_req)
         return output
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index fc83c5528..faa500c2d 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -190,9 +190,8 @@ class RayGPUExecutor(DistributedGPUExecutor):
                           max_parallel_loading_workers)
 
     def _driver_execute_model(
-        self,
-        execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> List[SamplerOutput]:
+        self, execute_model_req: Optional[ExecuteModelRequest]
+    ) -> Optional[List[SamplerOutput]]:
         """Run execute_model in the driver worker.
 
         Passing None will cause the driver to stop the model execution
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 287e1b9df..0925d1546 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -887,7 +887,8 @@ class HiddenStates:
 
 @dataclass
 class ExecuteModelRequest:
-    """The model execution request."""
+    """The model execution request, containing CPU metadata only. The LLM
+    engine should create an instance of this class for each request batch."""
     # The sequence group metadata list.
     seq_group_metadata_list: List[SequenceGroupMetadata]
     # Blocks to swap in. List of CPU -> GPU block number.
diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py
index 0926e13be..6c1c8da57 100644
--- a/vllm/spec_decode/mlp_speculator_worker.py
+++ b/vllm/spec_decode/mlp_speculator_worker.py
@@ -7,7 +7,6 @@ from vllm.sequence import (ExecuteModelRequest, SamplerOutput,
                            SequenceGroupMetadata)
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase
-from vllm.worker.model_runner import ModelInput
 
 
 class MLPSpeculatorWorker(NonLLMProposerWorkerBase, MultiStepWorker):
@@ -56,7 +55,7 @@ class MLPSpeculatorWorker(NonLLMProposerWorkerBase, MultiStepWorker):
         seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
     ) -> Tuple[torch.Tensor, List[int], List[int]]:
         if not seq_group_metadata_list:
-            return ModelInput.empty(self.device)
+            return torch.empty(0, device=self.device), [], []
 
         input_tokens: List[int] = []
         seq_lens: List[int] = []
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index d539f5693..e3464c0d3 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -1,5 +1,6 @@
 from collections import defaultdict
-from typing import Dict, List, Optional, Tuple
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
 
 import torch
 from torch import nn
@@ -8,20 +9,64 @@ from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, ParallelConfig, SchedulerConfig,
                          VisionLanguageConfig)
-from vllm.distributed import broadcast_tensor_dict
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.utils import make_tensor_with_pad
+from vllm.worker.model_runner_base import (
+    ModelRunnerBase, ModelRunnerInputBase,
+    _add_attn_metadata_broadcastable_dict,
+    _add_sampling_metadata_broadcastable_dict,
+    _init_attn_metadata_from_tensor_dict,
+    _init_sampling_metadata_from_tensor_dict)
+
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionBackend
 
 logger = init_logger(__name__)
 
 _PAD_SLOT_ID = -1
 
 
-class CPUModelRunner:
+@dataclass(frozen=True)
+class CPUModelInput(ModelRunnerInputBase):
+    """
+    Used by the CPUModelRunner.
+    """
+    input_tokens: Optional[torch.Tensor] = None
+    input_positions: Optional[torch.Tensor] = None
+    attn_metadata: Optional["AttentionMetadata"] = None
+    sampling_metadata: Optional["SamplingMetadata"] = None
+    multi_modal_kwargs: Optional[Dict[str, torch.Tensor]] = None
+
+    def as_broadcastable_tensor_dict(
+            self) -> Dict[str, Union[int, torch.Tensor]]:
+        tensor_dict = {
+            "input_tokens": self.input_tokens,
+            "input_positions": self.input_positions,
+            "multi_modal_kwargs": self.multi_modal_kwargs,
+        }
+        _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata)
+        _add_sampling_metadata_broadcastable_dict(tensor_dict,
+                                                  self.sampling_metadata)
+        return tensor_dict
+
+    @classmethod
+    def from_broadcasted_tensor_dict(
+            cls: Type["CPUModelInput"],
+            tensor_dict: Dict[str, Any],
+            attn_backend: Optional["AttentionBackend"] = None
+    ) -> "CPUModelInput":
+        tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict)
+        if attn_backend is not None:
+            tensor_dict = _init_attn_metadata_from_tensor_dict(
+                attn_backend, tensor_dict)
+        return cls(**tensor_dict)
+
+
+class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
 
     def __init__(
         self,
@@ -270,86 +315,70 @@ class CPUModelRunner:
             attn_metadata,
         )
 
-    def prepare_input_tensors(
+    def make_model_input_from_broadcasted_tensor_dict(
+        self,
+        tensor_dict: Dict[str, Any],
+    ) -> CPUModelInput:
+        return CPUModelInput.from_broadcasted_tensor_dict(
+            tensor_dict,
+            attn_backend=self.attn_backend,
+        )
+
+    def prepare_model_input(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata,
-               Optional[Dict[str, torch.Tensor]]]:
+    ) -> CPUModelInput:
         multi_modal_kwargs = None
-        if self.is_driver_worker:
-            # NOTE: We assume that all sequences in the group are all prompts or
-            # all decodes.
-            is_prompt = seq_group_metadata_list[0].is_prompt
-            # Prepare input tensors.
-            if is_prompt:
-                (input_tokens, input_positions, attn_metadata, seq_lens,
-                 multi_modal_kwargs
-                 ) = self._prepare_prompt(seq_group_metadata_list)
-            else:
-                (input_tokens, input_positions,
-                 attn_metadata) = self._prepare_decode(seq_group_metadata_list)
-                seq_lens = []
-            sampling_metadata = SamplingMetadata.prepare(
-                seq_group_metadata_list,
-                seq_lens,
-                # query_lens is not needed if chunked prefill is not
-                # supported. Since CPU worker doesn't support chunked prefill
-                # just use seq_lens instead.
-                seq_lens,
-                self.device,
-                pin_memory=False)
-            # Broadcast the metadata.
-            metadata_dict = {
-                "input_tokens": input_tokens,
-                "input_positions": input_positions,
-                "selected_token_indices":
-                sampling_metadata.selected_token_indices,
-            }
-            metadata_dict.update(attn_metadata.asdict_zerocopy())
-            broadcast_tensor_dict(metadata_dict, src=0)
+        # NOTE: We assume that all sequences in the group are all prompts or
+        # all decodes.
+        is_prompt = seq_group_metadata_list[0].is_prompt
+        # Prepare input tensors.
+        if is_prompt:
+            (input_tokens, input_positions, attn_metadata, seq_lens,
+             multi_modal_kwargs
+             ) = self._prepare_prompt(seq_group_metadata_list)
         else:
-            metadata_dict = broadcast_tensor_dict(src=0)
-            input_tokens = metadata_dict.pop("input_tokens")
-            input_positions = metadata_dict.pop("input_positions")
-            selected_token_indices = metadata_dict.pop(
-                "selected_token_indices")
-            attn_metadata = self.attn_backend.make_metadata(**metadata_dict)
-            sampling_metadata = SamplingMetadata(
-                seq_groups=None,
-                seq_data=None,
-                seq_lens=None,
-                selected_token_indices=selected_token_indices,
-                categorized_sample_indices=None,
-                generators=None,
-            )
-
-        return (input_tokens, input_positions, attn_metadata,
-                sampling_metadata, multi_modal_kwargs)
+            (input_tokens, input_positions,
+             attn_metadata) = self._prepare_decode(seq_group_metadata_list)
+            seq_lens = []
+        sampling_metadata = SamplingMetadata.prepare(
+            seq_group_metadata_list,
+            seq_lens,
+            # query_lens is not needed if chunked prefill is not
+            # supported. Since CPU worker doesn't support chunked prefill
+            # just use seq_lens instead.
+            seq_lens,
+            self.device,
+            pin_memory=False)
+        return CPUModelInput(
+            input_tokens=input_tokens,
+            input_positions=input_positions,
+            attn_metadata=attn_metadata,
+            sampling_metadata=sampling_metadata,
+        )
 
     @torch.inference_mode()
     def execute_model(
         self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
+        model_input: CPUModelInput,
         kv_caches: List[torch.Tensor],
     ) -> Optional[SamplerOutput]:
-        (input_tokens, input_positions, attn_metadata, sampling_metadata,
-         multi_modal_input
-         ) = self.prepare_input_tensors(seq_group_metadata_list)
-
         model_executable = self.model
         execute_model_kwargs = {
-            "input_ids": input_tokens,
-            "positions": input_positions,
+            "input_ids": model_input.input_tokens,
+            "positions": model_input.input_positions,
             "kv_caches": kv_caches,
-            "attn_metadata": attn_metadata,
+            "attn_metadata": model_input.attn_metadata,
         }
-        if self.vision_language_config and multi_modal_input is not None:
-            execute_model_kwargs.update(multi_modal_input)
+        if (self.vision_language_config
+                and model_input.multi_modal_kwargs is not None):
+            execute_model_kwargs.update(model_input.multi_modal_kwargs)
 
         hidden_states = model_executable(**execute_model_kwargs)
 
         # Compute the logits.
-        logits = self.model.compute_logits(hidden_states, sampling_metadata)
+        logits = self.model.compute_logits(hidden_states,
+                                           model_input.sampling_metadata)
 
         # Only perform sampling in the driver worker.
         if not self.is_driver_worker:
@@ -358,6 +387,6 @@ class CPUModelRunner:
         # Sample the next token.
         output = self.model.sample(
             logits=logits,
-            sampling_metadata=sampling_metadata,
+            sampling_metadata=model_input.sampling_metadata,
         )
         return output
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 914df0c7d..30ee262c7 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -1,5 +1,5 @@
 """A CPU worker class."""
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import torch
 import torch.distributed
@@ -8,15 +8,15 @@ from vllm.attention import get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, ParallelConfig, SchedulerConfig,
                          VisionLanguageConfig)
-from vllm.distributed import (broadcast_tensor_dict,
-                              ensure_model_parallel_initialized,
+from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
-from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.sequence import ExecuteModelRequest
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 from vllm.worker.cpu_model_runner import CPUModelRunner
-from vllm.worker.worker_base import LoraNotSupportedWorkerBase
+from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
+                                     LoraNotSupportedWorkerBase, WorkerInput)
 
 logger = init_logger(__name__)
 
@@ -110,7 +110,7 @@ class CPUCacheEngine:
         return dtype_size * total
 
 
-class CPUWorker(LoraNotSupportedWorkerBase):
+class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
     """A worker class that executes (a partition of) the model on a CPU socket.
 
     Each worker is associated with a single CPU socket. The worker is 
@@ -154,7 +154,7 @@ class CPUWorker(LoraNotSupportedWorkerBase):
             # note: lazy import to avoid importing torch before initializing
             from vllm.utils import init_cached_hf_modules
             init_cached_hf_modules()
-        self.model_runner = CPUModelRunner(
+        self.model_runner: CPUModelRunner = CPUModelRunner(
             model_config,
             parallel_config,
             scheduler_config,
@@ -255,54 +255,37 @@ class CPUWorker(LoraNotSupportedWorkerBase):
         for layer_cache in self.cpu_cache:
             layer_cache.fill_(0)
 
-    def cache_copy(
+    @property
+    def do_metadata_broadcast(self) -> bool:
+        return self.parallel_config.tensor_parallel_size > 1
+
+    @property
+    def kv_cache(self) -> Optional[List[torch.Tensor]]:
+        return self.cpu_cache
+
+    def execute_worker(
         self,
-        blocks_to_copy: torch.Tensor,
+        worker_input: WorkerInput,
     ) -> None:
-        if blocks_to_copy.numel() > 0:
-            self.cache_engine.copy(blocks_to_copy)
+        if (worker_input.blocks_to_copy is not None
+                and worker_input.blocks_to_copy.numel() > 0):
+            self.cache_engine.copy(worker_input.blocks_to_copy)
 
     @torch.inference_mode()
-    def execute_model(
-        self,
-        execute_model_req: Optional[ExecuteModelRequest] = None,
-    ) -> List[SamplerOutput]:
-
-        if execute_model_req is None:
-            seq_group_metadata_list = None
-        else:
-            seq_group_metadata_list = execute_model_req.seq_group_metadata_list
-
-        if self.is_driver_worker:
-            assert seq_group_metadata_list is not None
-            num_seq_groups: int = len(seq_group_metadata_list)
-            assert execute_model_req is not None
-            blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy,
-                                          device="cpu",
-                                          dtype=torch.int64).view(-1, 2)
-            assert len(execute_model_req.blocks_to_swap_in) == 0
-            assert len(execute_model_req.blocks_to_swap_out) == 0
-            data: Dict[str, Any] = {
-                "num_seq_groups": num_seq_groups,
-                "blocks_to_copy": execute_model_req.blocks_to_copy,
-            }
-            broadcast_tensor_dict(data, src=0)
-        else:
-            data = broadcast_tensor_dict(src=0)
-            num_seq_groups = data["num_seq_groups"]
-            blocks_to_copy = data["blocks_to_copy"]
-
-        self.cache_copy(blocks_to_copy)
-
-        # If there is no input, we don't need to execute the model.
-        if num_seq_groups == 0:
-            return []
-
-        output = self.model_runner.execute_model(seq_group_metadata_list,
-                                                 self.cpu_cache)
-
-        # CPU worker only supports single-step execution.
-        return [output]
+    def prepare_worker_input(
+            self, execute_model_req: ExecuteModelRequest) -> WorkerInput:
+        assert execute_model_req is not None
+        num_seq_groups: int = len(execute_model_req.seq_group_metadata_list)
+        blocks_to_copy = execute_model_req.blocks_to_copy
+        blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy,
+                                      device="cpu",
+                                      dtype=torch.int64).view(-1, 2)
+        assert len(execute_model_req.blocks_to_swap_in) == 0
+        assert len(execute_model_req.blocks_to_swap_out) == 0
+        return WorkerInput(
+            num_seq_groups=num_seq_groups,
+            blocks_to_copy=blocks_to_copy,
+        )
 
     def init_distributed_environment(self) -> None:
         """Initialize the distributed environment."""
diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py
index 465130d10..3c8dfa2c6 100644
--- a/vllm/worker/embedding_model_runner.py
+++ b/vllm/worker/embedding_model_runner.py
@@ -1,24 +1,32 @@
-from typing import Dict, List, Optional, Set, Tuple
+import dataclasses
+from typing import Any, Dict, List, Optional, Tuple, Type
 
 import torch
 
-from vllm.attention import AttentionMetadata
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, ParallelConfig, SchedulerConfig,
                          VisionLanguageConfig)
-from vllm.distributed import broadcast_tensor_dict
 from vllm.logger import init_logger
-from vllm.lora.layers import LoRAMapping
-from vllm.lora.request import LoRARequest
 from vllm.model_executor.pooling_metadata import PoolingMetadata
 from vllm.pooling_params import PoolingParams
 from vllm.sequence import PoolerOutput, SequenceData, SequenceGroupMetadata
-from vllm.worker.model_runner import ModelRunner
+from vllm.worker.model_runner import GPUModelRunnerBase, ModelInputForGPU
 
 logger = init_logger(__name__)
 
 
-class EmbeddingModelRunner(ModelRunner):
+@dataclasses.dataclass(frozen=True)
+class ModelInputForGPUWithPoolingMetadata(ModelInputForGPU):
+    """
+    Used by the EmbeddingModelRunner.
+    """
+    pooling_metadata: Optional["PoolingMetadata"] = None
+
+
+class EmbeddingModelRunner(
+        GPUModelRunnerBase[ModelInputForGPUWithPoolingMetadata]):
+    _model_input_cls: Type[ModelInputForGPUWithPoolingMetadata] = (
+        ModelInputForGPUWithPoolingMetadata)
 
     def __init__(
         self,
@@ -47,21 +55,22 @@ class EmbeddingModelRunner(ModelRunner):
     @torch.inference_mode()
     def execute_model(
         self,
-        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+        model_input: ModelInputForGPUWithPoolingMetadata,
         kv_caches: List[torch.Tensor],
     ) -> Optional[PoolerOutput]:
-        (input_tokens, input_positions, attn_metadata, pooling_metadata,
-         lora_requests, lora_mapping, multi_modal_input
-         ) = self.prepare_input_tensors(seq_group_metadata_list)
-
         if self.lora_config:
-            self.set_active_loras(lora_requests, lora_mapping)
+            assert model_input.lora_requests is not None
+            assert model_input.lora_mapping is not None
+            self.set_active_loras(model_input.lora_requests,
+                                  model_input.lora_mapping)
 
         # Currently cuda graph is only supported by the decode phase.
-        prefill_meta = attn_metadata.prefill_metadata
-        decode_meta = attn_metadata.decode_metadata
+        assert model_input.attn_metadata is not None
+        prefill_meta = model_input.attn_metadata.prefill_metadata
+        decode_meta = model_input.attn_metadata.decode_metadata
         if prefill_meta is None and decode_meta.use_cuda_graph:
-            graph_batch_size = input_tokens.shape[0]
+            assert model_input.input_tokens is not None
+            graph_batch_size = model_input.input_tokens.shape[0]
             model_executable = self.graph_runners[graph_batch_size]
         else:
             model_executable = self.model
@@ -70,13 +79,14 @@ class EmbeddingModelRunner(ModelRunner):
         kv_caches = [None] * num_layers
 
         execute_model_kwargs = {
-            "input_ids": input_tokens,
-            "positions": input_positions,
+            "input_ids": model_input.input_tokens,
+            "positions": model_input.input_positions,
             "kv_caches": kv_caches,
-            "attn_metadata": attn_metadata,
+            "attn_metadata": model_input.attn_metadata,
         }
         if self.vision_language_config:
-            execute_model_kwargs.update({"image_input": multi_modal_input})
+            multi_modal_kwargs = model_input.multi_modal_kwargs or {}
+            execute_model_kwargs.update({"image_input": multi_modal_kwargs})
         hidden_states = model_executable(**execute_model_kwargs)
 
         # Only perform pooling in the driver worker.
@@ -84,66 +94,31 @@ class EmbeddingModelRunner(ModelRunner):
             return None
 
         return self.model.pooler(hidden_states=hidden_states,
-                                 pooling_metadata=pooling_metadata)
+                                 pooling_metadata=model_input.pooling_metadata)
+
+    def make_model_input_from_broadcasted_tensor_dict(
+            self,
+            tensor_dict: Dict[str,
+                              Any]) -> ModelInputForGPUWithPoolingMetadata:
+        return ModelInputForGPUWithPoolingMetadata.from_broadcasted_tensor_dict(
+            tensor_dict,
+            attn_backend=self.attn_backend,
+        )
 
-    def prepare_input_tensors(
+    def prepare_model_input(
         self,
         seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
-    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, PoolingMetadata,
-               Set[LoRARequest], LoRAMapping, Dict[str, torch.Tensor]]:
-        if self.is_driver_worker:
-            assert seq_group_metadata_list is not None
-            # Prepare input tensors.
-            (
-                input_tokens,
-                input_positions,
-                attn_metadata,
-                seq_lens,
-                _,
-                lora_mapping,
-                lora_requests,
-                multi_modal_kwargs,
-                slot_mapping,
-                num_prefill_tokens,
-                num_decode_tokens,
-                num_prefills,
-            ) = self._prepare_model_input(seq_group_metadata_list)
-            # Prepare PoolingMetadata
-            pooling_metadata = self._prepare_pooling(seq_group_metadata_list,
-                                                     seq_lens)
-
-            metadata_dict = {
-                "input_tokens": input_tokens,
-                "input_positions": input_positions,
-                "lora_requests": lora_requests,
-                "lora_mapping": lora_mapping,
-                "multi_modal_kwargs": multi_modal_kwargs,
-                "num_prefill_tokens": num_prefill_tokens,
-                "num_decode_tokens": num_decode_tokens,
-                "slot_mapping": slot_mapping,
-                "num_prefills": num_prefills,
-            }
-            if attn_metadata:
-                metadata_dict.update(attn_metadata.asdict_zerocopy())
-            broadcast_tensor_dict(metadata_dict, src=0)
-        else:
-            metadata_dict = broadcast_tensor_dict(src=0)
-            input_tokens = metadata_dict.pop("input_tokens")
-            input_positions = metadata_dict.pop("input_positions")
-            lora_mapping = metadata_dict.pop("lora_mapping")
-            lora_requests = metadata_dict.pop("lora_requests")
-            multi_modal_kwargs = metadata_dict.pop("multi_modal_kwargs")
-            if metadata_dict:
-                attn_metadata = self.attn_backend.make_metadata(
-                    **metadata_dict)
-            else:
-                attn_metadata = None
-            pooling_metadata = PoolingMetadata(seq_groups=None,
-                                               seq_data=None,
-                                               prompt_lens=None)
-
-        return (input_tokens, input_positions, attn_metadata, pooling_metadata,
-                lora_requests, lora_mapping, multi_modal_kwargs)
+    ) -> ModelInputForGPUWithPoolingMetadata:
+        assert seq_group_metadata_list is not None
+        model_input = self._prepare_model_input_tensors(
+            seq_group_metadata_list)
+        # Prepare PoolingMetadata.
+        assert model_input.seq_lens is not None
+        pooling_metadata = self._prepare_pooling(seq_group_metadata_list,
+                                                 model_input.seq_lens)
+
+        return dataclasses.replace(model_input,
+                                   pooling_metadata=pooling_metadata)
 
     def _prepare_pooling(
         self,
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index a321eafce..9fdb2ea5d 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1,8 +1,10 @@
+import dataclasses
 import gc
 import time
 import warnings
 from collections import defaultdict
-from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Union
+from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Type,
+                    TypeVar, Union)
 
 import numpy as np
 import torch
@@ -12,7 +14,6 @@ from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, ParallelConfig, SchedulerConfig,
                          VisionLanguageConfig)
-from vllm.distributed import broadcast_tensor_dict
 from vllm.distributed.parallel_state import graph_capture
 from vllm.logger import init_logger
 from vllm.lora.layers import LoRAMapping
@@ -26,6 +27,15 @@ from vllm.sampling_params import SamplingParams
 from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
 from vllm.utils import (CudaMemoryProfiler, get_kv_cache_torch_dtype, is_hip,
                         is_pin_memory_available, make_tensor_with_pad)
+from vllm.worker.model_runner_base import (
+    ModelRunnerBase, ModelRunnerInputBase,
+    _add_attn_metadata_broadcastable_dict,
+    _add_sampling_metadata_broadcastable_dict,
+    _init_attn_metadata_from_tensor_dict,
+    _init_sampling_metadata_from_tensor_dict)
+
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionBackend
 
 logger = init_logger(__name__)
 
@@ -39,40 +49,90 @@ _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [
 ]
 _NUM_WARMUP_ITERS = 2
 
+TModelInputForGPU = TypeVar('TModelInputForGPU', bound="ModelInputForGPU")
 
-class ModelInput(NamedTuple):
-    input_tokens: torch.Tensor
-    input_positions: torch.Tensor
-    attn_metadata: Optional[AttentionMetadata]
-    seq_lens: List[int]
-    query_lens: List[int]
-    lora_mapping: Optional[LoRAMapping]
-    lora_requests: Set[LoRARequest]
-    multi_modal_kwargs: Dict[str, torch.Tensor]
-    slot_mapping: torch.Tensor
-    num_prefill_tokens: int
-    num_decode_tokens: int
-    num_prefills: int
 
-    @classmethod
-    def empty(cls, device):
-        return ModelInput(
-            input_tokens=torch.empty(0, device=device),
-            input_positions=torch.empty(0, device=device),
-            attn_metadata=None,
-            seq_lens=[],
-            query_lens=[],
-            lora_mapping=None,
-            lora_requests=set(),
-            multi_modal_kwargs={},
-            slot_mapping=torch.empty(0, device=device),
-            num_prefill_tokens=0,
-            num_decode_tokens=0,
-            num_prefills=0,
-        )
+@dataclasses.dataclass(frozen=True)
+class ModelInputForGPU(ModelRunnerInputBase):
+    """
+    This base class contains metadata needed for the base model forward pass
+    but not metadata for possible additional steps, e.g., sampling. Model
+    runners that run additional steps should subclass this method to add
+    additional fields.
+    """
+    input_tokens: Optional[torch.Tensor] = None
+    input_positions: Optional[torch.Tensor] = None
+    seq_lens: Optional[List[int]] = None
+    query_lens: Optional[List[int]] = None
+    lora_mapping: Optional["LoRAMapping"] = None
+    lora_requests: Optional[Set[LoRARequest]] = None
+    attn_metadata: Optional["AttentionMetadata"] = None
+    multi_modal_kwargs: Optional[Dict[str, torch.Tensor]] = None
+
+    def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
+        tensor_dict = {
+            "input_tokens": self.input_tokens,
+            "input_positions": self.input_positions,
+            "lora_requests": self.lora_requests,
+            "lora_mapping": self.lora_mapping,
+            "multi_modal_kwargs": self.multi_modal_kwargs,
+        }
+        _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata)
+        return tensor_dict
 
+    @classmethod
+    def from_broadcasted_tensor_dict(
+        cls: Type[TModelInputForGPU],
+        tensor_dict: Dict[str, Any],
+        attn_backend: Optional["AttentionBackend"] = None,
+    ) -> TModelInputForGPU:
+        if attn_backend is not None:
+            tensor_dict = _init_attn_metadata_from_tensor_dict(
+                attn_backend, tensor_dict)
+        return cls(**tensor_dict)
+
+
+@dataclasses.dataclass(frozen=True)
+class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU):
+    """
+    Used by the ModelRunner.
+    """
+    sampling_metadata: Optional["SamplingMetadata"] = None
+    # Used for speculative decoding. We do not broadcast it because it is only
+    # used by the driver worker.
+    is_prompt: Optional[bool] = None
+
+    def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
+        tensor_dict = {
+            "input_tokens": self.input_tokens,
+            "input_positions": self.input_positions,
+            "lora_requests": self.lora_requests,
+            "lora_mapping": self.lora_mapping,
+            "multi_modal_kwargs": self.multi_modal_kwargs,
+        }
+        _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata)
+        _add_sampling_metadata_broadcastable_dict(tensor_dict,
+                                                  self.sampling_metadata)
+        return tensor_dict
 
-class ModelRunner:
+    @classmethod
+    def from_broadcasted_tensor_dict(
+        cls,
+        tensor_dict: Dict[str, Any],
+        attn_backend: Optional["AttentionBackend"] = None,
+    ) -> "ModelInputForGPUWithSamplingMetadata":
+        tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict)
+        if attn_backend is not None:
+            tensor_dict = _init_attn_metadata_from_tensor_dict(
+                attn_backend, tensor_dict)
+        return cls(**tensor_dict)
+
+
+class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
+    """
+    Helper class for shared methods between GPU model runners.
+    """
+    _model_input_cls: Type[TModelInputForGPU]
 
     def __init__(
         self,
@@ -241,11 +301,13 @@ class ModelRunner:
         block_size = self.block_size
         return (self.max_seq_len_to_capture + block_size - 1) // block_size
 
-    def _prepare_model_input(
+    def _prepare_model_input_tensors(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> ModelInput:
-        """Prepare the model input based on a given sequence group.
+    ) -> TModelInputForGPU:
+        """Helper method to prepare the model input based on a given sequence
+        group. Prepares metadata needed for the base model forward pass but not
+        metadata for possible additional steps, e.g., sampling.
 
         The API assumes seq_group_metadata_list is sorted by prefill -> decode.
 
@@ -296,7 +358,7 @@ class ModelRunner:
         paged_kv_last_page_len: List[int] = []
 
         if len(seq_group_metadata_list) == 0:
-            return ModelInput.empty(self.device)
+            return self._model_input_cls()
 
         if self.sliding_window is not None:
             sliding_window_blocks = (self.sliding_window + self.block_size -
@@ -646,7 +708,7 @@ class ModelRunner:
             for k, v in multi_modal_kwargs_list.items()
         }
 
-        return ModelInput(
+        return self._model_input_cls(
             input_tokens=input_tokens_tensor,
             input_positions=input_positions_tensor,
             attn_metadata=attn_metadata,
@@ -655,132 +717,8 @@ class ModelRunner:
             lora_mapping=lora_mapping,
             lora_requests=lora_requests,
             multi_modal_kwargs=multi_modal_kwargs,
-            slot_mapping=slot_mapping_tensor,
-            num_prefill_tokens=num_prefill_tokens,
-            num_decode_tokens=num_decode_tokens,
-            num_prefills=num_prefills,
-        )
-
-    def prepare_input_tensors(
-        self,
-        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
-    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata,
-               Set[LoRARequest], LoRAMapping, Dict[str, torch.Tensor]]:
-        if self.is_driver_worker:
-            assert seq_group_metadata_list is not None
-            # Prepare input tensors.
-            (
-                input_tokens,
-                input_positions,
-                attn_metadata,
-                seq_lens,
-                query_lens,
-                lora_mapping,
-                lora_requests,
-                multi_modal_kwargs,
-                slot_mapping,
-                num_prefill_tokens,
-                num_decode_tokens,
-                num_prefills,
-            ) = self._prepare_model_input(seq_group_metadata_list)
-            sampling_metadata = SamplingMetadata.prepare(
-                seq_group_metadata_list, seq_lens, query_lens, self.device,
-                self.pin_memory)
-
-            metadata_dict = {
-                "input_tokens": input_tokens,
-                "input_positions": input_positions,
-                "selected_token_indices":
-                sampling_metadata.selected_token_indices,
-                "lora_requests": lora_requests,
-                "lora_mapping": lora_mapping,
-                "multi_modal_kwargs": multi_modal_kwargs,
-                "num_prefill_tokens": num_prefill_tokens,
-                "num_decode_tokens": num_decode_tokens,
-                "slot_mapping": slot_mapping,
-                "num_prefills": num_prefills,
-            }
-            if attn_metadata:
-                metadata_dict.update(attn_metadata.asdict_zerocopy())
-            broadcast_tensor_dict(metadata_dict, src=0)
-        else:
-            metadata_dict = broadcast_tensor_dict(src=0)
-            input_tokens = metadata_dict.pop("input_tokens")
-            input_positions = metadata_dict.pop("input_positions")
-            selected_token_indices = metadata_dict.pop(
-                "selected_token_indices")
-            lora_mapping = metadata_dict.pop("lora_mapping")
-            lora_requests = metadata_dict.pop("lora_requests")
-            multi_modal_kwargs = metadata_dict.pop("multi_modal_kwargs")
-            if metadata_dict:
-                attn_metadata = self.attn_backend.make_metadata(
-                    **metadata_dict)
-            else:
-                attn_metadata = None
-            sampling_metadata = SamplingMetadata(
-                seq_groups=None,
-                selected_token_indices=selected_token_indices,
-                categorized_sample_indices=None,
-                num_prompts=0,
-            )
-
-        return (input_tokens, input_positions, attn_metadata,
-                sampling_metadata, lora_requests, lora_mapping,
-                multi_modal_kwargs)
-
-    @torch.inference_mode()
-    def execute_model(
-        self,
-        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
-        kv_caches: List[torch.Tensor],
-    ) -> Optional[SamplerOutput]:
-        (input_tokens, input_positions, attn_metadata, sampling_metadata,
-         lora_requests, lora_mapping, multi_modal_kwargs
-         ) = self.prepare_input_tensors(seq_group_metadata_list)
-
-        if self.lora_config:
-            self.set_active_loras(lora_requests, lora_mapping)
-
-        # Currently cuda graph is only supported by the decode phase.
-        prefill_meta = attn_metadata.prefill_metadata
-        decode_meta = attn_metadata.decode_metadata
-        if prefill_meta is None and decode_meta.use_cuda_graph:
-            graph_batch_size = input_tokens.shape[0]
-            model_executable = self.graph_runners[graph_batch_size]
-        else:
-            model_executable = self.model
-
-        hidden_states = model_executable(
-            input_ids=input_tokens,
-            positions=input_positions,
-            kv_caches=kv_caches,
-            attn_metadata=attn_metadata,
-            **multi_modal_kwargs,
-        )
-
-        # Compute the logits.
-        logits = self.model.compute_logits(hidden_states, sampling_metadata)
-
-        # Only perform sampling in the driver worker.
-        if not self.is_driver_worker:
-            return None
-
-        # Sample the next token.
-        output: SamplerOutput = self.model.sample(
-            logits=logits,
-            sampling_metadata=sampling_metadata,
         )
 
-        if self.return_hidden_states:
-            # we only need to pass hidden states of most recent token
-            assert seq_group_metadata_list is not None
-            if seq_group_metadata_list[0].is_prompt:
-                hidden_states = hidden_states.index_select(
-                    0, sampling_metadata.selected_token_indices)
-            output.hidden_states = hidden_states
-
-        return output
-
     @torch.inference_mode()
     def profile_run(self) -> None:
         # Enable top-k sampling to reflect the accurate memory usage.
@@ -853,7 +791,8 @@ class ModelRunner:
         # Run the model with the dummy inputs.
         num_layers = self.model_config.get_num_layers(self.parallel_config)
         kv_caches = [None] * num_layers
-        self.execute_model(seqs, kv_caches)
+        model_input = self.prepare_model_input(seqs)
+        self.execute_model(model_input, kv_caches)
         torch.cuda.synchronize()
         return
 
@@ -986,6 +925,110 @@ class ModelRunner:
         return self.model_config.get_vocab_size()
 
 
+class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
+    """
+    GPU model runner with sampling step.
+    """
+    _model_input_cls: Type[ModelInputForGPUWithSamplingMetadata] = (
+        ModelInputForGPUWithSamplingMetadata)
+
+    def make_model_input_from_broadcasted_tensor_dict(
+        self,
+        tensor_dict: Dict[str, Any],
+    ) -> ModelInputForGPUWithSamplingMetadata:
+        return (
+            ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict(
+                tensor_dict,
+                attn_backend=self.attn_backend,
+            ))
+
+    def prepare_model_input(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+    ) -> ModelInputForGPUWithSamplingMetadata:
+        """Prepare the model input based on a given sequence group, including
+        metadata for the sampling step.
+
+        The API assumes seq_group_metadata_list is sorted by prefill -> decode.
+
+        The result tensors and data structure also batches input in prefill
+        -> decode order. For example,
+
+        - input_tokens[:num_prefill_tokens] contains prefill tokens.
+        - input_tokens[num_prefill_tokens:] contains decode tokens.
+
+        If cuda graph is required, this API automatically pads inputs.
+        """
+        model_input = self._prepare_model_input_tensors(
+            seq_group_metadata_list)
+        sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list,
+                                                     model_input.seq_lens,
+                                                     model_input.query_lens,
+                                                     self.device,
+                                                     self.pin_memory)
+        is_prompt = (seq_group_metadata_list[0].is_prompt
+                     if seq_group_metadata_list else None)
+        return dataclasses.replace(model_input,
+                                   sampling_metadata=sampling_metadata,
+                                   is_prompt=is_prompt)
+
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        model_input: ModelInputForGPUWithSamplingMetadata,
+        kv_caches: List[torch.Tensor],
+    ) -> SamplerOutput:
+        if self.lora_config:
+            assert model_input.lora_requests is not None
+            assert model_input.lora_mapping is not None
+            self.set_active_loras(model_input.lora_requests,
+                                  model_input.lora_mapping)
+
+        # Currently cuda graph is only supported by the decode phase.
+        assert model_input.attn_metadata is not None
+        prefill_meta = model_input.attn_metadata.prefill_metadata
+        decode_meta = model_input.attn_metadata.decode_metadata
+        if prefill_meta is None and decode_meta.use_cuda_graph:
+            assert model_input.input_tokens is not None
+            graph_batch_size = model_input.input_tokens.shape[0]
+            model_executable = self.graph_runners[graph_batch_size]
+        else:
+            model_executable = self.model
+
+        multi_modal_kwargs = model_input.multi_modal_kwargs or {}
+        hidden_states = model_executable(
+            input_ids=model_input.input_tokens,
+            positions=model_input.input_positions,
+            kv_caches=kv_caches,
+            attn_metadata=model_input.attn_metadata,
+            **multi_modal_kwargs,
+        )
+
+        # Compute the logits.
+        logits = self.model.compute_logits(hidden_states,
+                                           model_input.sampling_metadata)
+
+        # Only perform sampling in the driver worker.
+        if not self.is_driver_worker:
+            return None
+
+        # Sample the next token.
+        output: SamplerOutput = self.model.sample(
+            logits=logits,
+            sampling_metadata=model_input.sampling_metadata,
+        )
+
+        if self.return_hidden_states:
+            # we only need to pass hidden states of most recent token
+            if model_input.is_prompt:
+                assert model_input.sampling_metadata is not None
+                hidden_states = hidden_states.index_select(
+                    0, model_input.sampling_metadata.selected_token_indices)
+            output.hidden_states = hidden_states
+
+        return output
+
+
 class CUDAGraphRunner:
 
     def __init__(self, model: nn.Module):
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py
new file mode 100644
index 000000000..9b1706035
--- /dev/null
+++ b/vllm/worker/model_runner_base.py
@@ -0,0 +1,157 @@
+import dataclasses
+from abc import ABC, abstractmethod
+from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Type,
+                    TypeVar)
+
+import torch
+
+from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+
+if TYPE_CHECKING:
+    from vllm.attention import AttentionMetadata
+    from vllm.attention.backends.abstract import AttentionBackend
+    from vllm.model_executor import SamplingMetadata
+
+T = TypeVar('T', bound="ModelRunnerInputBase")
+
+
+def _add_attn_metadata_broadcastable_dict(
+        tensor_dict: Dict[str, Any],
+        attn_metadata: Optional["AttentionMetadata"]) -> None:
+    """
+    Helper method to update tensor_dict with broadcastable
+    AttentionMetadata fields.
+    """
+    if attn_metadata is not None:
+        tensor_dict.update(attn_metadata.asdict_zerocopy())
+
+
+def _init_attn_metadata_from_tensor_dict(
+    attn_backend: "AttentionBackend",
+    tensor_dict: Dict[str, Any],
+) -> Dict[str, Any]:
+    """
+    Helper method to initialize AttentionMetadata based on an
+    AttentionBackend and broadcastable AttentionMetadata fields.
+    """
+    # Extract the fields used to create AttentionMetadata.
+    valid_attn_kwargs = {}
+    for field in dataclasses.fields(attn_backend.get_metadata_cls()):
+        val = tensor_dict.pop(field.name, None)
+        if val is not None:
+            valid_attn_kwargs[field.name] = val
+
+    attn_metadata = attn_backend.make_metadata(**valid_attn_kwargs)
+    tensor_dict["attn_metadata"] = attn_metadata
+    return tensor_dict
+
+
+def _init_sampling_metadata_from_tensor_dict(  # type: ignore
+        tensor_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Helper method to initialize SamplingMetadata based on broadcastable
+    SamplingMetadata fields.
+    """
+    from vllm.model_executor import SamplingMetadata
+
+    selected_token_indices = tensor_dict.pop("selected_token_indices", None)
+    # An empty SamplingMetadata to signal that the worker should skip
+    # sampling.
+    if selected_token_indices is not None:
+        tensor_dict["sampling_metadata"] = SamplingMetadata(
+            seq_groups=None,
+            selected_token_indices=selected_token_indices,
+            categorized_sample_indices=None,
+            num_prompts=0,
+        )
+    return tensor_dict
+
+
+def _add_sampling_metadata_broadcastable_dict(
+        tensor_dict: Dict[str, Any],
+        sampling_metadata: Optional["SamplingMetadata"]) -> None:
+    """
+    Helper method to update tensor_dict with broadcastable
+    SamplingMetadata fields.
+    """
+    if sampling_metadata is not None:
+        tensor_dict["selected_token_indices"] = (
+            sampling_metadata.selected_token_indices)
+
+
+@dataclasses.dataclass(frozen=True)
+class ModelRunnerInputBase(ABC):
+    """Local inputs to each worker's model runner. May contain
+    device-specific data. Different worker backends may have different methods
+    of converting from the global ExecuteModelRequest produced by the LLM
+    engine to the worker-local ModelRunnerInputBase objects.
+
+    Model runners that support multi-GPU execution should define a
+    ModelRunnerInputBase subclass, add their required fields, and specify how to
+    serialize/deserialize a ModelInput for broadcast between workers.
+    """
+
+    def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
+        """
+        Extract broadcastable fields. Override for fields that require some
+        custom deserialization.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    @abstractmethod
+    def from_broadcasted_tensor_dict(
+        cls: Type[T],
+        tensor_dict: Dict[str, Any],
+        attn_backend: Optional["AttentionBackend"] = None,
+    ) -> T:
+        """
+        Pop fields from the given tensor_dict and populate a new instance of
+        ModelRunnerInputBase.
+        """
+        raise NotImplementedError
+
+
+class ModelRunnerBase(ABC, Generic[T]):
+    """
+    Model runner interface that abstracts a particular hardware and/or type of
+    model. Model execution may communicate data with model runners in other
+    processes, but it should not include control plane metadata communication.
+
+    Each ModelRunnerBase subclass should define a corresponding
+    ModelRunnerInputBase subclass.
+    """
+
+    @abstractmethod
+    def make_model_input_from_broadcasted_tensor_dict(
+        self,
+        tensor_dict: Dict[str, Any],
+    ) -> T:
+        """
+        Make an instance of a ModelRunnerInputBase from the broadcasted tensor
+        dict.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def prepare_model_input(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+    ) -> T:
+        """
+        Prepare the inputs to ModelRunnerBase.execute_model from an execution
+        request. This method may move data to the worker's local device. It is
+        not allowed to communicate with other workers or devices.
+        """
+        raise NotImplementedError
+
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        model_input: T,
+        kv_caches: Optional[List[torch.Tensor]],
+    ) -> Optional[SamplerOutput]:
+        """
+        Execute the model on the given input.
+        """
+        raise NotImplementedError
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index a336be04e..fec2c97e7 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -1,4 +1,5 @@
-from typing import List, Optional, Tuple
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 import torch
 from torch import nn
@@ -10,11 +11,39 @@ from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader.neuron import get_neuron_model
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.utils import is_pin_memory_available, make_tensor_with_pad
+from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase
+
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionBackend
 
 logger = init_logger(__name__)
 
 
-class NeuronModelRunner:
+@dataclass(frozen=True)
+class ModelInputForNeuron(ModelRunnerInputBase):
+    """
+    Used by the NeuronModelRunner.
+    """
+    input_tokens: Optional[torch.Tensor] = None
+    input_positions: Optional[torch.Tensor] = None
+    input_block_ids: Optional[torch.Tensor] = None
+    sampling_metadata: Optional["SamplingMetadata"] = None
+
+    def as_broadcastable_tensor_dict(
+            self) -> Dict[str, Union[int, torch.Tensor]]:
+        raise NotImplementedError("ModelInputForNeuron cannot be broadcast.")
+
+    @classmethod
+    def from_broadcasted_tensor_dict(
+        cls,
+        tensor_dict: Dict[str, Any],
+        attn_backend: Optional["AttentionBackend"] = None,
+    ) -> "ModelInputForNeuron":
+        assert attn_backend is None
+        return cls.from_broadcasted_tensor_dict(tensor_dict)
+
+
+class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
 
     def __init__(
         self,
@@ -139,10 +168,14 @@ class NeuronModelRunner:
 
         return input_tokens, input_positions, input_block_ids
 
-    def prepare_input_tensors(
+    def make_model_input_from_broadcasted_tensor_dict(
+            self, tensor_dict: Dict[str, Any]) -> ModelInputForNeuron:
+        return ModelInputForNeuron.from_broadcasted_tensor_dict(tensor_dict)
+
+    def prepare_model_input(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, SamplingMetadata]:
+    ) -> ModelInputForNeuron:
         # NOTE: We assume that all sequences in the group are all prompts or
         # all decodes.
         is_prompt = seq_group_metadata_list[0].is_prompt
@@ -164,30 +197,31 @@ class NeuronModelRunner:
             self.device,
             self.pin_memory)
 
-        return (input_tokens, input_positions, input_block_ids,
-                sampling_metadata)
+        return ModelInputForNeuron(input_tokens=input_tokens,
+                                   input_positions=input_positions,
+                                   input_block_ids=input_block_ids,
+                                   sampling_metadata=sampling_metadata)
 
     @torch.inference_mode()
     def execute_model(
         self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
+        model_input: ModelInputForNeuron,
+        kv_caches: Optional[List[torch.Tensor]] = None,
     ) -> Optional[SamplerOutput]:
-        (input_tokens, input_positions, input_block_ids, sampling_metadata
-         ) = self.prepare_input_tensors(seq_group_metadata_list)
-
         hidden_states = self.model(
-            input_ids=input_tokens,
-            positions=input_positions,
-            input_block_ids=input_block_ids,
+            input_ids=model_input.input_tokens,
+            positions=model_input.input_positions,
+            input_block_ids=model_input.input_block_ids,
         )
 
         # Compute the logits.
-        logits = self.model.compute_logits(hidden_states, sampling_metadata)
+        logits = self.model.compute_logits(hidden_states,
+                                           model_input.sampling_metadata)
 
         # Sample the next token.
         output = self.model.sample(
             logits=logits,
-            sampling_metadata=sampling_metadata,
+            sampling_metadata=model_input.sampling_metadata,
         )
         return output
 
diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
index d0e6aaed1..307c107dd 100644
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -1,5 +1,5 @@
 """A Neuron worker class."""
-from typing import List, Tuple
+from typing import List, Optional, Tuple
 
 import torch
 import torch.distributed
@@ -7,12 +7,13 @@ import torch.distributed
 from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig)
 from vllm.model_executor import set_random_seed
-from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.sequence import ExecuteModelRequest
 from vllm.worker.neuron_model_runner import NeuronModelRunner
-from vllm.worker.worker_base import LoraNotSupportedWorkerBase
+from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
+                                     LoraNotSupportedWorkerBase, WorkerInput)
 
 
-class NeuronWorker(LoraNotSupportedWorkerBase):
+class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
     """A worker class that executes the model on a group of neuron cores.
     """
 
@@ -34,8 +35,9 @@ class NeuronWorker(LoraNotSupportedWorkerBase):
             from vllm.utils import init_cached_hf_modules
             init_cached_hf_modules()
 
-        self.model_runner = NeuronModelRunner(model_config, parallel_config,
-                                              scheduler_config, device_config)
+        self.model_runner: NeuronModelRunner = NeuronModelRunner(
+            model_config, parallel_config, scheduler_config, device_config)
+        self.is_driver_worker = True
 
     def init_device(self) -> None:
         # Set random seed.
@@ -73,22 +75,19 @@ class NeuronWorker(LoraNotSupportedWorkerBase):
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks
 
-    @torch.inference_mode()
-    def execute_model(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> List[SamplerOutput]:
-        num_seq_groups = len(seq_group_metadata_list)
+    @property
+    def do_metadata_broadcast(self) -> bool:
+        return False
 
-        # If there is no input, we don't need to execute the model.
-        if num_seq_groups == 0:
-            return []
+    @property
+    def kv_cache(self) -> Optional[List[torch.Tensor]]:
+        return None
 
-        output = self.model_runner.execute_model(seq_group_metadata_list)
-
-        # Neuron worker only supports single-step output. Wrap the output in a
-        # list to conform to interface.
-        return [output]
+    @torch.inference_mode()
+    def prepare_worker_input(
+            self, execute_model_req: ExecuteModelRequest) -> WorkerInput:
+        return WorkerInput(num_seq_groups=len(
+            execute_model_req.seq_group_metadata_list), )
 
     def get_cache_block_size_bytes(self) -> int:
         """Determine the size in bytes of a cache block.
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index c60764ef1..e1944a4f1 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -1,7 +1,7 @@
 """A GPU worker class."""
 import gc
 import os
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import List, Optional, Set, Tuple, Type
 
 import torch
 import torch.distributed
@@ -9,21 +9,20 @@ import torch.distributed
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, ParallelConfig, SchedulerConfig,
                          SpeculativeConfig, VisionLanguageConfig)
-from vllm.distributed import (broadcast_tensor_dict,
-                              ensure_model_parallel_initialized,
+from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment,
                               set_custom_all_reduce)
 from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
-from vllm.sequence import ExecuteModelRequest, PoolerOutput, SamplerOutput
+from vllm.sequence import ExecuteModelRequest
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.embedding_model_runner import EmbeddingModelRunner
-from vllm.worker.model_runner import ModelRunner
-from vllm.worker.worker_base import WorkerBase
+from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner
+from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput
 
 
-class Worker(WorkerBase):
+class Worker(LocalOrDistributedWorkerBase):
     """A worker class that executes (a partition of) the model on a GPU.
 
     Each worker is associated with a single GPU. The worker is responsible for
@@ -78,9 +77,10 @@ class Worker(WorkerBase):
               or (speculative_config.draft_model_config.hf_config.model_type !=
                   "mlp_speculator") else {"return_hidden_states": True}
 
-        ModelRunnerClass = (EmbeddingModelRunner if
-                            self.model_config.embedding_mode else ModelRunner)
-        self.model_runner = ModelRunnerClass(
+        ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner
+        if self.model_config.embedding_mode:
+            ModelRunnerClass = EmbeddingModelRunner
+        self.model_runner: GPUModelRunnerBase = ModelRunnerClass(
             model_config,
             parallel_config,
             scheduler_config,
@@ -225,40 +225,18 @@ class Worker(WorkerBase):
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)
 
-    def cache_swap(
-        self,
-        blocks_to_swap_in: torch.Tensor,
-        blocks_to_swap_out: torch.Tensor,
-        blocks_to_copy: torch.Tensor,
-    ) -> None:
-        # Issue cache operations.
-        if blocks_to_swap_in.numel() > 0:
-            self.cache_engine.swap_in(blocks_to_swap_in)
-        if blocks_to_swap_out.numel() > 0:
-            self.cache_engine.swap_out(blocks_to_swap_out)
-        if blocks_to_copy.numel() > 0:
-            self.cache_engine.copy(blocks_to_copy)
+    @property
+    def do_metadata_broadcast(self) -> bool:
+        return self.parallel_config.tensor_parallel_size > 1
+
+    @property
+    def kv_cache(self) -> Optional[List[torch.Tensor]]:
+        return self.gpu_cache
 
     @torch.inference_mode()
-    def execute_model(
-        self,
-        execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> List[Union[SamplerOutput, PoolerOutput]]:
-        if not self.is_driver_worker:
-            self._execute_model_non_driver()
-            return []
-
-        if execute_model_req is None:
-            # This signals that there's no more requests to process for now.
-            # All workers are running infinite loop with broadcast_tensor_dict,
-            # and it stops the loop when the driver broadcasts an empty input.
-            # Send an empty input to notify all other workers to stop their
-            # execution loop.
-            broadcast_tensor_dict({}, src=0)
-            return []
-
-        seq_group_metadata_list = execute_model_req.seq_group_metadata_list
-        num_seq_groups = len(seq_group_metadata_list)
+    def prepare_worker_input(
+            self, execute_model_req: ExecuteModelRequest) -> WorkerInput:
+        num_seq_groups = len(execute_model_req.seq_group_metadata_list)
         # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors.
         # they contain parameters to launch cudamemcpyasync.
         blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in,
@@ -273,59 +251,26 @@ class Worker(WorkerBase):
         blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy,
                                       device=self.device,
                                       dtype=torch.int64).view(-1, 2)
-        data: Dict[str, Any] = {
-            "num_seq_groups": num_seq_groups,
-            "blocks_to_swap_in": blocks_to_swap_in,
-            "blocks_to_swap_out": blocks_to_swap_out,
-            "blocks_to_copy": blocks_to_copy,
-        }
-        broadcast_tensor_dict(data, src=0)
-
-        self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy)
-
-        # If there is no input, we don't need to execute the model.
-        if num_seq_groups == 0:
-            return []
 
-        output = self.model_runner.execute_model(seq_group_metadata_list,
-                                                 self.gpu_cache)
-
-        # Worker only supports single-step execution. Wrap the output in a list
-        # to conform to interface.
-        return [output]
+        return WorkerInput(
+            num_seq_groups=num_seq_groups,
+            blocks_to_swap_in=blocks_to_swap_in,
+            blocks_to_swap_out=blocks_to_swap_out,
+            blocks_to_copy=blocks_to_copy,
+        )
 
     @torch.inference_mode()
-    def start_worker_execution_loop(self) -> None:
-        """Execute model loop in parallel worker.
-
-        You can stop the loop by executing a driver worker with an empty output.
-        See `stop_remote_worker_execution_loop` for more details.
-        """
-        while self._execute_model_non_driver():
-            pass
-
-    def _execute_model_non_driver(self) -> bool:
-        """Execute model in parallel worker.
-
-        Returns True iff there are remaining sequences to process.
-        """
-        assert not self.is_driver_worker
-        data = broadcast_tensor_dict(src=0)
-        if not data:
-            return False
-
-        num_seq_groups = data.get("num_seq_groups", 0)
-        blocks_to_swap_in = data.get("blocks_to_swap_in")
-        blocks_to_swap_out = data.get("blocks_to_swap_out")
-        blocks_to_copy = data.get("blocks_to_copy")
-        self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy)
-
-        # If there is no input, we don't need to execute the model.
-        if num_seq_groups == 0:
-            return False
-
-        self.model_runner.execute_model(None, self.gpu_cache)
-        return True
+    def execute_worker(self, worker_input: WorkerInput) -> None:
+        # Issue cache operations.
+        if (worker_input.blocks_to_swap_in is not None
+                and worker_input.blocks_to_swap_in.numel() > 0):
+            self.cache_engine.swap_in(worker_input.blocks_to_swap_in)
+        if (worker_input.blocks_to_swap_out is not None
+                and worker_input.blocks_to_swap_out.numel() > 0):
+            self.cache_engine.swap_out(worker_input.blocks_to_swap_out)
+        if (worker_input.blocks_to_copy is not None
+                and worker_input.blocks_to_copy.numel() > 0):
+            self.cache_engine.copy(worker_input.blocks_to_copy)
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.model_runner.add_lora(lora_request)
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 99482aa93..1df60eb1f 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -1,20 +1,26 @@
+import dataclasses
 import importlib
 import os
 from abc import ABC, abstractmethod
-from typing import Dict, List, Optional, Set, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
 
+import torch
+
+from vllm.distributed import broadcast_tensor_dict
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.utils import (enable_trace_function_call_for_thread, is_hip,
                         update_environment_variables)
+from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase
 
 logger = init_logger(__name__)
 
 
 class WorkerBase(ABC):
     """Worker interface that allows vLLM to cleanly separate implementations for
-    different hardware.
+    different hardware. Also abstracts control plane communication, e.g., to
+    communicate request metadata to other workers.
     """
 
     @abstractmethod
@@ -46,13 +52,23 @@ class WorkerBase(ABC):
         """
         raise NotImplementedError
 
+    @torch.inference_mode()
+    def start_worker_execution_loop(self) -> None:
+        """Execute model loop in parallel worker.
+
+        You can stop the loop by executing a driver worker with an empty output.
+        See `stop_remote_worker_execution_loop` for more details.
+        """
+        while True:
+            output = self.execute_model(execute_model_req=None)
+            if output is None:
+                return None
+
     @abstractmethod
     def execute_model(
         self,
         execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> List[SamplerOutput]:
-        """Executes at least one model step on the given sequences, unless no
-        sequences are provided."""
+    ) -> Optional[List[SamplerOutput]]:
         raise NotImplementedError
 
     @abstractmethod
@@ -98,6 +114,150 @@ class LoraNotSupportedWorkerBase(WorkerBase):
         raise ValueError(f"{type(self)} does not support LoRA")
 
 
+@dataclasses.dataclass(frozen=True)
+class WorkerInput:
+    """Local inputs to each worker. May contain device-specific data. These
+    fields should be broadcastable to other workers.
+    """
+
+    num_seq_groups: Optional[int] = None
+    blocks_to_swap_in: Optional[torch.Tensor] = None
+    blocks_to_swap_out: Optional[torch.Tensor] = None
+    blocks_to_copy: Optional[torch.Tensor] = None
+
+    @classmethod
+    def from_broadcasted_tensor_dict(
+        cls: Type["WorkerInput"],
+        tensor_dict: Dict[str, Any],
+    ) -> "WorkerInput":
+        """
+        Pop fields from the given tensor_dict and populate a new instance of
+        WorkerInput.
+        """
+        return cls(
+            num_seq_groups=tensor_dict.pop("num_seq_groups"),
+            blocks_to_swap_in=tensor_dict.pop("blocks_to_swap_in"),
+            blocks_to_swap_out=tensor_dict.pop("blocks_to_swap_out"),
+            blocks_to_copy=tensor_dict.pop("blocks_to_copy"),
+        )
+
+    def as_broadcastable_tensor_dict(
+            self) -> Dict[str, Union[int, torch.Tensor]]:
+        """
+        Extract broadcastable fields.
+        """
+        tensor_dict = {
+            "num_seq_groups": self.num_seq_groups,
+            "blocks_to_swap_in": self.blocks_to_swap_in,
+            "blocks_to_swap_out": self.blocks_to_swap_out,
+            "blocks_to_copy": self.blocks_to_copy,
+        }
+
+        return tensor_dict
+
+
+class LocalOrDistributedWorkerBase(WorkerBase):
+    """
+    Partial implementation of WorkerBase that has a default `execute_model`
+    definition to perform metadata transfer between workers when in distributed
+    mode. Subclasses of this interface should use model runners that inherit
+    from ModelRunnerBase, and should only need to implement worker-local logic.
+    If custom control plane logic is needed to transfer metadata, or if the
+    model runner cannot inherit from ModelRunnerBase, use WorkerBase instead.
+    """
+    is_driver_worker: bool
+    model_runner: ModelRunnerBase
+
+    @property
+    @abstractmethod
+    def do_metadata_broadcast(self) -> bool:
+        """
+        Used by the default `execute_model` to check whether broadcast is
+        needed to transfer request inputs from the driver worker to other
+        workers in the TP group. If WorkerBase subclass only supports
+        single-worker execution, then this method should return False.
+        """
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def kv_cache(self) -> Optional[List[torch.Tensor]]:
+        """
+        Get the kv cache to pass to the worker's model runner. Used by the
+        default `execute_model`. If the worker's model runner does not follow
+        the ModelRunnerBase interface, then inherit from WorkerBase instead.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def prepare_worker_input(
+            self, execute_model_req: ExecuteModelRequest) -> WorkerInput:
+        """
+        Prepare the inputs to WorkerBase.execute_worker from an execution
+        request. This method may move data to the worker's local device. It is
+        not allowed to communicate with other workers or devices.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def execute_worker(self, worker_input: WorkerInput) -> None:
+        """
+        Process an execution request.
+        """
+        raise NotImplementedError
+
+    def execute_model(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None
+    ) -> Optional[List[SamplerOutput]]:
+        """Executes at least one model step on the given sequences, unless no
+        sequences are provided."""
+        if self.is_driver_worker:
+            if execute_model_req is None:
+                if self.do_metadata_broadcast:
+                    # This signals that there's no more requests to process for
+                    # now. All workers are running infinite loop with
+                    # broadcast_tensor_dict, and it stops the loop when the
+                    # driver broadcasts an empty input. Send an empty input to
+                    # notify all other workers to stop their execution loop.
+                    broadcast_tensor_dict({}, src=0)
+                return None
+
+            worker_input: WorkerInput = self.prepare_worker_input(
+                execute_model_req=execute_model_req)
+            model_input: ModelRunnerInputBase = (
+                self.model_runner.prepare_model_input(
+                    execute_model_req.seq_group_metadata_list))
+
+            if self.do_metadata_broadcast:
+                broadcast_data = worker_input.as_broadcastable_tensor_dict()
+                broadcast_data.update(
+                    model_input.as_broadcastable_tensor_dict())
+                broadcast_tensor_dict(broadcast_data, src=0)
+        else:
+            assert self.do_metadata_broadcast
+            broadcast_data = broadcast_tensor_dict(src=0)
+            if not broadcast_data:
+                return None
+
+            worker_input = WorkerInput.from_broadcasted_tensor_dict(
+                broadcast_data)
+            model_input = (
+                self.model_runner.
+                make_model_input_from_broadcasted_tensor_dict(broadcast_data))
+
+        self.execute_worker(worker_input)
+
+        # If there is no input, we don't need to execute the model.
+        if worker_input.num_seq_groups == 0:
+            return []
+
+        output = self.model_runner.execute_model(model_input, self.kv_cache)
+        # Worker only supports single-step execution. Wrap the output in a
+        # list to conform to interface.
+        return [output]
+
+
 class WorkerWrapperBase:
     """
     The whole point of this class is to lazily initialize the worker.
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index f30de703e..d9124a788 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -1,4 +1,5 @@
-from typing import List, Optional, Tuple
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
 
 import torch
 import torch.nn as nn
@@ -14,6 +15,15 @@ from vllm.sampling_params import SamplingParams
 from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
 from vllm.utils import CudaMemoryProfiler, make_tensor_with_pad
 from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata
+from vllm.worker.model_runner_base import (
+    ModelRunnerBase, ModelRunnerInputBase,
+    _add_attn_metadata_broadcastable_dict,
+    _add_sampling_metadata_broadcastable_dict,
+    _init_attn_metadata_from_tensor_dict,
+    _init_sampling_metadata_from_tensor_dict)
+
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionBackend
 
 logger = init_logger(__name__)
 
@@ -24,7 +34,42 @@ _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [
 ]
 
 
-class XPUModelRunner:
+@dataclass(frozen=True)
+class ModelInputForXPU(ModelRunnerInputBase):
+    """
+    Used by the NeuronModelRunner.
+    """
+    input_tokens: Optional[torch.Tensor] = None
+    input_positions: Optional[torch.Tensor] = None
+    attn_metadata: Optional["AttentionMetadata"] = None
+    sampling_metadata: Optional["SamplingMetadata"] = None
+    multi_modal_input: Optional[Dict[str, torch.Tensor]] = None
+
+    def as_broadcastable_tensor_dict(
+            self) -> Dict[str, Union[int, torch.Tensor]]:
+        tensor_dict = {
+            "input_tokens": self.input_tokens,
+            "input_positions": self.input_positions,
+        }
+        _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata)
+        _add_sampling_metadata_broadcastable_dict(tensor_dict,
+                                                  self.sampling_metadata)
+        return tensor_dict
+
+    @classmethod
+    def from_broadcasted_tensor_dict(
+        cls: Type["ModelInputForXPU"],
+        tensor_dict: Dict[str, Any],
+        attn_backend: Optional["AttentionBackend"] = None,
+    ) -> "ModelInputForXPU":
+        tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict)
+        if attn_backend is not None:
+            tensor_dict = _init_attn_metadata_from_tensor_dict(
+                attn_backend, tensor_dict)
+        return cls(**tensor_dict)
+
+
+class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
 
     def __init__(
         self,
@@ -130,15 +175,22 @@ class XPUModelRunner:
         # Run the model with the dummy inputs.
         num_layers = self.model_config.get_num_layers(self.parallel_config)
         kv_caches = [None] * num_layers
-        self.execute_model(seqs, kv_caches)
+        model_input = self.prepare_model_input(seqs)
+        self.execute_model(model_input, kv_caches)
         torch.xpu.synchronize()
         return
 
-    def prepare_input_tensors(
+    def make_model_input_from_broadcasted_tensor_dict(
+            self, tensor_dict: Dict[str, Any]) -> ModelInputForXPU:
+        return (ModelInputForXPU.from_broadcasted_tensor_dict(
+            tensor_dict,
+            attn_backend=self.attn_backend,
+        ))
+
+    def prepare_model_input(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata,
-               Optional[torch.Tensor]]:
+    ) -> ModelInputForXPU:
         multi_modal_input = None
         if self.is_driver_worker:
             # NOTE: We assume that all sequences in the group are all prompts or
@@ -185,8 +237,11 @@ class XPUModelRunner:
                 num_prompts=0,
             )
 
-        return (input_tokens, input_positions, attn_metadata,
-                sampling_metadata, multi_modal_input)
+        return ModelInputForXPU(input_tokens=input_tokens,
+                                input_positions=input_positions,
+                                attn_metadata=attn_metadata,
+                                sampling_metadata=sampling_metadata,
+                                multi_modal_input=multi_modal_input)
 
     def _prepare_decode(
         self,
@@ -277,27 +332,25 @@ class XPUModelRunner:
     @torch.inference_mode()
     def execute_model(
         self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
+        model_input: ModelInputForXPU,
         kv_caches: List[torch.Tensor],
     ) -> Optional[SamplerOutput]:
-        (input_tokens, input_positions, attn_metadata, sampling_metadata,
-         multi_modal_input
-         ) = self.prepare_input_tensors(seq_group_metadata_list)
-
         model_executable = self.model
         execute_model_kwargs = {
-            "input_ids": input_tokens,
-            "positions": input_positions,
+            "input_ids": model_input.input_tokens,
+            "positions": model_input.input_positions,
             "kv_caches": kv_caches,
-            "attn_metadata": attn_metadata,
+            "attn_metadata": model_input.attn_metadata,
         }
         if self.vision_language_config:
-            execute_model_kwargs.update({"image_input": multi_modal_input})
+            execute_model_kwargs.update(
+                {"image_input": model_input.multi_modal_input})
 
         hidden_states = model_executable(**execute_model_kwargs)
 
         # Compute the logits.
-        logits = self.model.compute_logits(hidden_states, sampling_metadata)
+        logits = self.model.compute_logits(hidden_states,
+                                           model_input.sampling_metadata)
 
         # Only perform sampling in the driver worker.
         if not self.is_driver_worker:
@@ -306,7 +359,7 @@ class XPUModelRunner:
         # Sample the next token.
         output = self.model.sample(
             logits=logits,
-            sampling_metadata=sampling_metadata,
+            sampling_metadata=model_input.sampling_metadata,
         )
         return output
 
-- 
GitLab


From 3aa7b6cf66890c042ebecf9e8094f4f5e3dbf96e Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Tue, 25 Jun 2024 20:34:25 -0700
Subject: [PATCH 153/376] [Misc][Doc] Add Example of using OpenAI Server with
 VLM (#5832)

---
 docs/source/models/vlm.rst           |  2 +
 examples/openai_vision_api_client.py | 90 ++++++++++++++++++++++++++++
 vllm/multimodal/utils.py             | 12 +++-
 3 files changed, 101 insertions(+), 3 deletions(-)
 create mode 100644 examples/openai_vision_api_client.py

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index de55a1a09..1837dd2aa 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -130,6 +130,8 @@ To consume the server, you can use the OpenAI client like in the example below:
     )
     print("Chat response:", chat_response)
 
+A full code example can be found in `examples/openai_vision_api_client.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_vision_api_client.py>`_.
+
 .. note::
 
     By default, the timeout for fetching images through http url is ``5`` seconds. You can override this by setting the environment variable:
diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py
new file mode 100644
index 000000000..26f2aa651
--- /dev/null
+++ b/examples/openai_vision_api_client.py
@@ -0,0 +1,90 @@
+"""An example showing how to use vLLM to serve VLMs.
+
+Launch the vLLM server with the following command:
+python -m vllm.entrypoints.openai.api_server \
+    --model llava-hf/llava-1.5-7b-hf \
+    --image-input-type pixel_values \
+    --image-token-id 32000 \
+    --image-input-shape 1,3,336,336 \
+    --image-feature-size 576 \
+    --chat-template template_llava.jinja
+"""
+import base64
+
+import requests
+from openai import OpenAI
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+    # defaults to os.environ.get("OPENAI_API_KEY")
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+models = client.models.list()
+model = models.data[0].id
+
+image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+
+# Use image url in the payload
+chat_completion_from_url = client.chat.completions.create(
+    messages=[{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "What’s in this image?"
+            },
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                },
+            },
+        ],
+    }],
+    model=model,
+)
+
+result = chat_completion_from_url.choices[0].message.content
+print(f"Chat completion output:{result}")
+
+
+# Use base64 encoded image in the payload
+def encode_image_base64_from_url(image_url: str) -> str:
+    """Encode an image retrieved from a remote url to base64 format."""
+
+    with requests.get(image_url) as response:
+        response.raise_for_status()
+        result = base64.b64encode(response.content).decode('utf-8')
+
+    return result
+
+
+image_base64 = encode_image_base64_from_url(image_url=image_url)
+chat_completion_from_base64 = client.chat.completions.create(
+    messages=[{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "What’s in this image?"
+            },
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": f"data:image/jpeg;base64,{image_base64}"
+                },
+            },
+        ],
+    }],
+    model=model,
+)
+
+result = chat_completion_from_base64.choices[0].message.content
+print(f"Chat completion output:{result}")
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 509f791d2..0cf2c057f 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -1,6 +1,7 @@
 import base64
 from io import BytesIO
 from typing import Optional, Union
+from urllib.parse import urlparse
 
 import aiohttp
 from PIL import Image
@@ -28,6 +29,10 @@ class ImageFetchAiohttp:
         """Load PIL image from a url or base64 encoded openai GPT4V format"""
 
         if image_url.startswith('http'):
+            parsed_url = urlparse(image_url)
+            if parsed_url.scheme not in ["http", "https"]:
+                raise ValueError("Invalid 'image_url': A valid 'image_url' "
+                                 "must have scheme 'http' or 'https'.")
             # Avoid circular import
             from vllm import __version__ as VLLM_VERSION
 
@@ -44,8 +49,9 @@ class ImageFetchAiohttp:
             image = load_image_from_base64(image_url.split(',', 1)[1])
 
         else:
-            raise ValueError("Invalid image url: A valid image url must start "
-                             "with either 'data:image' or 'http'.")
+            raise ValueError(
+                "Invalid 'image_url': A valid 'image_url' must start "
+                "with either 'data:image' or 'http'.")
 
         return image
 
@@ -56,7 +62,7 @@ async def async_get_and_parse_image(image_url: str) -> ImagePixelData:
 
 
 def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str:
-    """encode image to base64 format."""
+    """Encode a pillow image to base64 format."""
 
     buffered = BytesIO()
     if format == 'JPEG':
-- 
GitLab


From 515080ad2fd93cc8e363ff43b90a9df18cfd71ff Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 25 Jun 2024 21:56:02 -0700
Subject: [PATCH 154/376] [bugfix][distributed] fix shm broadcast when the
 queue size is full (#5801)

---
 tests/distributed/test_shm_broadcast.py       | 49 +++++++++----
 .../device_communicators/shm_broadcast.py     | 73 +++++++++++--------
 2 files changed, 76 insertions(+), 46 deletions(-)

diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py
index d92900ffc..2c2466f81 100644
--- a/tests/distributed/test_shm_broadcast.py
+++ b/tests/distributed/test_shm_broadcast.py
@@ -1,7 +1,9 @@
 import multiprocessing
 import random
 import time
+from typing import List
 
+import numpy as np
 import torch.distributed as dist
 
 from vllm.distributed.device_communicators.shm_broadcast import (
@@ -9,6 +11,14 @@ from vllm.distributed.device_communicators.shm_broadcast import (
 from vllm.utils import update_environment_variables
 
 
+def get_arrays(n: int, seed: int = 0) -> List[np.ndarray]:
+    np.random.seed(seed)
+    sizes = np.random.randint(1, 10_000, n)
+    # on average, each array will have 5k elements
+    # with int64, each array will have 40kb
+    return [np.random.randint(1, 100, i) for i in sizes]
+
+
 def distributed_run(fn, world_size):
     number_of_processes = world_size
     processes = []
@@ -47,24 +57,31 @@ def worker_fn_wrapper(fn):
 def worker_fn():
     writer_rank = 2
     broadcaster = ShmRingBufferIO.create_from_process_group(
-        dist.group.WORLD, 1024, 2, writer_rank)
+        dist.group.WORLD, 1024 * 1024, 2, writer_rank)
+    if dist.get_rank() == writer_rank:
+        seed = random.randint(0, 1000)
+        dist.broadcast_object_list([seed], writer_rank)
+    else:
+        recv = [None]
+        dist.broadcast_object_list(recv, writer_rank)
+        seed = recv[0]  # type: ignore
+    dist.barrier()
+    # in case we find a race condition
+    # print the seed so that we can reproduce the error
+    print(f"Rank {dist.get_rank()} got seed {seed}")
+    # test broadcasting with about 400MB of data
+    N = 10_000
     if dist.get_rank() == writer_rank:
-        time.sleep(random.random())
-        broadcaster.broadcast_object(0)
-        time.sleep(random.random())
-        broadcaster.broadcast_object({})
-        time.sleep(random.random())
-        broadcaster.broadcast_object([])
+        arrs = get_arrays(N, seed)
+        for x in arrs:
+            broadcaster.broadcast_object(x)
+            time.sleep(random.random() / 1000)
     else:
-        time.sleep(random.random())
-        a = broadcaster.broadcast_object(None)
-        time.sleep(random.random())
-        b = broadcaster.broadcast_object(None)
-        time.sleep(random.random())
-        c = broadcaster.broadcast_object(None)
-        assert a == 0
-        assert b == {}
-        assert c == []
+        arrs = get_arrays(N, seed)
+        for x in arrs:
+            y = broadcaster.broadcast_object(None)
+            assert np.array_equal(x, y)
+            time.sleep(random.random() / 1000)
     dist.barrier()
 
 
diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index c44bd2f11..550271f88 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -14,6 +14,12 @@ from vllm.logger import init_logger
 
 VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL
 
+# time to wait if the queue is full or empty
+# if we sleep for too short, it will consume too much CPU
+# if we sleep for too long, it will slow down the writer/reader
+# 0.1 us is a good balance
+RINGBUFFER_SLEEP_INTERVAL = 1e-7
+
 logger = init_logger(__name__)
 
 
@@ -145,8 +151,7 @@ class ShmRingBufferIO:
     @contextmanager
     def acquire_write(self):
         assert self._is_writer, "Only writers can acquire write"
-        start_index = self.current_idx
-        start_time = time.time()
+        start_time = time.monotonic()
         n_warning = 1
         while True:
             with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
@@ -154,19 +159,21 @@ class ShmRingBufferIO:
                 written_flag = metadata_buffer[0]
                 if written_flag and read_count != self.buffer.n_reader:
                     # this block is written and not read by all readers
-                    # try to write to the next block
-                    self.current_idx = (self.current_idx +
-                                        1) % self.buffer.max_chunks
-                    if self.current_idx == start_index:
-                        # no empty block found
-                        if time.time(
-                        ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning:  # noqa
-                            logger.warning(
-                                "No available block found in %s second. ",
-                                VLLM_RINGBUFFER_WARNING_INTERVAL)
-                            n_warning += 1
-                        # wait for a while (0.1 us)
-                        time.sleep(1e-7)
+                    # for writers, `self.current_idx` is the next block to write
+                    # if this block is not ready to write,
+                    # we need to wait until it is read by all readers
+
+                    # wait for a while
+                    time.sleep(RINGBUFFER_SLEEP_INTERVAL)
+
+                    # if we wait for a long time, we should warn the user
+                    if time.monotonic(
+                    ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning:  # noqa
+                        logger.warning(
+                            "No available block found in %s second. ",
+                            VLLM_RINGBUFFER_WARNING_INTERVAL)
+                        n_warning += 1
+
                     continue
                 # found a block that is either
                 # (1) not written
@@ -188,13 +195,14 @@ class ShmRingBufferIO:
                     metadata_buffer[i] = 0
                 # mark the block as written
                 metadata_buffer[0] = 1
+                self.current_idx = (self.current_idx +
+                                    1) % self.buffer.max_chunks
                 break
 
     @contextmanager
     def acquire_read(self):
         assert self._is_reader, "Only readers can acquire read"
-        start_index = self.current_idx
-        start_time = time.time()
+        start_time = time.monotonic()
         n_warning = 1
         while True:
             with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
@@ -204,19 +212,22 @@ class ShmRingBufferIO:
                     # this block is either
                     # (1) not written
                     # (2) already read by this reader
-                    # try to read the next block
-                    self.current_idx = (self.current_idx +
-                                        1) % self.buffer.max_chunks
-                    if self.current_idx == start_index:
-                        # no block found
-                        if time.time(
-                        ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning:  # noqa
-                            logger.warning(
-                                "No available block found in %s second. ",
-                                VLLM_RINGBUFFER_WARNING_INTERVAL)
-                            n_warning += 1
-                        # wait for a while (0.1 us)
-                        time.sleep(1e-7)
+
+                    # for readers, `self.current_idx` is the next block to read
+                    # if this block is not ready,
+                    # we need to wait until it is written
+
+                    # wait for a while
+                    time.sleep(RINGBUFFER_SLEEP_INTERVAL)
+
+                    # if we wait for a long time, we should warn the user
+                    if time.monotonic(
+                    ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning:  # noqa
+                        logger.warning(
+                            "No available block found in %s second. ",
+                            VLLM_RINGBUFFER_WARNING_INTERVAL)
+                        n_warning += 1
+
                     continue
                 # found a block that is not read by this reader
                 # let caller read from the buffer
@@ -226,6 +237,8 @@ class ShmRingBufferIO:
                 # caller has read from the buffer
                 # set the read flag
                 metadata_buffer[self.reader_rank + 1] = 1
+                self.current_idx = (self.current_idx +
+                                    1) % self.buffer.max_chunks
                 break
 
     def enqueue(self, obj):
-- 
GitLab


From 6806998bf9c7f24d710d9017c901e9e9a30757d5 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 26 Jun 2024 00:15:22 -0700
Subject: [PATCH 155/376] [Bugfix] Fix embedding to support 2D inputs (#5829)

---
 vllm/model_executor/layers/vocab_parallel_embedding.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index 1a26c5c63..4650b2c24 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -306,11 +306,11 @@ class VocabParallelEmbedding(torch.nn.Module):
                 self.shard_indices.added_vocab_end_index)
         else:
             masked_input = input_
-            # Get the embeddings.
+        # Get the embeddings.
         output_parallel = F.embedding(masked_input.long(), self.weight)
         # Mask the output embedding.
         if self.tp_size > 1:
-            output_parallel.masked_fill_(input_mask.unsqueeze(1), 0)
+            output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0)
         # Reduce across all the model parallel GPUs.
         output = tensor_model_parallel_all_reduce(output_parallel)
         return output
-- 
GitLab


From 3439c5a8e3a1cdab9bf7c4430455ace06be1f28d Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 26 Jun 2024 00:58:23 -0700
Subject: [PATCH 156/376] [Bugfix][TPU] Fix KV cache size calculation (#5860)

---
 vllm/worker/tpu_worker.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index 828bb89d7..cd72c7119 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -118,14 +118,15 @@ class TPUWorker(LoraNotSupportedWorkerBase):
         xm.wait_device_ops()
 
         m = xm.get_memory_info(self.device)
-        program_size = 1024 * 1024 * 1024  # 1GB
-        free_bytes = max(m["bytes_limit"] - m["bytes_used"] - program_size, 0)
-        kv_cache_bytes = int(free_bytes *
-                             self.cache_config.gpu_memory_utilization)
-        kv_cache_dtype_btyes = get_dtype_size(self.cache_dtype)
+        total_memory_size = m["bytes_limit"]
+        usable_memory_size = int(total_memory_size *
+                                 self.cache_config.gpu_memory_utilization)
+        profiled = m["bytes_used"]  # Weights + intermediate activations.
+        kv_cache_bytes = max(usable_memory_size - profiled, 0)
+        dtype_btyes = get_dtype_size(self.cache_dtype)
         block_size = self.cache_config.block_size
         num_tpu_blocks = (kv_cache_bytes //
-                          (kv_cache_dtype_btyes * block_size * num_layers * 2 *
+                          (dtype_btyes * block_size * num_layers * 2 *
                            head_size * num_kv_heads))
         num_tpu_blocks = (num_tpu_blocks // 8) * 8  # Round down to 8.
         return num_tpu_blocks, 0
-- 
GitLab


From 6984c02a2735d4d08426d2c426c34b6d73bee89e Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 16:02:34 +0800
Subject: [PATCH 157/376] [CI/Build] Refactor image test assets (#5821)

---
 tests/conftest.py                  | 111 ++++++++++++++++++-----------
 tests/models/test_llava.py         |  26 +++----
 tests/models/test_llava_next.py    |  30 ++++----
 tests/models/test_phi3v.py         |  28 ++++----
 tests/multimodal/test_processor.py |  24 +++----
 5 files changed, 127 insertions(+), 92 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 67885b932..9d00c7676 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,7 +1,12 @@
 import contextlib
 import gc
 import os
-from typing import Any, Dict, List, Optional, Tuple, TypeVar
+from collections import UserList
+from dataclasses import dataclass
+from functools import cached_property
+from pathlib import Path
+from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict,
+                    TypeVar)
 
 import pytest
 import torch
@@ -28,21 +33,8 @@ _TEST_DIR = os.path.dirname(__file__)
 _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
 
-# Multi modal related
-# You can use `.buildkite/download-images.sh` to download the assets
-PIXEL_VALUES_FILES = [
-    os.path.join(_TEST_DIR, "images", filename) for filename in
-    ["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"]
-]
-IMAGE_FEATURES_FILES = [
-    os.path.join(_TEST_DIR, "images", filename) for filename in
-    ["stop_sign_image_features.pt", "cherry_blossom_image_features.pt"]
-]
-IMAGE_FILES = [
-    os.path.join(_TEST_DIR, "images", filename)
-    for filename in ["stop_sign.jpg", "cherry_blossom.jpg"]
-]
-assert len(PIXEL_VALUES_FILES) == len(IMAGE_FEATURES_FILES) == len(IMAGE_FILES)
+_IMAGE_DIR = Path(_TEST_DIR) / "images"
+"""You can use `.buildkite/download-images.sh` to download the assets."""
 
 
 def _read_prompts(filename: str) -> List[str]:
@@ -51,6 +43,63 @@ def _read_prompts(filename: str) -> List[str]:
         return prompts
 
 
+@dataclass(frozen=True)
+class ImageAsset:
+    name: Literal["stop_sign", "cherry_blossom"]
+
+    @cached_property
+    def pixel_values(self) -> torch.Tensor:
+        return torch.load(_IMAGE_DIR / f"{self.name}_pixel_values.pt")
+
+    @cached_property
+    def image_features(self) -> torch.Tensor:
+        return torch.load(_IMAGE_DIR / f"{self.name}_image_features.pt")
+
+    @cached_property
+    def pil_image(self) -> Image.Image:
+        return Image.open(_IMAGE_DIR / f"{self.name}.jpg")
+
+    def for_hf(self) -> Image.Image:
+        return self.pil_image
+
+    def for_vllm(self, vision_config: VisionLanguageConfig) -> MultiModalData:
+        image_input_type = vision_config.image_input_type
+        ImageInputType = VisionLanguageConfig.ImageInputType
+
+        if image_input_type == ImageInputType.IMAGE_FEATURES:
+            return ImageFeatureData(self.image_features)
+        if image_input_type == ImageInputType.PIXEL_VALUES:
+            return ImagePixelData(self.pil_image)
+
+        raise NotImplementedError
+
+
+class _ImageAssetPrompts(TypedDict):
+    stop_sign: str
+    cherry_blossom: str
+
+
+class _ImageAssets(UserList[ImageAsset]):
+
+    def __init__(self) -> None:
+        super().__init__(
+            [ImageAsset("stop_sign"),
+             ImageAsset("cherry_blossom")])
+
+    def prompts(self, prompts: _ImageAssetPrompts) -> List[str]:
+        """
+        Convenience method to define the prompt for each test image.
+
+        The order of the returned prompts matches the order of the
+        assets when iterating through this object.
+        """
+        return [prompts["stop_sign"], prompts["cherry_blossom"]]
+
+
+IMAGE_ASSETS = _ImageAssets()
+"""Singleton instance of :class:`_ImageAssets`."""
+
+
 def cleanup():
     destroy_model_parallel()
     destroy_distributed_environment()
@@ -81,31 +130,6 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
         cleanup()
 
 
-@pytest.fixture(scope="session")
-def hf_images() -> List[Image.Image]:
-    return [Image.open(filename) for filename in IMAGE_FILES]
-
-
-@pytest.fixture()
-def vllm_images(request) -> List[MultiModalData]:
-    vision_language_config = request.getfixturevalue("model_and_config")[1]
-    if vision_language_config.image_input_type == (
-            VisionLanguageConfig.ImageInputType.IMAGE_FEATURES):
-        return [
-            ImageFeatureData(torch.load(filename))
-            for filename in IMAGE_FEATURES_FILES
-        ]
-    else:
-        return [
-            ImagePixelData(Image.open(filename)) for filename in IMAGE_FILES
-        ]
-
-
-@pytest.fixture()
-def vllm_image_tensors(request) -> List[torch.Tensor]:
-    return [torch.load(filename) for filename in PIXEL_VALUES_FILES]
-
-
 @pytest.fixture
 def example_prompts() -> List[str]:
     prompts = []
@@ -122,6 +146,11 @@ def example_long_prompts() -> List[str]:
     return prompts
 
 
+@pytest.fixture(scope="session")
+def image_assets() -> _ImageAssets:
+    return IMAGE_ASSETS
+
+
 _STR_DTYPE_TO_TORCH_DTYPE = {
     "half": torch.half,
     "bfloat16": torch.bfloat16,
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index b41c69f72..ac1d2ece6 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -5,17 +5,17 @@ from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
 
-from ..conftest import IMAGE_FILES
+from ..conftest import IMAGE_ASSETS
 
 pytestmark = pytest.mark.vlm
 
 # The image token is placed before "user" on purpose so that the test can pass
-HF_IMAGE_PROMPTS = [
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+    "stop_sign":
     "<image>\nUSER: What's the content of the image?\nASSISTANT:",
+    "cherry_blossom":
     "<image>\nUSER: What is the season?\nASSISTANT:",
-]
-
-assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES)
+})
 
 
 def iter_llava_configs(model_name: str):
@@ -49,28 +49,28 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
     x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
     It also reduces `output_str` from "<image><image>bla" to "bla".
     """
-    input_ids, output_str = vllm_output
+    output_ids, output_str = vllm_output
     image_token_id = vlm_config.image_token_id
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     image_token_str = tokenizer.decode(image_token_id)
 
-    hf_input_ids = [
-        input_id for idx, input_id in enumerate(input_ids)
-        if input_id != image_token_id or input_ids[idx - 1] != image_token_id
+    hf_output_ids = [
+        token_id for idx, token_id in enumerate(output_ids)
+        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
     ]
     hf_output_str = output_str \
         .replace(image_token_str * vlm_config.image_feature_size, "")
 
-    return hf_input_ids, hf_output_str
+    return hf_output_ids, hf_output_str
 
 
 # TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
-def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
-                model_and_config, dtype: str, max_tokens: int) -> None:
+def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
+                dtype: str, max_tokens: int) -> None:
     """Inference result should be the same between hf and vllm.
 
     All the image fixtures for the test is under tests/images.
@@ -81,6 +81,8 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
     The text output is sanitized to be able to compare with hf.
     """
     model_id, vlm_config = model_and_config
+    hf_images = [asset.for_hf() for asset in image_assets]
+    vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
 
     with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
         hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 0eca5cb53..d36e50387 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -5,7 +5,7 @@ from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
 
-from ..conftest import IMAGE_FILES
+from ..conftest import IMAGE_ASSETS
 
 pytestmark = pytest.mark.vlm
 
@@ -15,12 +15,12 @@ _PREFACE = (
     "questions.")
 
 # The image token is placed before "user" on purpose so that the test can pass
-HF_IMAGE_PROMPTS = [
-    f"{_PREFACE} <image>\nUSER: What's the content of the image? ASSISTANT:",
-    f"{_PREFACE} <image>\nUSER: What is the season? ASSISTANT:",
-]
-
-assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES)
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+    "stop_sign":
+    f"{_PREFACE} <image>\nUSER: What's the content of the image?\nASSISTANT:",
+    "cherry_blossom":
+    f"{_PREFACE} <image>\nUSER: What is the season?\nASSISTANT:",
+})
 
 
 def iter_llava_next_configs(model_name: str):
@@ -56,20 +56,20 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
     x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
     It also reduces `output_str` from "<image><image>bla" to "bla".
     """
-    input_ids, output_str = vllm_output
+    output_ids, output_str = vllm_output
     image_token_id = vlm_config.image_token_id
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     image_token_str = tokenizer.decode(image_token_id)
 
-    hf_input_ids = [
-        input_id for idx, input_id in enumerate(input_ids)
-        if input_id != image_token_id or input_ids[idx - 1] != image_token_id
+    hf_output_ids = [
+        token_id for idx, token_id in enumerate(output_ids)
+        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
     ]
     hf_output_str = output_str \
         .replace(image_token_str * vlm_config.image_feature_size, " ")
 
-    return hf_input_ids, hf_output_str
+    return hf_output_ids, hf_output_str
 
 
 @pytest.mark.xfail(
@@ -78,8 +78,8 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
-def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
-                model_and_config, dtype: str, max_tokens: int) -> None:
+def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
+                dtype: str, max_tokens: int) -> None:
     """Inference result should be the same between hf and vllm.
 
     All the image fixtures for the test is under tests/images.
@@ -90,6 +90,8 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
     The text output is sanitized to be able to compare with hf.
     """
     model_id, vlm_config = model_and_config
+    hf_images = [asset.for_hf() for asset in image_assets]
+    vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
 
     with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
         hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index a29d50df4..03c130466 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -6,17 +6,17 @@ from transformers import AutoTokenizer
 from vllm.config import VisionLanguageConfig
 from vllm.utils import is_cpu
 
-from ..conftest import IMAGE_FILES
+from ..conftest import IMAGE_ASSETS
 
 pytestmark = pytest.mark.vlm
 
 # The image token is placed before "user" on purpose so that the test can pass
-HF_IMAGE_PROMPTS = [
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+    "stop_sign":
     "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
-    "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n",
-]
-
-assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES)
+    "cherry_blossom":
+    "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n",  # noqa: E501
+})
 
 
 def iter_phi3v_configs(model_name: str):
@@ -50,22 +50,22 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
     x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
     It also reduces `output_str` from "<image><image>bla" to "bla".
     """
-    input_ids, output_str = vllm_output
+    output_ids, output_str = vllm_output
     image_token_id = vlm_config.image_token_id
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     image_token_str = tokenizer.decode(image_token_id)
 
-    hf_input_ids = [
-        input_id if input_id != image_token_id else 0
-        for idx, input_id in enumerate(input_ids)
+    hf_output_ids = [
+        token_id if token_id != image_token_id else 0
+        for idx, token_id in enumerate(output_ids)
     ]
     hf_output_str = output_str \
         .replace(image_token_str * vlm_config.image_feature_size, "") \
         .replace("<s>", " ").replace("<|user|>", "") \
         .replace("<|end|>\n<|assistant|>", " ")
 
-    return hf_input_ids, hf_output_str
+    return hf_output_ids, hf_output_str
 
 
 target_dtype = "half"
@@ -82,8 +82,8 @@ if is_cpu():
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
-def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
-                model_and_config, dtype: str, max_tokens: int) -> None:
+def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
+                dtype: str, max_tokens: int) -> None:
     """Inference result should be the same between hf and vllm.
 
     All the image fixtures for the test is under tests/images.
@@ -94,6 +94,8 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
     The text output is sanitized to be able to compare with hf.
     """
     model_id, vlm_config = model_and_config
+    hf_images = [asset.for_hf() for asset in image_assets]
+    vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
 
     # use eager mode for hf runner, since phi3_v didn't work with flash_attn
     hf_model_kwargs = {"_attn_implementation": "eager"}
diff --git a/tests/multimodal/test_processor.py b/tests/multimodal/test_processor.py
index 51c352361..9ac48dfab 100644
--- a/tests/multimodal/test_processor.py
+++ b/tests/multimodal/test_processor.py
@@ -10,7 +10,7 @@ from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE
 
 
 @pytest.mark.parametrize("dtype", ["half", "float"])
-def test_clip_image_processor(hf_images, dtype):
+def test_clip_image_processor(image_assets, dtype):
     MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
     IMAGE_HEIGHT = IMAGE_WIDTH = 560
 
@@ -35,13 +35,13 @@ def test_clip_image_processor(hf_images, dtype):
         image_processor_revision=None,
     )
 
-    for image in hf_images:
+    for asset in image_assets:
         hf_result = hf_processor.preprocess(
-            image,
+            asset.pil_image,
             return_tensors="pt",
         ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
         vllm_result = MULTIMODAL_REGISTRY.process_input(
-            ImagePixelData(image),
+            ImagePixelData(asset.pil_image),
             model_config=model_config,
             vlm_config=vlm_config,
         )
@@ -59,7 +59,7 @@ def test_clip_image_processor(hf_images, dtype):
     reason="Inconsistent image processor being used due to lack "
     "of support for dynamic image token replacement")
 @pytest.mark.parametrize("dtype", ["half", "float"])
-def test_llava_next_image_processor(hf_images, dtype):
+def test_llava_next_image_processor(image_assets, dtype):
     MODEL_NAME = "llava-hf/llava-v1.6-34b-hf"
     IMAGE_HEIGHT = IMAGE_WIDTH = 560
 
@@ -84,13 +84,13 @@ def test_llava_next_image_processor(hf_images, dtype):
         image_processor_revision=None,
     )
 
-    for image in hf_images:
+    for asset in image_assets:
         hf_result = hf_processor.preprocess(
-            image,
+            asset.pil_image,
             return_tensors="pt",
         ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
         vllm_result = MULTIMODAL_REGISTRY.process_input(
-            ImagePixelData(image),
+            ImagePixelData(asset.pil_image),
             model_config=model_config,
             vlm_config=vlm_config,
         )
@@ -107,7 +107,7 @@ def test_llava_next_image_processor(hf_images, dtype):
 @pytest.mark.xfail(
     reason="Example image pixels were not processed using HuggingFace")
 @pytest.mark.parametrize("dtype", ["float"])
-def test_image_pixel_types(hf_images, vllm_image_tensors, dtype):
+def test_image_pixel_types(image_assets, dtype):
     MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
     IMAGE_HEIGHT = IMAGE_WIDTH = 560
 
@@ -129,14 +129,14 @@ def test_image_pixel_types(hf_images, vllm_image_tensors, dtype):
         image_processor_revision=None,
     )
 
-    for image, tensor in zip(hf_images, vllm_image_tensors):
+    for asset in image_assets:
         image_result = MULTIMODAL_REGISTRY.process_input(
-            ImagePixelData(image),
+            ImagePixelData(asset.pil_image),
             model_config=model_config,
             vlm_config=vlm_config,
         )
         tensor_result = MULTIMODAL_REGISTRY.process_input(
-            ImagePixelData(tensor),
+            ImagePixelData(asset.pixel_values),
             model_config=model_config,
             vlm_config=vlm_config,
         )
-- 
GitLab


From 5bfd1bbc9831fed39632f071f16bb62373ec1249 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luka=20Govedi=C4=8D?=
 <ProExpertProg@users.noreply.github.com>
Date: Wed, 26 Jun 2024 11:16:00 -0400
Subject: [PATCH 158/376] [Kernel] Adding bias epilogue support for
 `cutlass_scaled_mm` (#5560)

Co-authored-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
Co-authored-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
---
 CMakeLists.txt                                |   3 +-
 csrc/ops.h                                    |   3 +-
 .../cutlass_w8a8/scaled_mm_c2x.cu             | 228 +++++++++++++-----
 .../cutlass_w8a8/scaled_mm_c3x.cu             | 139 ++++++++---
 .../cutlass_w8a8/scaled_mm_entry.cu           |  32 ++-
 csrc/torch_bindings.cpp                       |   2 +-
 tests/kernels/test_cutlass.py                 | 100 +++++---
 vllm/_custom_ops.py                           |  10 +-
 8 files changed, 383 insertions(+), 134 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 801429096..ede9192cd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,7 +2,8 @@ cmake_minimum_required(VERSION 3.21)
 
 project(vllm_extensions LANGUAGES CXX)
 
-option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda")
+# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
+set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
 
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
diff --git a/csrc/ops.h b/csrc/ops.h
index 6f0a7143c..ae04150ea 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -96,7 +96,8 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
 
 void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
                        torch::Tensor const& b, torch::Tensor const& a_scales,
-                       torch::Tensor const& b_scales);
+                       torch::Tensor const& b_scales,
+                       c10::optional<torch::Tensor> const& bias);
 
 #endif
 
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
index 38a20a172..6ce25c5ac 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
@@ -77,24 +77,12 @@ struct enable_sm89_to_sm90 : Kernel {
 };
 
 /*
-   This epilogue function defines a quantized GEMM operation similar to
-   torch._scaled_mm.
-
-   A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or
-   per-row. B can be quantized per-tensor or per-column.
-   Any combination of per-tensor and per-row or column is supported.
-   A and B must have symmetric quantization (zero point == 0).
-
-   So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
-   scales are applied elementwise with numpy-style broadcasting.
-
-   ScaleA and ScaleB define the epilogue functions that apply the scales for
-   the A and B operands respectively. These scales may be either per-tensor or
-   per row or column.
-*/
+ * This class provides the common ScaleA and ScaleB descriptors for the
+ * ScaledEpilogue and ScaledEpilogueBias classes.
+ */
 template <typename ElementD, typename OutputTileThreadMap>
-struct ScaledEpilogue {
- private:
+struct ScaledEpilogueBase {
+ protected:
   using Accum = cutlass::epilogue::threadblock::VisitorAccFetch;
 
   using ScaleA = cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast<
@@ -102,6 +90,32 @@ struct ScaledEpilogue {
 
   using ScaleB = cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast<
       OutputTileThreadMap, float, Stride<Int<0>, Int<1>, Int<0>>>;
+};
+
+/*
+ This epilogue function defines a quantized GEMM operation similar to
+ torch._scaled_mm.
+
+ A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or
+ per-row. B can be quantized per-tensor or per-column.
+ Any combination of per-tensor and per-row or column is supported.
+ A and B must have symmetric quantization (zero point == 0).
+
+ So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
+ scales are applied elementwise with numpy-style broadcasting.
+
+ ScaleA and ScaleB define the epilogue functions that apply the scales for
+ the A and B operands respectively. These scales may be either per-tensor or
+ per row or column.
+*/
+template <typename ElementD, typename OutputTileThreadMap>
+struct ScaledEpilogue
+    : private ScaledEpilogueBase<ElementD, OutputTileThreadMap> {
+ private:
+  using SUPER = ScaledEpilogueBase<ElementD, OutputTileThreadMap>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::ScaleA;
+  using ScaleB = typename SUPER::ScaleB;
 
   using Compute0 = cutlass::epilogue::threadblock::VisitorCompute<
       cutlass::multiplies, float, float,
@@ -134,6 +148,53 @@ struct ScaledEpilogue {
   }
 };
 
+template <typename ElementD, typename OutputTileThreadMap>
+struct ScaledEpilogueBias
+    : private ScaledEpilogueBase<ElementD, OutputTileThreadMap> {
+ private:
+  using SUPER = ScaledEpilogueBase<ElementD, OutputTileThreadMap>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::ScaleA;
+  using ScaleB = typename SUPER::ScaleB;
+
+  using Compute0 = cutlass::epilogue::threadblock::VisitorCompute<
+      cutlass::multiplies, float, float,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTCompute0 =
+      cutlass::epilogue::threadblock::Sm80EVT<Compute0, ScaleB, Accum>;
+
+  using Compute1 = cutlass::epilogue::threadblock::VisitorCompute<
+      cutlass::multiply_add, ElementD, float,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using Bias = cutlass::epilogue::threadblock::VisitorRowBroadcast<
+      OutputTileThreadMap, ElementD, Stride<Int<0>, Int<1>, Int<0>>>;
+
+ public:
+  using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT<Compute1, ScaleA,
+                                                             EVTCompute0, Bias>;
+  using ArgumentType = typename EVTCompute::Arguments;
+
+  static ArgumentType prepare_args(torch::Tensor const& a_scales,
+                                   torch::Tensor const& b_scales,
+                                   torch::Tensor const& bias) {
+    using ScaleAArgs = typename ScaleA::Arguments;
+    using ScaleBArgs = typename ScaleB::Arguments;
+    using BiasArgs = typename Bias::Arguments;
+
+    ScaleBArgs b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
+    ScaleAArgs a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
+    BiasArgs bias_args{static_cast<ElementD*>(bias.data_ptr()), {}};
+
+    typename EVTCompute0::Arguments evt0_compute_args{b_args};
+
+    typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args,
+                                                    bias_args};
+    return evt_compute_args;
+  }
+};
+
 template <typename Arch, template <typename> typename ArchGuard,
           typename ElementAB_, typename ElementD_,
           template <typename, typename> typename Epilogue_, typename TileShape,
@@ -168,13 +229,13 @@ struct cutlass_2x_gemm {
   // clang-format off
   using RowMajor = typename cutlass::layout::RowMajor;
   using ColumnMajor = typename cutlass::layout::ColumnMajor;
-  using KernelType = 
+  using KernelType =
     ArchGuard<typename cutlass::gemm::kernel::DefaultGemmWithVisitor<
-      ElementAB, RowMajor, cutlass::ComplexTransform::kNone, 16, 
-      ElementAB, ColumnMajor, cutlass::ComplexTransform::kNone, 16, 
+      ElementAB, RowMajor, cutlass::ComplexTransform::kNone, 16,
+      ElementAB, ColumnMajor, cutlass::ComplexTransform::kNone, 16,
       float, cutlass::layout::RowMajor, 4,
-      ElementAcc, float, cutlass::arch::OpClassTensorOp, 
-      Arch, 
+      ElementAcc, float, cutlass::arch::OpClassTensorOp,
+      Arch,
       TileShape, WarpShape, InstructionShape,
       EVTD,
       cutlass::gemm::threadblock::ThreadblockSwizzleStreamK,
@@ -404,14 +465,13 @@ void cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a,
   }
 }
 
-void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a,
-                            torch::Tensor const& b,
-                            torch::Tensor const& a_scales,
-                            torch::Tensor const& b_scales) {
+template <template <typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_scaled_mm_sm75_epilogue(torch::Tensor& out, torch::Tensor const& a,
+                                     torch::Tensor const& b,
+                                     EpilogueArgs&&... epilogue_args) {
   TORCH_CHECK(a.dtype() == torch::kInt8);
   TORCH_CHECK(b.dtype() == torch::kInt8);
-  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
-  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
 
   using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
   using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
@@ -420,78 +480,130 @@ void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a,
   if (out.dtype() == torch::kBFloat16) {
     return cutlass_gemm_caller<cutlass_2x_gemm<
         cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::bfloat16_t,
-        ScaledEpilogue, TileShape, WarpShape, InstructionShape, 2>>(
-        out, a, b, a_scales, b_scales);
+        Epilogue, TileShape, WarpShape, InstructionShape, 2>>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
   } else {
     TORCH_CHECK(out.dtype() == torch::kFloat16);
     return cutlass_gemm_caller<cutlass_2x_gemm<
         cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::half_t,
-        ScaledEpilogue, TileShape, WarpShape, InstructionShape, 2>>(
-        out, a, b, a_scales, b_scales);
+        Epilogue, TileShape, WarpShape, InstructionShape, 2>>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
   }
 }
 
-void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a,
+void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a,
                             torch::Tensor const& b,
                             torch::Tensor const& a_scales,
-                            torch::Tensor const& b_scales) {
-  TORCH_CHECK(a.dtype() == torch::kInt8);
-  TORCH_CHECK(b.dtype() == torch::kInt8);
+                            torch::Tensor const& b_scales,
+                            c10::optional<torch::Tensor> const& bias) {
   TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
   TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+  if (bias) {
+    TORCH_CHECK(bias->dtype() == out.dtype(),
+                "currently bias dtype must match output dtype ", out.dtype());
+    return cutlass_scaled_mm_sm75_epilogue<ScaledEpilogueBias>(
+        out, a, b, a_scales, b_scales, *bias);
+  } else {
+    return cutlass_scaled_mm_sm75_epilogue<ScaledEpilogue>(out, a, b, a_scales,
+                                                           b_scales);
+  }
+}
+
+template <template <typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_scaled_mm_sm80_epilogue(torch::Tensor& out, torch::Tensor const& a,
+                                     torch::Tensor const& b,
+                                     EpilogueArgs&&... epilogue_args) {
+  TORCH_CHECK(a.dtype() == torch::kInt8);
+  TORCH_CHECK(b.dtype() == torch::kInt8);
 
   if (out.dtype() == torch::kBFloat16) {
-    return cutlass_gemm_sm80_dispatch<int8_t, cutlass::bfloat16_t,
-                                      ScaledEpilogue>(out, a, b, a_scales,
-                                                      b_scales);
+    return cutlass_gemm_sm80_dispatch<int8_t, cutlass::bfloat16_t, Epilogue>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
   } else {
     TORCH_CHECK(out.dtype() == torch::kFloat16);
-    return cutlass_gemm_sm80_dispatch<int8_t, cutlass::half_t, ScaledEpilogue>(
-        out, a, b, a_scales, b_scales);
+    return cutlass_gemm_sm80_dispatch<int8_t, cutlass::half_t, Epilogue>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
   }
 }
 
-void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a,
+void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a,
                             torch::Tensor const& b,
                             torch::Tensor const& a_scales,
-                            torch::Tensor const& b_scales) {
+                            torch::Tensor const& b_scales,
+                            c10::optional<torch::Tensor> const& bias) {
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+  if (bias) {
+    TORCH_CHECK(bias->dtype() == out.dtype(),
+                "currently bias dtype must match output dtype ", out.dtype());
+    return cutlass_scaled_mm_sm80_epilogue<ScaledEpilogueBias>(
+        out, a, b, a_scales, b_scales, *bias);
+  } else {
+    return cutlass_scaled_mm_sm80_epilogue<ScaledEpilogue>(out, a, b, a_scales,
+                                                           b_scales);
+  }
+}
+
+template <template <typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_scaled_mm_sm89_epilogue(torch::Tensor& out, torch::Tensor const& a,
+                                     torch::Tensor const& b,
+                                     EpilogueArgs&&... epilogue_args) {
   using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
   using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
   using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
 
-  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
-  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
-
   if (a.dtype() == torch::kInt8) {
     TORCH_CHECK(b.dtype() == torch::kInt8);
 
     if (out.dtype() == torch::kBFloat16) {
       return cutlass_gemm_caller<cutlass_2x_gemm<
           cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::bfloat16_t,
-          ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>(
-          out, a, b, a_scales, b_scales);
+          Epilogue, TileShape, WarpShape, InstructionShape, 5>>(
+          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
     } else {
       assert(out.dtype() == torch::kFloat16);
       return cutlass_gemm_caller<cutlass_2x_gemm<
           cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::half_t,
-          ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>(
-          out, a, b, a_scales, b_scales);
+          Epilogue, TileShape, WarpShape, InstructionShape, 5>>(
+          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
     }
   } else {
     TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
     TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
 
     if (out.dtype() == torch::kBFloat16) {
-      return cutlass_gemm_caller<cutlass_2x_gemm<
-          cutlass::arch::Sm89, enable_sm89_to_sm90, cutlass::float_e4m3_t,
-          cutlass::bfloat16_t, ScaledEpilogue, TileShape, WarpShape,
-          InstructionShape, 5>>(out, a, b, a_scales, b_scales);
+      return cutlass_gemm_caller<
+          cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
+                          cutlass::float_e4m3_t, cutlass::bfloat16_t, Epilogue,
+                          TileShape, WarpShape, InstructionShape, 5>>(
+          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
     } else {
       TORCH_CHECK(out.dtype() == torch::kFloat16);
-      return cutlass_gemm_caller<cutlass_2x_gemm<
-          cutlass::arch::Sm89, enable_sm89_to_sm90, cutlass::float_e4m3_t,
-          cutlass::half_t, ScaledEpilogue, TileShape, WarpShape,
-          InstructionShape, 5>>(out, a, b, a_scales, b_scales);
+      return cutlass_gemm_caller<
+          cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
+                          cutlass::float_e4m3_t, cutlass::half_t, Epilogue,
+                          TileShape, WarpShape, InstructionShape, 5>>(
+          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
     }
   }
 }
+
+void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a,
+                            torch::Tensor const& b,
+                            torch::Tensor const& a_scales,
+                            torch::Tensor const& b_scales,
+                            c10::optional<torch::Tensor> const& bias) {
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+  if (bias) {
+    TORCH_CHECK(bias->dtype() == out.dtype(),
+                "currently bias dtype must match output dtype ", out.dtype());
+    return cutlass_scaled_mm_sm89_epilogue<ScaledEpilogueBias>(
+        out, a, b, a_scales, b_scales, *bias);
+  } else {
+    return cutlass_scaled_mm_sm89_epilogue<ScaledEpilogue>(out, a, b, a_scales,
+                                                           b_scales);
+  }
+}
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
index cfa8f80f7..326ec02ca 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
@@ -59,6 +59,28 @@ struct enable_sm90_or_later : Kernel {
   }
 };
 
+/*
+ * This class provides the common ScaleA and ScaleB descriptors for the
+ * ScaledEpilogue and ScaledEpilogueBias classes.
+ */
+template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
+struct ScaledEpilogueBase {
+ protected:
+  using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
+
+  using ScaleA = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast<
+      0 /*Stages*/, typename EpilogueDescriptor::TileShape, float,
+      Stride<Int<1>, Int<0>, Int<0>>>;
+
+  using ScaleBDescriptor =
+      cutlass::epilogue::collective::detail::RowBroadcastDescriptor<
+          EpilogueDescriptor, float>;
+
+  using ScaleB = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast<
+      ScaleBDescriptor::Stages, typename EpilogueDescriptor::TileShape,
+      typename ScaleBDescriptor::Element, Stride<Int<0>, Int<1>, Int<0>>>;
+};
+
 /*
    This epilogue function defines a quantized GEMM operation similar to
    torch.scaled_mm_.
@@ -76,21 +98,13 @@ struct enable_sm90_or_later : Kernel {
    per row or column.
 */
 template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
-struct ScaledEpilogue {
+struct ScaledEpilogue
+    : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
  private:
-  using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
-
-  using ScaleA = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast<
-      0 /*Stages*/, typename EpilogueDescriptor::TileShape, float,
-      Stride<Int<1>, Int<0>, Int<0>>>;
-
-  using ScaleBDescriptor =
-      cutlass::epilogue::collective::detail::RowBroadcastDescriptor<
-          EpilogueDescriptor, float>;
-
-  using ScaleB = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast<
-      ScaleBDescriptor::Stages, typename EpilogueDescriptor::TileShape,
-      typename ScaleBDescriptor::Element, Stride<Int<0>, Int<1>, Int<0>>>;
+  using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::ScaleA;
+  using ScaleB = typename SUPER::ScaleB;
 
   using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
       cutlass::multiplies, float, float,
@@ -120,6 +134,54 @@ struct ScaledEpilogue {
   }
 };
 
+template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
+struct ScaledEpilogueBias
+    : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
+ private:
+  using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::ScaleA;
+  using ScaleB = typename SUPER::ScaleB;
+
+  using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
+      cutlass::multiplies, ElementD, ElementD,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTCompute0 =
+      cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;
+
+  using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
+      cutlass::multiply_add, ElementD, ElementD,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using BiasDescriptor =
+      cutlass::epilogue::collective::detail::RowBroadcastDescriptor<
+          EpilogueDescriptor, ElementD>;
+
+  using Bias = cutlass::epilogue::fusion::Sm90RowBroadcast<
+      BiasDescriptor::Stages, typename EpilogueDescriptor::TileShape, ElementD,
+      Stride<Int<0>, Int<1>, Int<0>>, 128 / sizeof_bits_v<ElementD>, false>;
+
+ public:
+  using EVTCompute =
+      cutlass::epilogue::fusion::Sm90EVT<Compute1, ScaleA, EVTCompute0, Bias>;
+  using ArgumentType = typename EVTCompute::Arguments;
+
+  static ArgumentType prepare_args(torch::Tensor const& a_scales,
+                                   torch::Tensor const& b_scales,
+                                   torch::Tensor const& bias) {
+    using ScaleA_Args = typename ScaleA::Arguments;
+    using ScaleB_Args = typename ScaleB::Arguments;
+    using Bias_Args = typename Bias::Arguments;
+
+    ScaleA_Args a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
+    ScaleB_Args b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
+    Bias_Args bias_args{static_cast<ElementD*>(bias.data_ptr())};
+
+    return ArgumentType{a_args, {b_args}, bias_args};
+  }
+};
+
 template <typename ElementAB_, typename ElementD_,
           template <typename, typename, typename> typename Epilogue_,
           typename TileShape, typename ClusterShape, typename KernelSchedule,
@@ -440,41 +502,56 @@ void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out, torch::Tensor const& a,
   }
 }
 
-void cutlass_scaled_mm_sm90(torch::Tensor& out, torch::Tensor const& a,
-                            torch::Tensor const& b,
-                            torch::Tensor const& a_scales,
-                            torch::Tensor const& b_scales) {
-  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
-  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
-
+template <template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_scaled_mm_sm90_epilogue(torch::Tensor& out, torch::Tensor const& a,
+                                     torch::Tensor const& b,
+                                     EpilogueArgs&&... epilogue_args) {
   if (a.dtype() == torch::kInt8) {
     TORCH_CHECK(b.dtype() == torch::kInt8);
 
     if (out.dtype() == torch::kBFloat16) {
       return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::bfloat16_t,
-                                             ScaledEpilogue>(
-          out, a, b, a_scales, b_scales);
+                                             Epilogue>(
+          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
     } else {
       TORCH_CHECK(out.dtype() == torch::kFloat16);
-      return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::half_t,
-                                             ScaledEpilogue>(
-          out, a, b, a_scales, b_scales);
+      return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::half_t, Epilogue>(
+          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
     }
   } else {
     TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
     TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
 
     if (out.dtype() == torch::kBFloat16) {
-      return cutlass_gemm_sm90_fp8_dispatch<
-          cutlass::float_e4m3_t, cutlass::bfloat16_t, ScaledEpilogue>(
-          out, a, b, a_scales, b_scales);
+      return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
+                                            cutlass::bfloat16_t, Epilogue>(
+          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
     } else {
       TORCH_CHECK(out.dtype() == torch::kFloat16);
       return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
-                                            cutlass::half_t, ScaledEpilogue>(
-          out, a, b, a_scales, b_scales);
+                                            cutlass::half_t, Epilogue>(
+          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
     }
   }
 }
 
+void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
+                            torch::Tensor const& b,
+                            torch::Tensor const& a_scales,
+                            torch::Tensor const& b_scales,
+                            c10::optional<torch::Tensor> const& bias) {
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+  if (bias) {
+    TORCH_CHECK(bias->dtype() == c.dtype(),
+                "currently bias dtype must match output dtype ", c.dtype());
+    return cutlass_scaled_mm_sm90_epilogue<ScaledEpilogueBias>(
+        c, a, b, a_scales, b_scales, *bias);
+  } else {
+    return cutlass_scaled_mm_sm90_epilogue<ScaledEpilogue>(c, a, b, a_scales,
+                                                           b_scales);
+  }
+}
+
 #endif
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
index f4e582d78..81bf2d62d 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -6,23 +6,27 @@
 void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a,
                             torch::Tensor const& b,
                             torch::Tensor const& a_scales,
-                            torch::Tensor const& b_scales);
+                            torch::Tensor const& b_scales,
+                            c10::optional<torch::Tensor> const& bias);
 
 void cutlass_scaled_mm_sm80(torch::Tensor& c, torch::Tensor const& a,
                             torch::Tensor const& b,
                             torch::Tensor const& a_scales,
-                            torch::Tensor const& b_scales);
+                            torch::Tensor const& b_scales,
+                            c10::optional<torch::Tensor> const& bias);
 
 void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a,
                             torch::Tensor const& b,
                             torch::Tensor const& a_scales,
-                            torch::Tensor const& b_scales);
+                            torch::Tensor const& b_scales,
+                            c10::optional<torch::Tensor> const& bias);
 
 #if defined CUDA_VERSION && CUDA_VERSION >= 12000
 void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
                             torch::Tensor const& b,
                             torch::Tensor const& a_scales,
-                            torch::Tensor const& b_scales);
+                            torch::Tensor const& b_scales,
+                            c10::optional<torch::Tensor> const& bias);
 #endif
 
 bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
@@ -43,7 +47,8 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
 
 void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
                        torch::Tensor const& b, torch::Tensor const& a_scales,
-                       torch::Tensor const& b_scales) {
+                       torch::Tensor const& b_scales,
+                       c10::optional<torch::Tensor> const& bias) {
   int32_t major_capability;
   int32_t minor_capability;
   cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
@@ -66,6 +71,11 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
               b.stride(1) % 16 == 0);  // 16 Byte Alignment
   TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
 
+  if (bias) {
+    TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
+                bias->dim() == 1);
+  }
+
   at::cuda::OptionalCUDAGuard const device_guard(device_of(a));
 
   if (version_num >= 90) {
@@ -73,19 +83,19 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
 
     // Guard against compilation issues for sm90 kernels
 #if defined CUDA_VERSION && CUDA_VERSION >= 12000
-    cutlass_scaled_mm_sm90(c, a, b, a_scales, b_scales);
+    cutlass_scaled_mm_sm90(c, a, b, a_scales, b_scales, bias);
 #else
-    cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales);
+    cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales, bias);
 #endif
   } else if (version_num == 89) {
     // Ada Lovelace
-    cutlass_scaled_mm_sm89(c, a, b, a_scales, b_scales);
+    cutlass_scaled_mm_sm89(c, a, b, a_scales, b_scales, bias);
   } else if (version_num >= 80) {
     // Ampere
-    cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales);
+    cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales, bias);
   } else {
     // Turing
     TORCH_CHECK(version_num >= 75);
-    cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales);
+    cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales, bias);
   }
-}
+}
\ No newline at end of file
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 227b69d79..faf29e1f1 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -142,7 +142,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def(
       "cutlass_scaled_mm(Tensor! out, Tensor a,"
       "                  Tensor b, Tensor a_scales,"
-      "                  Tensor b_scales) -> ()");
+      "                  Tensor b_scales, Tensor? bias) -> ()");
   ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm);
 
   // Check if cutlass scaled_mm is supported for CUDA devices of the given
diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py
index 4d09cd8ce..39de444be 100644
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -32,6 +32,7 @@ def cutlass_fp8_gemm_helper(m: int,
                             k: int,
                             per_token_act_quant: bool,
                             per_out_channel_weight_quant: bool,
+                            bias: bool,
                             out_dtype: Type[torch.dtype] = torch.bfloat16,
                             device: str = "cuda"):
     # Test for a cutlass kernel with per-token activation quantization
@@ -46,10 +47,17 @@ def cutlass_fp8_gemm_helper(m: int,
         (m_a_scales, 1), device=device, dtype=torch.float32) / 10)
     scale_b = (torch.randn(
         (1, n_b_scales), device=device, dtype=torch.float32) / 10)
-
-    out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype)
-    baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
-                        scale_b * b.to(dtype=torch.float32)).to(out_dtype)
+    if bias:
+        # bias term should be > 1 so that the absolute tolerance can catch it
+        bias_t = torch.rand((n, ), device=device, dtype=out_dtype) + 1.0
+        out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias_t)
+    else:
+        out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype)
+        bias_t = 0
+
+    baseline = (torch.mm(scale_a * a.to(dtype=torch.float32),
+                         scale_b * b.to(dtype=torch.float32)) +
+                bias_t).to(out_dtype)
 
     assert torch.allclose(out, baseline, rtol=1e-2, atol=1e-1)
 
@@ -59,6 +67,7 @@ def cutlass_int8_gemm_helper(m: int,
                              k: int,
                              per_token_act_quant: bool,
                              per_out_channel_weight_quant: bool,
+                             bias: bool,
                              out_dtype: Type[torch.dtype] = torch.bfloat16,
                              device: str = "cuda"):
     # Test for a cutlass kernel with per-token activation quantization
@@ -74,11 +83,17 @@ def cutlass_int8_gemm_helper(m: int,
     scale_b = (torch.randn(
         (1, n_b_scales), device=device, dtype=torch.float32) / 10)
 
-    out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype)
-    baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
-                        scale_b *
-                        b.to(dtype=torch.float32)).to(dtype=out_dtype)
-
+    if bias:
+        # bias term should be > 1 so that the absolute tolerance can catch it
+        bias_t = torch.rand((n, ), device=device, dtype=out_dtype) + 1.0
+        out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias_t)
+    else:
+        out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype)
+        bias_t = 0
+
+    baseline = (torch.mm(scale_a * a.to(dtype=torch.float32),
+                         scale_b * b.to(dtype=torch.float32)) +
+                bias_t).to(dtype=out_dtype)
     assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0)
 
 
@@ -87,11 +102,12 @@ def cutlass_int8_gemm_helper(m: int,
 @pytest.mark.parametrize("k", [128, 496, 1024])
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
+@pytest.mark.parametrize("bias", [True, False])
 @pytest.mark.skipif(capability < 89,
                     reason="FP8 is not supported on this GPU type.")
 def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool,
-                          per_out_ch: bool):
-    cutlass_fp8_gemm_helper(m, n, k, per_act_token, per_out_ch)
+                          per_out_ch: bool, bias: bool):
+    cutlass_fp8_gemm_helper(m, n, k, per_act_token, per_out_ch, bias)
 
 
 @pytest.mark.parametrize("m", [512, 222, 33, 1])
@@ -99,49 +115,72 @@ def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool,
 @pytest.mark.parametrize("k", [128, 496, 1024])
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
+@pytest.mark.parametrize("bias", [True, False])
 def test_cutlass_int8_gemm(m: int, n: int, k: int, per_act_token: bool,
-                           per_out_ch: bool):
-    cutlass_int8_gemm_helper(m, n, k, per_act_token, per_out_ch)
+                           per_out_ch: bool, bias: bool):
+    cutlass_int8_gemm_helper(m, n, k, per_act_token, per_out_ch, bias)
 
 
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
 @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
+@pytest.mark.parametrize("bias", [True, False])
 def test_cutlass_int8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool,
-                                        out_dtype: Type[torch.dtype]):
-    cutlass_int8_gemm_helper(512, 512, 512, per_act_token, per_out_ch,
-                             out_dtype)
+                                        out_dtype: Type[torch.dtype],
+                                        bias: bool):
+    cutlass_int8_gemm_helper(512,
+                             512,
+                             512,
+                             per_act_token,
+                             per_out_ch,
+                             bias,
+                             out_dtype=out_dtype)
 
 
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
 @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
+@pytest.mark.parametrize("bias", [True, False])
 @pytest.mark.skipif(capability < 89,
                     reason="FP8 is not supported on this GPU type.")
 def test_cutlass_fp8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool,
-                                       out_dtype: Type[torch.dtype]):
-    cutlass_fp8_gemm_helper(512, 512, 512, per_act_token, per_out_ch,
-                            out_dtype)
+                                       out_dtype: Type[torch.dtype],
+                                       bias: bool):
+    cutlass_fp8_gemm_helper(512,
+                            512,
+                            512,
+                            per_act_token,
+                            per_out_ch,
+                            bias,
+                            out_dtype=out_dtype)
 
 
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
+@pytest.mark.parametrize("bias", [True, False])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @pytest.mark.skipif(capability < 89,
                     reason="FP8 is not supported on this GPU type.")
 def test_cutlass_fp8_gemm_devices(per_act_token: bool, per_out_ch: bool,
-                                  device: str):
-    cutlass_fp8_gemm_helper(512, 512, 512, per_act_token, per_out_ch,
+                                  bias: bool, device: str):
+    cutlass_fp8_gemm_helper(512, 512, 512, per_act_token, per_out_ch, bias,
                             torch.bfloat16, device)
 
 
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
+@pytest.mark.parametrize("bias", [True, False])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_cutlass_int8_gemm_devices(per_act_token: bool, per_out_ch: bool,
-                                   device: str):
-    cutlass_int8_gemm_helper(512, 512, 512, per_act_token, per_out_ch,
-                             torch.bfloat16, device)
+                                   bias: bool, device: str):
+    cutlass_int8_gemm_helper(512,
+                             512,
+                             512,
+                             per_act_token,
+                             per_out_ch,
+                             bias,
+                             out_dtype=torch.bfloat16,
+                             device=device)
 
 
 # For the following two tests:
@@ -151,20 +190,25 @@ def test_cutlass_int8_gemm_devices(per_act_token: bool, per_out_ch: bool,
 # kernel must handle any M thrown at it.
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
+@pytest.mark.parametrize("bias", [True, False])
 @pytest.mark.skipif(capability < 89,
                     reason="FP8 is not supported on this GPU type.")
-def test_cutlass_fp8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool):
+def test_cutlass_fp8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool,
+                                  bias: bool):
     for nk in range(32, 128, 32):
         for m in range(1, 128):
-            cutlass_fp8_gemm_helper(m, nk, nk, per_act_token, per_out_ch)
+            cutlass_fp8_gemm_helper(m, nk, nk, per_act_token, per_out_ch, bias)
 
 
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
-def test_cutlass_int8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool):
+@pytest.mark.parametrize("bias", [True, False])
+def test_cutlass_int8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool,
+                                   bias: bool):
     for nk in range(32, 128, 32):
         for m in range(1, 128):
-            cutlass_int8_gemm_helper(m, nk, nk, per_act_token, per_out_ch)
+            cutlass_int8_gemm_helper(m, nk, nk, per_act_token, per_out_ch,
+                                     bias)
 
 
 # Test working with a subset of A and B
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index e050c1172..479ea08e4 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -220,9 +220,12 @@ def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
     return torch.ops._C.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
 
 
-def cutlass_scaled_mm(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+def cutlass_scaled_mm(a: torch.Tensor,
+                      b: torch.Tensor,
+                      scale_a: torch.Tensor,
                       scale_b: torch.Tensor,
-                      out_dtype: Type[torch.dtype]) -> torch.Tensor:
+                      out_dtype: Type[torch.dtype],
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
     assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
     assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
 
@@ -230,7 +233,8 @@ def cutlass_scaled_mm(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
     n = b.shape[1]
     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
 
-    torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b)
+    torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
+
     return out
 
 
-- 
GitLab


From c54269d967f2546868d9a52a10c110adc8f9822a Mon Sep 17 00:00:00 2001
From: sasha0552 <admin@sasha0552.org>
Date: Wed, 26 Jun 2024 16:54:22 +0000
Subject: [PATCH 159/376] [Frontend] Add tokenize/detokenize endpoints (#5054)

---
 tests/entrypoints/test_openai_server.py       | 49 +++++++++++++++++++
 vllm/entrypoints/openai/api_server.py         | 31 +++++++++++-
 vllm/entrypoints/openai/protocol.py           | 21 ++++++++
 vllm/entrypoints/openai/serving_completion.py | 32 +++++++++++-
 vllm/entrypoints/openai/serving_engine.py     | 16 ++++--
 5 files changed, 143 insertions(+), 6 deletions(-)

diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index 5196d8181..4d9bfb460 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -9,6 +9,7 @@ import pytest
 # using Ray for overall ease of process management, parallel requests,
 # and debugging.
 import ray
+import requests
 import torch
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
@@ -1366,5 +1367,53 @@ async def test_long_seed(client: openai.AsyncOpenAI):
                 or "less_than_equal" in exc_info.value.message)
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_tokenize(server, client: openai.AsyncOpenAI, model_name: str):
+    base_url = str(client.base_url)[:-3]
+    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")
+
+    for add_special in [False, True]:
+        prompt = "This is a test prompt."
+        tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
+
+        response = requests.post(base_url + "/tokenize",
+                                 json={
+                                     "add_special_tokens": add_special,
+                                     "model": model_name,
+                                     "prompt": prompt
+                                 })
+        response.raise_for_status()
+        assert response.json() == {
+            "tokens": tokens,
+            "count": len(tokens),
+            "max_model_len": 8192
+        }
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_detokenize(server, client: openai.AsyncOpenAI, model_name: str):
+    base_url = str(client.base_url)[:-3]
+    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")
+
+    prompt = "This is a test prompt."
+    tokens = tokenizer.encode(prompt, add_special_tokens=False)
+
+    response = requests.post(base_url + "detokenize",
+                             json={
+                                 "model": model_name,
+                                 "tokens": tokens
+                             })
+    response.raise_for_status()
+    assert response.json() == {"prompt": prompt}
+
+
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index ea6275920..a708176c2 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -19,10 +19,17 @@ import vllm.envs as envs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.cli_args import make_arg_parser
+# yapf conflicts with isort for this block
+# yapf: disable
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               ChatCompletionResponse,
                                               CompletionRequest,
-                                              EmbeddingRequest, ErrorResponse)
+                                              DetokenizeRequest,
+                                              DetokenizeResponse,
+                                              EmbeddingRequest, ErrorResponse,
+                                              TokenizeRequest,
+                                              TokenizeResponse)
+# yapf: enable
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
@@ -85,6 +92,28 @@ async def health() -> Response:
     return Response(status_code=200)
 
 
+@app.post("/tokenize")
+async def tokenize(request: TokenizeRequest):
+    generator = await openai_serving_completion.create_tokenize(request)
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(content=generator.model_dump(),
+                            status_code=generator.code)
+    else:
+        assert isinstance(generator, TokenizeResponse)
+        return JSONResponse(content=generator.model_dump())
+
+
+@app.post("/detokenize")
+async def detokenize(request: DetokenizeRequest):
+    generator = await openai_serving_completion.create_detokenize(request)
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(content=generator.model_dump(),
+                            status_code=generator.code)
+    else:
+        assert isinstance(generator, DetokenizeResponse)
+        return JSONResponse(content=generator.model_dump())
+
+
 @app.get("/v1/models")
 async def show_available_models():
     models = await openai_serving_chat.show_available_models()
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index b57d79859..7fb1af158 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -699,3 +699,24 @@ class BatchRequestOutput(OpenAIBaseModel):
     # For requests that failed with a non-HTTP error, this will contain more
     # information on the cause of the failure.
     error: Optional[Any]
+
+
+class TokenizeRequest(OpenAIBaseModel):
+    model: str
+    prompt: str
+    add_special_tokens: bool = Field(default=True)
+
+
+class TokenizeResponse(OpenAIBaseModel):
+    tokens: List[int]
+    count: int
+    max_model_len: int
+
+
+class DetokenizeRequest(OpenAIBaseModel):
+    model: str
+    tokens: List[int]
+
+
+class DetokenizeResponse(OpenAIBaseModel):
+    prompt: str
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index c775fa6da..8741893c9 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -16,7 +16,11 @@ from vllm.entrypoints.openai.protocol import (CompletionLogProbs,
                                               CompletionResponseChoice,
                                               CompletionResponseStreamChoice,
                                               CompletionStreamResponse,
-                                              UsageInfo)
+                                              DetokenizeRequest,
+                                              DetokenizeResponse,
+                                              TokenizeRequest,
+                                              TokenizeResponse, UsageInfo)
+# yapf: enable
 from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
                                                     OpenAIServing)
 from vllm.logger import init_logger
@@ -442,3 +446,29 @@ class OpenAIServingCompletion(OpenAIServing):
             tokens=out_tokens,
             top_logprobs=out_top_logprobs,
         )
+
+    async def create_tokenize(self,
+                              request: TokenizeRequest) -> TokenizeResponse:
+        error_check_ret = await self._check_model(request)
+        if error_check_ret is not None:
+            return error_check_ret
+
+        (input_ids, input_text) = self._validate_prompt_and_tokenize(
+            request,
+            prompt=request.prompt,
+            add_special_tokens=request.add_special_tokens)
+
+        return TokenizeResponse(tokens=input_ids,
+                                count=len(input_ids),
+                                max_model_len=self.max_model_len)
+
+    async def create_detokenize(
+            self, request: DetokenizeRequest) -> DetokenizeResponse:
+        error_check_ret = await self._check_model(request)
+        if error_check_ret is not None:
+            return error_check_ret
+
+        (input_ids, input_text) = self._validate_prompt_and_tokenize(
+            request, prompt_ids=request.tokens)
+
+        return DetokenizeResponse(prompt=input_text)
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 6b5a62efc..84e412772 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -10,9 +10,10 @@ from vllm.config import ModelConfig
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               CompletionRequest,
+                                              DetokenizeRequest,
                                               EmbeddingRequest, ErrorResponse,
                                               ModelCard, ModelList,
-                                              ModelPermission)
+                                              ModelPermission, TokenizeRequest)
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import Logprob
@@ -99,8 +100,9 @@ class OpenAIServing:
         return json_str
 
     async def _check_model(
-        self, request: Union[CompletionRequest, ChatCompletionRequest,
-                             EmbeddingRequest]
+        self, request: Union[ChatCompletionRequest, CompletionRequest,
+                             DetokenizeRequest, EmbeddingRequest,
+                             TokenizeRequest]
     ) -> Optional[ErrorResponse]:
         if request.model in self.served_model_names:
             return None
@@ -126,7 +128,8 @@ class OpenAIServing:
     def _validate_prompt_and_tokenize(
             self,
             request: Union[ChatCompletionRequest, CompletionRequest,
-                           EmbeddingRequest],
+                           DetokenizeRequest, EmbeddingRequest,
+                           TokenizeRequest],
             prompt: Optional[str] = None,
             prompt_ids: Optional[List[int]] = None,
             truncate_prompt_tokens: Optional[Annotated[int,
@@ -174,6 +177,11 @@ class OpenAIServing:
                     f"generation. Please reduce the length of the input.", )
             return input_ids, input_text
 
+        # Note: TokenizeRequest and DetokenizeRequest doesn't have max_tokens
+        # and does not require model context length validation
+        if isinstance(request, (TokenizeRequest, DetokenizeRequest)):
+            return input_ids, input_text
+
         if request.max_tokens is None:
             if token_num >= self.max_model_len:
                 raise ValueError(
-- 
GitLab


From cbc53b6b8d87b29949ce13d504750f63714df532 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 26 Jun 2024 11:07:49 -0700
Subject: [PATCH 160/376] [Hardware][TPU] Support parallel sampling & Swapping
 (#5855)

---
 vllm/attention/backends/pallas.py | 30 +++++++---
 vllm/worker/tpu_model_runner.py   | 76 +++++++++++++++---------
 vllm/worker/tpu_worker.py         | 97 ++++++++++++++++++++++++-------
 3 files changed, 147 insertions(+), 56 deletions(-)

diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index 62b4a144f..121ca9ec4 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -28,21 +28,35 @@ class PallasAttentionBackend(AttentionBackend):
     ) -> Tuple[int, ...]:
         return (num_kv_heads, num_blocks, block_size, head_size)
 
+    @torch.compile(backend="openxla")
     @staticmethod
     def swap_blocks(
-        src_kv_cache: torch.Tensor,
-        dst_kv_cache: torch.Tensor,
-        src_to_dst: Dict[int, int],
+        src_kv_cache: Tuple[torch.Tensor, torch.Tensor],
+        dst_kv_cache: Tuple[torch.Tensor, torch.Tensor],
+        src_to_dst: Tuple[torch.Tensor, torch.Tensor],
     ) -> None:
-        raise NotImplementedError("swap_blocks is not implemented.")
+        src_k_cache, src_v_cache = src_kv_cache
+        dst_k_cache, dst_v_cache = dst_kv_cache
+        torch.ops.xla.dynamo_set_buffer_donor_(dst_k_cache, True)
+        torch.ops.xla.dynamo_set_buffer_donor_(dst_v_cache, True)
 
+        device = dst_k_cache.device
+        src_indices, dst_indices = src_to_dst
+        dst_k_cache[:, dst_indices] = src_k_cache[:, src_indices].to(device)
+        dst_v_cache[:, dst_indices] = src_v_cache[:, src_indices].to(device)
+
+    @torch.compile(backend="openxla")
     @staticmethod
     def copy_blocks(
-        kv_caches: List[torch.Tensor],
-        src_to_dists: Dict[int, List[int]],
+        kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
+        src_to_dists: Tuple[torch.Tensor, torch.Tensor],
     ) -> None:
-        # TODO(woosuk): Implement this.
-        raise NotImplementedError("copy_blocks is not implemented.")
+        src_indices, dst_indices = src_to_dists
+        for k_cache, v_cache in kv_caches:
+            torch.ops.xla.dynamo_set_buffer_donor_(k_cache, True)
+            k_cache[:, dst_indices] = k_cache[:, src_indices]
+            torch.ops.xla.dynamo_set_buffer_donor_(v_cache, True)
+            v_cache[:, dst_indices] = v_cache[:, src_indices]
 
 
 @dataclass
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index 2c70c1f91..c3ccbd025 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -22,6 +22,9 @@ logger = init_logger(__name__)
 _PAD_SLOT_ID = 0  # FIXME(woosuk)
 # FIXME(woosuk): Temporarily disabled top-p sampling since it's too slow.
 _ENABLE_TOP_P = False
+# FIXME(woosuk): A temporary hack to support `n > 1`.
+# This can significantly affect the performance if too large.
+_MAX_NUM_SAMPLES = 128
 
 
 class TPUModelRunner:
@@ -143,8 +146,9 @@ class TPUModelRunner:
         p = torch.ones((batch_size, ), dtype=torch.float32, device=self.device)
 
         # Dummy run.
+        num_samples = _MAX_NUM_SAMPLES if is_prompt else 1
         self.model(token_ids, position_ids, kv_caches, attn_metadata,
-                   input_lens, t, p)
+                   input_lens, t, p, num_samples)
 
     def warmup_model(
         self,
@@ -268,14 +272,11 @@ class TPUModelRunner:
         input_positions: List[List[int]] = []
         slot_mapping: List[List[int]] = []
         context_lens: List[int] = []
-        num_seq_groups = len(seq_group_metadata_list)
-        batch_size = _get_padded_batch_size(num_seq_groups)
 
-        for i, seq_group_metadata in enumerate(seq_group_metadata_list):
+        batch_idx = 0
+        for seq_group_metadata in seq_group_metadata_list:
             assert not seq_group_metadata.is_prompt
-
             seq_ids = list(seq_group_metadata.seq_data.keys())
-
             for seq_id in seq_ids:
                 seq_data = seq_group_metadata.seq_data[seq_id]
                 generation_token = seq_data.get_last_token_id()
@@ -288,14 +289,16 @@ class TPUModelRunner:
 
                 assert seq_group_metadata.block_tables is not None
                 block_table = seq_group_metadata.block_tables[seq_id]
-                self.block_tables[i, :len(block_table)] = block_table
+                self.block_tables[batch_idx, :len(block_table)] = block_table
+                batch_idx += 1
 
                 block_number = block_table[position // self.block_size]
                 block_offset = position % self.block_size
                 slot = block_number * self.block_size + block_offset
                 slot_mapping.append([slot])
 
-        num_paddings = batch_size - num_seq_groups
+        batch_size = _get_padded_batch_size(batch_idx)
+        num_paddings = batch_size - batch_idx
         input_tokens = input_tokens + [[0]] * num_paddings
         input_positions = input_positions + [[0]] * num_paddings
         slot_mapping = slot_mapping + [[_PAD_SLOT_ID]] * num_paddings
@@ -333,14 +336,13 @@ class TPUModelRunner:
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
         padded_batch_size: int,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> Tuple[torch.Tensor, torch.Tensor, List[int]]:
         assert len(seq_group_metadata_list) > 0
         t = []
         p = []
+        best_of = []
         for seq_group_metadata in seq_group_metadata_list:
-            assert seq_group_metadata.sampling_params is not None
             sampling_params = seq_group_metadata.sampling_params
-
             # NOTE(woosuk): Here we mimic argmax sampling by applying a very
             # low temperature. This is not accurate.
             t.append(sampling_params.temperature
@@ -354,10 +356,11 @@ class TPUModelRunner:
                 raise NotImplementedError(
                     "Top-k sampling is currently disabled for the TPU backend "
                     "due to performance issues.")
-            if sampling_params.best_of > 1:
+            if sampling_params.best_of > _MAX_NUM_SAMPLES:
                 raise NotImplementedError(
-                    "best_of > 1 is not currently supported by the TPU "
+                    f"Best of > {_MAX_NUM_SAMPLES} is not supported by the TPU "
                     "backend.")
+            best_of.append(sampling_params.best_of)
             if sampling_params.use_beam_search:
                 raise NotImplementedError(
                     "Beam search is not supported by the TPU backend.")
@@ -369,13 +372,19 @@ class TPUModelRunner:
                     "prompt_logprobs is not currently supported by the TPU "
                     "backend.")
 
-        num_paddings = padded_batch_size - len(seq_group_metadata_list)
+            # Repeat the sampling params if the seq group has multiple seqs.
+            num_seqs = len(seq_group_metadata.seq_data)
+            t += [t[-1]] * (num_seqs - 1)
+            p += [p[-1]] * (num_seqs - 1)
+            best_of += [best_of[-1]] * (num_seqs - 1)
+
+        num_paddings = padded_batch_size - len(t)
         t += [1.0] * num_paddings
         p += [1.0] * num_paddings
 
         t = torch.tensor(t, dtype=torch.float32, device=self.device)
         p = torch.tensor(p, dtype=torch.float32, device=self.device)
-        return t, p
+        return t, p, best_of
 
     def _execute_model(
         self,
@@ -392,28 +401,41 @@ class TPUModelRunner:
         else:
             inputs = self._prepare_decode(seq_group_metadata_list)
         padded_batch_size = inputs[0].shape[0]
-        t, p = self._prepare_sample(seq_group_metadata_list, padded_batch_size)
+        t, p, best_of = self._prepare_sample(seq_group_metadata_list,
+                                             padded_batch_size)
+        num_samples = _MAX_NUM_SAMPLES if is_prompt else 1
 
         # Execute the model.
         next_token_ids = self.model(inputs[0], inputs[1], kv_caches,
-                                    *inputs[2:], t, p)
+                                    *inputs[2:], t, p, num_samples)
         # Retrieve the outputs to CPU.
         next_token_ids = next_token_ids.cpu().tolist()
 
         # NOTE(woosuk): Minimal code to construct the sampler outputs.
         # The TPU backend does not reuse the sampler, since the TPU backend
         # does not support the advanced sampling parameters such as logprobs.
-        i = 0
+        zero_logprob = Logprob(0.0)
+        batch_idx = 0
         sampler_outputs = []
         for seq_group_metadata in seq_group_metadata_list:
             seq_outputs = []
             seq_ids = list(seq_group_metadata.seq_data.keys())
-            for seq_id in seq_ids:
-                next_token_id = next_token_ids[i]
-                seq_outputs.append(
-                    SequenceOutput(seq_id, next_token_id,
-                                   {next_token_id: Logprob(0.0)}))
-                i += 1
+            if is_prompt:
+                assert len(seq_ids) == 1
+                seq_id = seq_ids[0]
+                for i in range(best_of[batch_idx]):
+                    next_token_id = next_token_ids[batch_idx][i]
+                    seq_outputs.append(
+                        SequenceOutput(seq_id, next_token_id,
+                                       {next_token_id: zero_logprob}))
+                batch_idx += 1
+            else:
+                for seq_id in seq_ids:
+                    next_token_id = next_token_ids[batch_idx][0]
+                    seq_outputs.append(
+                        SequenceOutput(seq_id, next_token_id,
+                                       {next_token_id: zero_logprob}))
+                    batch_idx += 1
             sampler_outputs.append(
                 CompletionSequenceGroupOutput(seq_outputs, None))
         return sampler_outputs
@@ -458,6 +480,7 @@ class ModelWrapper(nn.Module):
         input_lens: torch.Tensor,
         t: torch.Tensor,
         p: torch.Tensor,
+        num_samples: int,
     ) -> torch.Tensor:
         """Executes the forward pass of the model and samples the next token.
 
@@ -520,8 +543,9 @@ class ModelWrapper(nn.Module):
         if _ENABLE_TOP_P:
             logits = _apply_top_p(logits, p.unsqueeze(dim=1))
         probs = torch.softmax(logits, dim=-1, dtype=torch.float32)
-        # FIXME(woosuk): best_of > 1 is not supported.
-        next_token_ids = torch.multinomial(probs, num_samples=1).squeeze(dim=1)
+        next_token_ids = torch.multinomial(probs,
+                                           num_samples,
+                                           replacement=True)
         return next_token_ids
 
 
diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index cd72c7119..c85bf6892 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -1,5 +1,5 @@
 import os
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Union
 
 import torch
 import torch_xla.core.xla_model as xm
@@ -117,19 +117,26 @@ class TPUWorker(LoraNotSupportedWorkerBase):
         # Synchronize before measuring the memory usage.
         xm.wait_device_ops()
 
+        dtype_btyes = get_dtype_size(self.cache_dtype)
+        block_size = self.cache_config.block_size
+        block_size_bytes = (dtype_btyes * block_size * num_layers * 2 *
+                            head_size * num_kv_heads)
+
+        # Calculate the TPU KV cache size based on profiling.
         m = xm.get_memory_info(self.device)
         total_memory_size = m["bytes_limit"]
         usable_memory_size = int(total_memory_size *
                                  self.cache_config.gpu_memory_utilization)
         profiled = m["bytes_used"]  # Weights + intermediate activations.
-        kv_cache_bytes = max(usable_memory_size - profiled, 0)
-        dtype_btyes = get_dtype_size(self.cache_dtype)
-        block_size = self.cache_config.block_size
-        num_tpu_blocks = (kv_cache_bytes //
-                          (dtype_btyes * block_size * num_layers * 2 *
-                           head_size * num_kv_heads))
+        tpu_kv_cache_bytes = max(usable_memory_size - profiled, 0)
+        num_tpu_blocks = tpu_kv_cache_bytes // block_size_bytes
         num_tpu_blocks = (num_tpu_blocks // 8) * 8  # Round down to 8.
-        return num_tpu_blocks, 0
+
+        # Calculate the CPU KV cache size based on the config.
+        num_cpu_blocks = (self.cache_config.swap_space_bytes //
+                          block_size_bytes)
+        num_cpu_blocks = (num_cpu_blocks // 8) * 8  # Round down to 8.
+        return num_tpu_blocks, num_cpu_blocks
 
     def initialize_cache(
         self,
@@ -145,15 +152,19 @@ class TPUWorker(LoraNotSupportedWorkerBase):
         num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config)
         head_size = self.model_config.get_head_size()
 
+        self.cpu_cache = []
         self.tpu_cache = []
         tpu_cache_shape = self.model_runner.attn_backend.get_kv_cache_shape(
             num_gpu_blocks, self.block_size, num_kv_heads, head_size)
         for _ in range(num_layers):
-            key_cache = torch.zeros(tpu_cache_shape,
-                                    dtype=dtype,
-                                    device=self.device)
-            value_cache = torch.zeros_like(key_cache)
-            self.tpu_cache.append((key_cache, value_cache))
+            tpu_k_cache = torch.zeros(tpu_cache_shape,
+                                      dtype=dtype,
+                                      device=self.device)
+            tpu_v_cache = torch.zeros_like(tpu_k_cache)
+            self.tpu_cache.append((tpu_k_cache, tpu_v_cache))
+            cpu_k_cache = torch.zeros_like(tpu_k_cache, device="cpu")
+            cpu_v_cache = torch.zeros_like(tpu_v_cache, device="cpu")
+            self.cpu_cache.append((cpu_k_cache, cpu_v_cache))
         self._warmup_model()
 
     def _warmup_model(self) -> None:
@@ -187,22 +198,48 @@ class TPUWorker(LoraNotSupportedWorkerBase):
         if not self.is_driver_worker:
             self._execute_model_non_driver()
             return []
-
         assert execute_model_req is not None
-        # Currently, TPUWorker does not support swapping.
-        # TODO(woosuk): Support block copying.
-        assert len(execute_model_req.blocks_to_swap_in) == 0, (
-            "Swapping is not supported for the TPU backend.")
-        assert len(execute_model_req.blocks_to_swap_out) == 0, (
-            "Swapping is not supported for the TPU backend.")
-        assert len(execute_model_req.blocks_to_copy) == 0
-
+        # Issue cache operations.
+        self.cache_swap(
+            execute_model_req.blocks_to_swap_in,
+            execute_model_req.blocks_to_swap_out,
+            execute_model_req.blocks_to_copy,
+        )
+        # Run the model.
         seq_group_metadata_list = execute_model_req.seq_group_metadata_list
         assert len(seq_group_metadata_list) > 0
         output = self.model_runner.execute_model(seq_group_metadata_list,
                                                  self.tpu_cache)
         return [output]
 
+    def cache_swap(
+        self,
+        blocks_to_swap_in: List[Tuple[int, int]],
+        blocks_to_swap_out: List[Tuple[int, int]],
+        blocks_to_copy: List[Tuple[int, int]],
+    ) -> None:
+        attn_backend = self.model_runner.attn_backend
+        num_layers = self.model_config.get_num_layers(self.parallel_config)
+
+        if blocks_to_swap_in:
+            # Swap from CPU to TPU.
+            src_to_dst = _make_src_to_dst(blocks_to_swap_in, "cpu",
+                                          self.device)
+            for i in range(num_layers):
+                attn_backend.swap_blocks(self.cpu_cache[i], self.tpu_cache[i],
+                                         src_to_dst)
+        if blocks_to_swap_out:
+            # Swap from TPU to CPU.
+            src_to_dst = _make_src_to_dst(blocks_to_swap_out, self.device,
+                                          "cpu")
+            for i in range(num_layers):
+                attn_backend.swap_blocks(self.tpu_cache[i], self.cpu_cache[i],
+                                         src_to_dst)
+        if blocks_to_copy:
+            src_to_dst = _make_src_to_dst(blocks_to_copy, self.device,
+                                          self.device)
+            attn_backend.copy_blocks(self.tpu_cache, src_to_dst)
+
     def start_worker_execution_loop(self) -> None:
         while self._execute_model_non_driver():
             pass
@@ -210,3 +247,19 @@ class TPUWorker(LoraNotSupportedWorkerBase):
     def _execute_model_non_driver(self) -> bool:
         self.model_runner.execute_model(None, self.tpu_cache)
         return True
+
+
+def _make_src_to_dst(
+    mapping: List[Tuple[int, int]],
+    src_device: Union[torch.device, str],
+    dst_device: Union[torch.device, str],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    src_indices = [i for i, _ in mapping]
+    dst_indices = [i for _, i in mapping]
+    src_indices = torch.tensor(src_indices,
+                               device=src_device,
+                               dtype=torch.int64)
+    dst_indices = torch.tensor(dst_indices,
+                               device=dst_device,
+                               dtype=torch.int64)
+    return src_indices, dst_indices
-- 
GitLab


From f5c8628fdc78bf9ca70206ef41175030fb67e870 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 26 Jun 2024 13:42:40 -0700
Subject: [PATCH 161/376] [Bugfix][TPU] Fix CPU cache allocation (#5869)

---
 vllm/attention/backends/pallas.py | 5 ++---
 vllm/worker/tpu_worker.py         | 8 ++++++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index 121ca9ec4..5dec11e2e 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -37,11 +37,10 @@ class PallasAttentionBackend(AttentionBackend):
     ) -> None:
         src_k_cache, src_v_cache = src_kv_cache
         dst_k_cache, dst_v_cache = dst_kv_cache
+        src_indices, dst_indices = src_to_dst
+        device = dst_k_cache.device
         torch.ops.xla.dynamo_set_buffer_donor_(dst_k_cache, True)
         torch.ops.xla.dynamo_set_buffer_donor_(dst_v_cache, True)
-
-        device = dst_k_cache.device
-        src_indices, dst_indices = src_to_dst
         dst_k_cache[:, dst_indices] = src_k_cache[:, src_indices].to(device)
         dst_v_cache[:, dst_indices] = src_v_cache[:, src_indices].to(device)
 
diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index c85bf6892..28f460c31 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -156,14 +156,18 @@ class TPUWorker(LoraNotSupportedWorkerBase):
         self.tpu_cache = []
         tpu_cache_shape = self.model_runner.attn_backend.get_kv_cache_shape(
             num_gpu_blocks, self.block_size, num_kv_heads, head_size)
+        cpu_cache_shape = self.model_runner.attn_backend.get_kv_cache_shape(
+            num_cpu_blocks, self.block_size, num_kv_heads, head_size)
         for _ in range(num_layers):
             tpu_k_cache = torch.zeros(tpu_cache_shape,
                                       dtype=dtype,
                                       device=self.device)
             tpu_v_cache = torch.zeros_like(tpu_k_cache)
             self.tpu_cache.append((tpu_k_cache, tpu_v_cache))
-            cpu_k_cache = torch.zeros_like(tpu_k_cache, device="cpu")
-            cpu_v_cache = torch.zeros_like(tpu_v_cache, device="cpu")
+            cpu_k_cache = torch.zeros(cpu_cache_shape,
+                                      dtype=dtype,
+                                      device="cpu")
+            cpu_v_cache = torch.zeros_like(cpu_k_cache)
             self.cpu_cache.append((cpu_k_cache, cpu_v_cache))
         self._warmup_model()
 
-- 
GitLab


From 38a1674abbba38344543170cb552e88e7f619167 Mon Sep 17 00:00:00 2001
From: Chip Kerchner <49959681+ChipKerchner@users.noreply.github.com>
Date: Wed, 26 Jun 2024 17:53:04 -0400
Subject: [PATCH 162/376] Support CPU inference with VSX PowerPC ISA (#5652)

---
 Dockerfile.ppc64le         |  22 ++
 cmake/cpu_extension.cmake  |  11 +-
 csrc/cpu/cpu_types.hpp     | 514 +-----------------------------------
 csrc/cpu/cpu_types_vsx.hpp | 491 +++++++++++++++++++++++++++++++++++
 csrc/cpu/cpu_types_x86.hpp | 515 +++++++++++++++++++++++++++++++++++++
 csrc/ops.h                 |   1 +
 requirements-cpu.txt       |   6 +-
 7 files changed, 1049 insertions(+), 511 deletions(-)
 create mode 100644 Dockerfile.ppc64le
 create mode 100644 csrc/cpu/cpu_types_vsx.hpp
 create mode 100644 csrc/cpu/cpu_types_x86.hpp

diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le
new file mode 100644
index 000000000..d4e4c483c
--- /dev/null
+++ b/Dockerfile.ppc64le
@@ -0,0 +1,22 @@
+FROM mambaorg/micromamba
+ARG MAMBA_DOCKERFILE_ACTIVATE=1
+USER root
+
+RUN apt-get update  -y     && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+# Some packages in requirements-cpu are installed here
+# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
+# Currently these may not be available for venv or pip directly
+RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults     python=3.10     pytorch-cpu=2.1.2     torchvision-cpu=0.16.2    &&     micromamba clean --all --yes
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+
+# These packages will be in rocketce eventually
+RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
+
+RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
+
+WORKDIR /vllm-workspace
+ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 511e443f7..690559ee2 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -46,6 +46,8 @@ is_avx512_disabled(AVX512_DISABLED)
 
 find_isa(${CPUINFO} "avx2" AVX2_FOUND)
 find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
+find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
+find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
 
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
     list(APPEND CXX_COMPILE_FLAGS
@@ -68,8 +70,15 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
 elseif (AVX2_FOUND)
     list(APPEND CXX_COMPILE_FLAGS "-mavx2")
     message(WARNING "vLLM CPU backend using AVX2 ISA")
+elseif (POWER9_FOUND OR POWER10_FOUND)
+    message(STATUS "PowerPC detected")
+    # Check for PowerPC VSX support
+    list(APPEND CXX_COMPILE_FLAGS
+        "-mvsx"
+        "-mcpu=native"
+        "-mtune=native")
 else()
-    message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 ISA support.")
+    message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
 endif()
 
 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp
index d7621aaae..0213be091 100644
--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
@@ -2,514 +2,14 @@
 #ifndef CPU_TYPES_HPP
 #define CPU_TYPES_HPP
 
-#include <immintrin.h>
-#include <torch/all.h>
-
-#ifndef __AVX2__
-static_assert(false, "AVX2 must be supported for the current implementation.");
-#endif
-
-namespace vec_op {
-
-// FIXME: FP16 is not fully supported in Torch-CPU
-#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)                                 \
-  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)                         \
-  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
-
-#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)                          \
-  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
-
-#ifndef CPU_OP_GUARD
-#define CPU_KERNEL_GUARD_IN(NAME)
-#define CPU_KERNEL_GUARD_OUT(NAME)
-#else
-#define CPU_KERNEL_GUARD_IN(NAME)                                              \
-  std::cout << #NAME << " invoked." << std::endl;
-#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
-#endif
-
-#define FORCE_INLINE __attribute__((always_inline)) inline
-
-namespace {
-template <typename T, T... indexes, typename F>
-constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
-  (f(std::integral_constant<T, indexes>{}), ...);
-}
-}; // namespace
-
-template <typename T, T count, typename F,
-          typename = std::enable_if_t<std::is_invocable_v<F, T>>>
-constexpr void unroll_loop(F &&f) {
-  unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
-}
-
-template <typename T> struct Vec {
-  constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
-};
-
-struct FP32Vec8;
-struct FP32Vec16;
-
-#ifdef __AVX512FP16__
-struct FP16Vec8 : public Vec<FP16Vec8> {
-  constexpr static int VEC_ELEM_NUM = 8;
-
-  __m128h reg;
-
-  explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {}
-
-  explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {}
-
-  explicit FP16Vec8(__m128h data) : reg(data) {}
-
-  FP16Vec8 operator*(const FP16Vec8 &b) const {
-    return FP16Vec8(_mm_mul_ph(reg, b.reg));
-  }
-
-  FP16Vec8 operator+(const FP16Vec8 &b) const {
-    return FP16Vec8(_mm_add_ph(reg, b.reg));
-  }
-
-  FP16Vec8 operator-(const FP16Vec8 &b) const {
-    return FP16Vec8(_mm_sub_ph(reg, b.reg));
-  }
-
-  FP16Vec8 operator/(const FP16Vec8 &b) const {
-    return FP16Vec8(_mm_div_ph(reg, b.reg));
-  }
-
-  void save(void *ptr) const { _mm_storeu_ph(ptr, reg); }
-};
-#endif
-
-struct BF16Vec8 : public Vec<BF16Vec8> {
-  constexpr static int VEC_ELEM_NUM = 8;
-
-  __m128i reg;
-
-  explicit BF16Vec8(const void *ptr)
-      : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {}
-
-  explicit BF16Vec8(const FP32Vec8 &);
-
-  void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; }
-};
-
-struct BF16Vec16 : public Vec<BF16Vec16> {
-  constexpr static int VEC_ELEM_NUM = 16;
-
-  __m256i reg;
-
-  explicit BF16Vec16(const void *ptr)
-      : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {}
-
-  explicit BF16Vec16(const FP32Vec16 &);
-
-  void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
-};
-
-#ifdef __AVX512F__
-struct BF16Vec32 : public Vec<BF16Vec32> {
-  constexpr static int VEC_ELEM_NUM = 32;
-
-  __m512i reg;
-
-  explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}
-
-  explicit BF16Vec32(__m512i data) : reg(data) {}
-
-  explicit BF16Vec32(BF16Vec8 &vec8_data)
-      : reg((__m512i)_mm512_inserti32x4(
-            _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
-                                                      (__m128i)vec8_data.reg),
-                                                  (__m128i)vec8_data.reg, 1),
-                               (__m128i)vec8_data.reg, 2),
-            (__m128i)vec8_data.reg, 3)) {}
-
-  void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; }
-};
-#else
-struct BF16Vec32 : public Vec<BF16Vec32> {
-  constexpr static int VEC_ELEM_NUM = 32;
-
-  __m256i reg_low;
-  __m256i reg_high;
-
-  explicit BF16Vec32(const void *ptr)
-      : reg_low(_mm256_loadu_si256((__m256i const *)ptr)),
-        reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {}
-
-  explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low),
-                                                  reg_high(high) {}
-
-  explicit BF16Vec32(BF16Vec8 &vec8_data)
-      : reg_low((__m256i)_mm256_inserti32x4(
-                _mm256_castsi128_si256((__m128i)vec8_data.reg),
-                                       (__m128i)vec8_data.reg, 1)),
-        reg_high((__m256i)_mm256_inserti32x4(
-                _mm256_castsi128_si256((__m128i)vec8_data.reg),
-                                       (__m128i)vec8_data.reg, 1)) {}
-
-  void save(void *ptr) const {
-    *reinterpret_cast<__m256i *>(ptr) = reg_low;
-    *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high;
-  }
-};
-#endif
-
-struct FP32Vec4 : public Vec<FP32Vec4> {
-  constexpr static int VEC_ELEM_NUM = 4;
-  union AliasReg {
-    __m128 reg;
-    float values[VEC_ELEM_NUM];
-  };
-
-  __m128 reg;
-
-  explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {}
-
-  explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {}
-
-  explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {}
-
-  explicit FP32Vec4(__m128 data) : reg(data) {}
-
-  explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}
-};
-
-struct FP32Vec8 : public Vec<FP32Vec8> {
-  constexpr static int VEC_ELEM_NUM = 8;
-  union AliasReg {
-    __m256 reg;
-    float values[VEC_ELEM_NUM];
-  };
-
-  __m256 reg;
-
-  explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {}
-
-  explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {}
-
-  explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {}
-
-  explicit FP32Vec8(__m256 data) : reg(data) {}
-
-  explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}
-
-#ifdef __AVX512FP16__
-  explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {}
-#endif
-
-  explicit FP32Vec8(const BF16Vec8 &v)
-      : reg(_mm256_castsi256_ps(
-            _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {}
-
-  float reduce_sum() const {
-    AliasReg ar;
-    ar.reg = reg;
-    float result = 0;
-    unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; });
-
-    return result;
-  }
-
-  FP32Vec8 exp() const {
-    AliasReg ar;
-    ar.reg = reg;
-    return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]),
-                                  expf(ar.values[5]), expf(ar.values[4]),
-                                  expf(ar.values[3]), expf(ar.values[2]),
-                                  expf(ar.values[1]), expf(ar.values[0])));
-  }
-
-  FP32Vec8 tanh() const {
-    AliasReg ar;
-    ar.reg = reg;
-    return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]),
-                                  tanhf(ar.values[5]), tanhf(ar.values[4]),
-                                  tanhf(ar.values[3]), tanhf(ar.values[2]),
-                                  tanhf(ar.values[1]), tanhf(ar.values[0])));
-  }
-
-  FP32Vec8 er() const {
-    AliasReg ar;
-    ar.reg = reg;
-    return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]),
-                                  erf(ar.values[5]), erf(ar.values[4]),
-                                  erf(ar.values[3]), erf(ar.values[2]),
-                                  erf(ar.values[1]), erf(ar.values[0])));
-  }
-
-  FP32Vec8 operator*(const FP32Vec8 &b) const {
-    return FP32Vec8(_mm256_mul_ps(reg, b.reg));
-  }
-
-  FP32Vec8 operator+(const FP32Vec8 &b) const {
-    return FP32Vec8(_mm256_add_ps(reg, b.reg));
-  }
-
-  FP32Vec8 operator-(const FP32Vec8 &b) const {
-    return FP32Vec8(_mm256_sub_ps(reg, b.reg));
-  }
-
-  FP32Vec8 operator/(const FP32Vec8 &b) const {
-    return FP32Vec8(_mm256_div_ps(reg, b.reg));
-  }
-
-  void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
-};
-
-#ifdef __AVX512F__
-struct FP32Vec16 : public Vec<FP32Vec16> {
-  constexpr static int VEC_ELEM_NUM = 16;
-  union AliasReg {
-    __m512 reg;
-    float values[VEC_ELEM_NUM];
-  };
-
-  __m512 reg;
-
-  explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {}
-
-  explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}
-
-  explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {}
-
-  explicit FP32Vec16(__m512 data) : reg(data) {}
-
-  explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}
-
-  explicit FP32Vec16(const FP32Vec4 &data)
-      : reg((__m512)_mm512_inserti32x4(
-            _mm512_inserti32x4(
-                _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg),
-                                   (__m128i)data.reg, 1),
-                (__m128i)data.reg, 2),
-            (__m128i)data.reg, 3)) {}
-
-  explicit FP32Vec16(const FP32Vec8 &data)
-      : reg((__m512)_mm512_inserti32x8(
-            _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {}
-
-  explicit FP32Vec16(const BF16Vec16 &v)
-      : reg(_mm512_castsi512_ps(
-            _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {}
-
-  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
-
-  FP32Vec16 operator*(const FP32Vec16 &b) const {
-    return FP32Vec16(_mm512_mul_ps(reg, b.reg));
-  }
-
-  FP32Vec16 operator+(const FP32Vec16 &b) const {
-    return FP32Vec16(_mm512_add_ps(reg, b.reg));
-  }
-
-  FP32Vec16 operator-(const FP32Vec16 &b) const {
-    return FP32Vec16(_mm512_sub_ps(reg, b.reg));
-  }
-
-  FP32Vec16 operator/(const FP32Vec16 &b) const {
-    return FP32Vec16(_mm512_div_ps(reg, b.reg));
-  }
-
-  float reduce_sum() const { return _mm512_reduce_add_ps(reg); }
-
-  template <int group_size> float reduce_sub_sum(int idx) {
-    static_assert(VEC_ELEM_NUM % group_size == 0);
-    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
-    __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size));
-    return _mm512_mask_reduce_add_ps(mask, reg);
-  }
-
-  void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
-};
+#if defined(__x86_64__)
+  //x86 implementation
+  #include "cpu_types_x86.hpp"
+#elif defined(__POWER9_VECTOR__)
+  //ppc implementation
+  #include "cpu_types_vsx.hpp"
 #else
-struct FP32Vec16 : public Vec<FP32Vec16> {
-  constexpr static int VEC_ELEM_NUM = 16;
-
-  union AliasReg {
-    __m256 reg;
-    float values[8];
-  };
-
-  __m256 reg_low;
-  __m256 reg_high;
-
-  explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)),
-                                reg_high(_mm256_set1_ps(v)) {}
-
-  explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)),
-                         reg_high(_mm256_set1_ps(0.0)) {}
-
-  explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)),
-                                         reg_high(_mm256_loadu_ps(ptr + 8)) {}
-
-  explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {}
-
-  explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low),
-                                              reg_high(data.reg_high) {}
-
-  explicit FP32Vec16(const FP32Vec4 &data)
-      : reg_low((__m256)_mm256_inserti128_si256(
-                _mm256_castsi128_si256((__m128i)data.reg),
-                                       (__m128i)data.reg, 1)),
-        reg_high((__m256)_mm256_inserti128_si256(
-                 _mm256_castsi128_si256((__m128i)data.reg),
-                                       (__m128i)data.reg, 1)) {}
-
-  explicit FP32Vec16(const FP32Vec8 &data)
-      : reg_low(data.reg), reg_high(data.reg) {}
-
-  explicit FP32Vec16(const BF16Vec16 &v) {
-    __m128i low = _mm256_extractf128_si256(v.reg, 0);
-    __m128i high = _mm256_extractf128_si256(v.reg, 1);
-
-    __m256i v_low_epi32 = _mm256_cvtepu16_epi32(low);
-    __m256i v_high_epi32 = _mm256_cvtepu16_epi32(high);
-
-    __m256i v_low_shifted = _mm256_bslli_epi128(v_low_epi32, 2);
-    __m256i v_high_shifted = _mm256_bslli_epi128(v_high_epi32, 2);
-
-    reg_low = _mm256_castsi256_ps(v_low_shifted);
-    reg_high = _mm256_castsi256_ps(v_high_shifted);
-  }
-
-  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
-
-  FP32Vec16 operator*(const FP32Vec16 &b) const {
-    return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low),
-                     _mm256_mul_ps(reg_high, b.reg_high));
-  }
-
-  FP32Vec16 operator+(const FP32Vec16 &b) const {
-    return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low),
-                     _mm256_add_ps(reg_high, b.reg_high));
-  }
-
-  FP32Vec16 operator-(const FP32Vec16 &b) const {
-    return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low),
-                     _mm256_sub_ps(reg_high, b.reg_high));
-  }
-
-  FP32Vec16 operator/(const FP32Vec16 &b) const {
-    return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low),
-                     _mm256_div_ps(reg_high, b.reg_high));
-  }
-
-  float reduce_sum() const {
-    FP32Vec8 low = FP32Vec8(reg_low);
-    FP32Vec8 high = FP32Vec8(reg_high);
-    return low.reduce_sum() + high.reduce_sum();
-  }
-
-  template <int group_size> float reduce_sub_sum(int idx) {
-    float sum = 0.0;
-    static_assert(VEC_ELEM_NUM % group_size == 0);
-    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
-    uint32_t mask = base_mask << (idx * group_size);
-
-    AliasReg ar;
-
-    auto func = [&sum, &mask, &ar](int i) {
-      int flag = mask & 0x1;
-      mask = mask >> 1;
-      if (flag != 0) sum += ar.values[i];
-    };
-
-    ar.reg = reg_low;
-    unroll_loop<int, 8>(func);
-
-    ar.reg = reg_high;
-    unroll_loop<int, 8>(func);
-
-    return sum;
-  }
-
-  void save(float *ptr) const {
-    _mm256_storeu_ps(ptr, reg_low);
-    _mm256_storeu_ps(ptr + 8, reg_high);
-  }
-};
-#endif
-
-template <typename T> struct VecType { using vec_type = void; };
-
-template <typename T> using vec_t = typename VecType<T>::vec_type;
-
-template <> struct VecType<float> { using vec_type = FP32Vec8; };
-
-#ifdef __AVX512FP16__
-template <> struct VecType<c10::Half> { using vec_type = FP16Vec16; };
+  #warning "unsupported vLLM cpu implementation"
 #endif
 
-template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
-
-template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
-
-#ifdef __AVX512FP16__
-template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
-  *reinterpret_cast<_Float16 *>(ptr) = v;
-}
-#endif
-
-inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
-  acc = acc + a * b;
-}
-
-#ifdef __AVX512BF16__
-template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
-  *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v);
-}
-
-inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
-    : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {}
-
-inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
-    : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {}
-
-inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
-  acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg);
-}
-#else
-template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
-  c10::BFloat16 __attribute__((__may_alias__)) *v_ptr =
-      reinterpret_cast<c10::BFloat16 *>(&v);
-  *ptr = *(v_ptr + 1);
-}
-
-#ifdef __AVX512F__
-inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
-    : reg(_mm256_cvtepi32_epi16(
-          _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {}
-
-inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
-    : reg(_mm512_cvtepi32_epi16(
-          _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {}
-#else
-namespace{
-__m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) {
-  __m256i ai = _mm256_castps_si256(a);
-  ai = _mm256_srli_epi32(ai, 16);
-  ai = _mm256_packus_epi32(ai, ai);
-  ai = _mm256_permute4x64_epi64(ai, 0b00111001);
-  return _mm256_extracti128_si256(ai, 0);
-}
-}
-
-inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
-    : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {}
-
-inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) {
-  BF16Vec8 low = BF16Vec8(FP32Vec8(v.reg_low));
-  BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high));
-  reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1);
-}
-#endif // __AVX512F__
-#endif // __AVX512BF16__
-
-inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); }
-
-}; // namespace vec_op
-
 #endif
diff --git a/csrc/cpu/cpu_types_vsx.hpp b/csrc/cpu/cpu_types_vsx.hpp
new file mode 100644
index 000000000..b50bdadc5
--- /dev/null
+++ b/csrc/cpu/cpu_types_vsx.hpp
@@ -0,0 +1,491 @@
+
+#ifndef CPU_TYPES_VSX_HPP
+#define CPU_TYPES_VSX_HPP
+
+#include <altivec.h>
+#include <cmath>
+#include <torch/all.h>
+
+namespace vec_op {
+
+// FIXME: FP16 is not fully supported in Torch-CPU
+#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)                                 \
+  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)                         \
+  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
+
+#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)                          \
+  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
+
+#ifndef CPU_OP_GUARD
+#define CPU_KERNEL_GUARD_IN(NAME)
+#define CPU_KERNEL_GUARD_OUT(NAME)
+#else
+#define CPU_KERNEL_GUARD_IN(NAME)                                              \
+  std::cout << #NAME << " invoked." << std::endl;
+#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
+#endif
+
+#define FORCE_INLINE __attribute__((always_inline)) inline
+
+namespace {
+template <typename T, T... indexes, typename F>
+constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
+  (f(std::integral_constant<T, indexes>{}), ...);
+}
+}; // namespace
+
+template <typename T, T count, typename F,
+          typename = std::enable_if_t<std::is_invocable_v<F, T>>>
+constexpr void unroll_loop(F &&f) {
+  unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
+}
+
+template <typename T> struct Vec {
+  constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
+};
+
+typedef struct ss16x8x2_t {
+  __vector signed short val[2];
+} ss16x8x2_t;
+
+typedef struct ss16x8x4_t {
+  __vector signed short val[4];
+} ss16x8x4_t;
+
+typedef struct f32x4x2_t {
+  __vector float val[2];
+} f32x4x2_t;
+
+typedef struct f32x4x4_t {
+  __vector float val[4];
+} f32x4x4_t;
+
+struct FP32Vec8;
+struct FP32Vec16;
+
+struct BF16Vec8 : public Vec<BF16Vec8> {
+  constexpr static int VEC_ELEM_NUM = 8;
+
+  __vector signed short reg;
+
+  explicit BF16Vec8(const void *ptr)
+      : reg((__vector signed short)vec_xl(0, (__vector signed short *)ptr)) {}
+
+  explicit BF16Vec8(const FP32Vec8 &);
+
+  void save(void *ptr) const { *reinterpret_cast<__vector signed short *>(ptr) = reg; }
+};
+
+struct BF16Vec16 : public Vec<BF16Vec16> {
+  constexpr static int VEC_ELEM_NUM = 16;
+
+  ss16x8x2_t reg;
+
+  explicit BF16Vec16(const void *ptr) {
+    // Load 256 bits in two parts
+    reg.val[0] = (__vector signed short)vec_xl(0,  (signed short *)ptr);
+    reg.val[1] = (__vector signed short)vec_xl(16, (signed short *)ptr);
+  }
+
+  explicit BF16Vec16(const FP32Vec16 &);
+
+  void save(void *ptr) const {
+    // Save 256 bits in two parts
+    vec_xst(reg.val[0], 0, (signed short *)ptr);
+    vec_xst(reg.val[1], 16, (signed short *)ptr);
+  }
+};
+
+const static __vector signed short zero = vec_splats((signed short)0);
+
+struct BF16Vec32 : public Vec<BF16Vec32> {
+  constexpr static int VEC_ELEM_NUM = 32;
+
+  ss16x8x4_t reg;
+  explicit BF16Vec32(const void *ptr)
+      : reg(*reinterpret_cast<const ss16x8x4_t *>(ptr)) {}
+
+  explicit BF16Vec32(ss16x8x4_t data) : reg(data) {}
+
+  explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({
+    vec8_data.reg,
+    vec8_data.reg,
+    vec8_data.reg,
+    vec8_data.reg
+  }) {}
+
+  void save(void *ptr) const { *reinterpret_cast<ss16x8x4_t *>(ptr) = reg; }
+};
+
+struct FP32Vec4 : public Vec<FP32Vec4> {
+  constexpr static int VEC_ELEM_NUM = 4;
+  union AliasReg {
+    __vector float reg;
+    float values[VEC_ELEM_NUM];
+  };
+
+  __vector float reg;
+
+  explicit FP32Vec4(float v) : reg(vec_splats(v)) {}
+
+  explicit FP32Vec4() : reg(vec_splats(0.0f)) {}
+
+  explicit FP32Vec4(const float *ptr) : reg(vec_xl(0, ptr)) {}
+
+  explicit FP32Vec4(__vector float data) : reg(data) {}
+
+  explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}
+};
+
+struct FP32Vec8 : public Vec<FP32Vec8> {
+  constexpr static int VEC_ELEM_NUM = 8;
+  union AliasReg {
+    f32x4x2_t reg;
+    float values[VEC_ELEM_NUM];
+  };
+
+  f32x4x2_t reg;
+
+  explicit FP32Vec8(float v) {
+    reg.val[0] = vec_splats(v);
+    reg.val[1] = vec_splats(v);
+  }
+
+  explicit FP32Vec8() {
+    reg.val[0] = vec_splats(0.0f);
+    reg.val[1] = vec_splats(0.0f);
+  }
+
+  explicit FP32Vec8(const float *ptr) {
+    reg.val[0] = vec_xl(0, ptr);
+    reg.val[1] = vec_xl(16, ptr);
+  }
+
+  explicit FP32Vec8(f32x4x2_t data) : reg(data) {}
+
+  explicit FP32Vec8(const FP32Vec8 &data) {
+    reg.val[0] = data.reg.val[0];
+    reg.val[1] = data.reg.val[1];
+  }
+
+  explicit FP32Vec8(const BF16Vec8 &v) {
+    reg.val[0] = (__vector float)vec_mergeh(zero, v.reg);
+    reg.val[1] = (__vector float)vec_mergel(zero, v.reg);
+  }
+
+  float reduce_sum() const {
+    AliasReg ar;
+    ar.reg = reg;
+    float result = 0;
+    unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; });
+
+    return result;
+  }
+
+  FP32Vec8 exp() const {
+    // TODO: Vectorize this
+    AliasReg ar;
+    ar.reg = reg;
+    f32x4x4_t ret;
+    ret.val[0][0] = std::exp(ar.values[0]);
+    ret.val[0][1] = std::exp(ar.values[1]);
+    ret.val[0][2] = std::exp(ar.values[2]);
+    ret.val[0][3] = std::exp(ar.values[3]);
+    ret.val[1][0] = std::exp(ar.values[4]);
+    ret.val[1][1] = std::exp(ar.values[5]);
+    ret.val[1][2] = std::exp(ar.values[6]);
+    ret.val[1][3] = std::exp(ar.values[7]);
+    return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
+  }
+
+  FP32Vec8 tanh() const {
+    // TODO: Vectorize this
+    AliasReg ar;
+    ar.reg = reg;
+    f32x4x4_t ret;
+    ret.val[0][0] = std::tanh(ar.values[0]);
+    ret.val[0][1] = std::tanh(ar.values[1]);
+    ret.val[0][2] = std::tanh(ar.values[2]);
+    ret.val[0][3] = std::tanh(ar.values[3]);
+    ret.val[1][0] = std::tanh(ar.values[4]);
+    ret.val[1][1] = std::tanh(ar.values[5]);
+    ret.val[1][2] = std::tanh(ar.values[6]);
+    ret.val[1][3] = std::tanh(ar.values[7]);
+    return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
+  }
+
+  FP32Vec8 er() const {
+    // TODO: Vectorize this
+    AliasReg ar;
+    ar.reg = reg;
+    f32x4x4_t ret;
+    ret.val[0][0] = std::erf(ar.values[0]);
+    ret.val[0][1] = std::erf(ar.values[1]);
+    ret.val[0][2] = std::erf(ar.values[2]);
+    ret.val[0][3] = std::erf(ar.values[3]);
+    ret.val[1][0] = std::erf(ar.values[4]);
+    ret.val[1][1] = std::erf(ar.values[5]);
+    ret.val[1][2] = std::erf(ar.values[6]);
+    ret.val[1][3] = std::erf(ar.values[7]);
+    return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
+  }
+
+  FP32Vec8 operator*(const FP32Vec8 &b) const {
+    return FP32Vec8({vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])});
+  }
+
+  FP32Vec8 operator+(const FP32Vec8 &b) const {
+    return FP32Vec8({vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])});
+  }
+
+  FP32Vec8 operator-(const FP32Vec8 &b) const {
+    return FP32Vec8({vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])});
+  }
+
+  FP32Vec8 operator/(const FP32Vec8 &b) const {
+    return FP32Vec8({vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])});
+  }
+
+  void save(float *ptr) const {
+    vec_xst(reg.val[0], 0, ptr);
+    vec_xst(reg.val[1], 16, ptr);
+  }
+};
+
+struct FP32Vec16 : public Vec<FP32Vec16> {
+  constexpr static int VEC_ELEM_NUM = 16;
+  union AliasReg {
+    f32x4x4_t reg;
+    float values[VEC_ELEM_NUM];
+  };
+
+  f32x4x4_t reg;
+
+  explicit FP32Vec16(float v) {
+    reg.val[0] = vec_splats(v);
+    reg.val[1] = vec_splats(v);
+    reg.val[2] = vec_splats(v);
+    reg.val[3] = vec_splats(v);
+  }
+
+  explicit FP32Vec16() {
+    reg.val[0] = vec_splats(0.0f);
+    reg.val[1] = vec_splats(0.0f);
+    reg.val[2] = vec_splats(0.0f);
+    reg.val[3] = vec_splats(0.0f);
+  }
+
+  explicit FP32Vec16(const float *ptr) {
+    reg.val[0] = vec_xl(0, ptr);
+    reg.val[1] = vec_xl(16, ptr);
+    reg.val[2] = vec_xl(32, ptr);
+    reg.val[3] = vec_xl(48, ptr);
+  }
+
+  explicit FP32Vec16(f32x4x4_t data) : reg(data) {}
+
+  explicit FP32Vec16(const FP32Vec16 &data) {
+    reg.val[0] = data.reg.val[0];
+    reg.val[1] = data.reg.val[1];
+    reg.val[2] = data.reg.val[2];
+    reg.val[3] = data.reg.val[3];
+  }
+
+  explicit FP32Vec16(const FP32Vec4 &data) {
+    reg.val[0] = data.reg;
+    reg.val[1] = data.reg;
+    reg.val[2] = data.reg;
+    reg.val[3] = data.reg;
+  }
+
+  explicit FP32Vec16(const FP32Vec8 &data) {
+    reg.val[0] = data.reg.val[0];
+    reg.val[1] = data.reg.val[1];
+    reg.val[2] = data.reg.val[0];
+    reg.val[3] = data.reg.val[1];
+  }
+
+  explicit FP32Vec16(const BF16Vec16 &v) {
+    reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
+    reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
+    reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]);
+    reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]);
+  }
+
+  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
+
+  FP32Vec16 operator*(const FP32Vec16 &b) const {
+    return FP32Vec16(f32x4x4_t({
+        vec_mul(reg.val[0], b.reg.val[0]),
+        vec_mul(reg.val[1], b.reg.val[1]),
+        vec_mul(reg.val[2], b.reg.val[2]),
+        vec_mul(reg.val[3], b.reg.val[3])}));
+  }
+
+  FP32Vec16 operator+(const FP32Vec16 &b) const {
+    return FP32Vec16(f32x4x4_t({
+        vec_add(reg.val[0], b.reg.val[0]),
+        vec_add(reg.val[1], b.reg.val[1]),
+        vec_add(reg.val[2], b.reg.val[2]),
+        vec_add(reg.val[3], b.reg.val[3])}));
+  }
+
+  FP32Vec16 operator-(const FP32Vec16 &b) const {
+    return FP32Vec16(f32x4x4_t({
+        vec_sub(reg.val[0], b.reg.val[0]),
+        vec_sub(reg.val[1], b.reg.val[1]),
+        vec_sub(reg.val[2], b.reg.val[2]),
+        vec_sub(reg.val[3], b.reg.val[3])}));
+  }
+
+  FP32Vec16 operator/(const FP32Vec16 &b) const {
+    return FP32Vec16(f32x4x4_t({
+        vec_div(reg.val[0], b.reg.val[0]),
+        vec_div(reg.val[1], b.reg.val[1]),
+        vec_div(reg.val[2], b.reg.val[2]),
+        vec_div(reg.val[3], b.reg.val[3])}));
+  }
+
+  float reduce_sum() const {
+    AliasReg ar;
+    ar.reg = reg;
+    float result = 0;
+    unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; });
+
+    return result;
+  }
+
+  template <int group_size> float reduce_sub_sum(int idx) {
+    static_assert(VEC_ELEM_NUM % group_size == 0);
+
+    AliasReg ar;
+    ar.reg = reg;
+    float result = 0;
+    const int start = idx * group_size;
+    unroll_loop<int, group_size>(
+        [&result, &start, ar](int i) { result += ar.values[start + i]; });
+
+    return result;
+  }
+
+  void save(float *ptr) const {
+    vec_xst(reg.val[0], 0, ptr);
+    vec_xst(reg.val[1], 16, ptr);
+    vec_xst(reg.val[2], 32, ptr);
+    vec_xst(reg.val[3], 48, ptr);
+  }
+};
+
+template <typename T> struct VecType { using vec_type = void; };
+
+template <typename T> using vec_t = typename VecType<T>::vec_type;
+
+template <> struct VecType<float> { using vec_type = FP32Vec8; };
+
+template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
+
+template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
+
+inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
+  acc = acc + a * b;
+}
+
+template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
+  c10::BFloat16 __attribute__((__may_alias__)) *v_ptr =
+      reinterpret_cast<c10::BFloat16 *>(&v);
+  *ptr = *(v_ptr + 1);
+}
+
+#ifndef __VEC_CLASS_FP_NAN
+#define __VEC_CLASS_FP_NAN (1 << 6)
+#endif
+
+const static __vector unsigned char omask = { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+#ifndef _ARCH_PWR10
+const static __vector unsigned int bias = { 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff };
+const static __vector unsigned int nan  = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 };
+const static __vector unsigned int sh16 = { 16, 16, 16, 16 };
+const static __vector unsigned int one  = { 1, 1, 1, 1 };
+#endif
+
+inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) {
+#ifdef _ARCH_PWR10
+  __vector signed short ret[2];
+  ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]);
+  ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]);
+  reg = vec_perm(ret[0], ret[1], omask);
+#elif defined(_ARCH_PWR9)
+  __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
+  __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
+  __vector unsigned int lsb0 = vec_sr(inp0, sh16);
+  __vector unsigned int lsb1 = vec_sr(inp1, sh16);
+  lsb0 = vec_and(lsb0, one);
+  lsb1 = vec_and(lsb1, one);
+  __vector unsigned int rnd0 = vec_add(lsb0, bias);
+  __vector unsigned int rnd1 = vec_add(lsb1, bias);
+  inp0 = vec_add(inp0, rnd0);
+  inp1 = vec_add(inp1, rnd1);
+  __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN);
+  __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN);
+  inp0 = vec_sel(inp0, nan, sel0);
+  inp1 = vec_sel(inp1, nan, sel1);
+  inp0 = vec_sr(inp0, sh16);
+  inp1 = vec_sr(inp1, sh16);
+  reg = (__vector signed short)vec_perm(inp0, inp1, omask);
+#endif
+}
+
+inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) {
+#ifdef _ARCH_PWR10
+  __vector signed short ret[4];
+  ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]);
+  ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]);
+  ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[2]);
+  ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[3]);
+  reg.val[0] = vec_perm(ret[0], ret[1], omask);
+  reg.val[1] = vec_perm(ret[2], ret[3], omask);
+#elif defined(_ARCH_PWR9)
+  __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
+  __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
+  __vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]);
+  __vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]);
+  __vector unsigned int lsb0 = vec_sr(inp0, sh16);
+  __vector unsigned int lsb1 = vec_sr(inp1, sh16);
+  __vector unsigned int lsb2 = vec_sr(inp2, sh16);
+  __vector unsigned int lsb3 = vec_sr(inp3, sh16);
+  lsb0 = vec_and(lsb0, one);
+  lsb1 = vec_and(lsb1, one);
+  lsb2 = vec_and(lsb2, one);
+  lsb3 = vec_and(lsb3, one);
+  __vector unsigned int rnd0 = vec_add(lsb0, bias);
+  __vector unsigned int rnd1 = vec_add(lsb1, bias);
+  __vector unsigned int rnd2 = vec_add(lsb2, bias);
+  __vector unsigned int rnd3 = vec_add(lsb3, bias);
+  inp0 = vec_add(inp0, rnd0);
+  inp1 = vec_add(inp1, rnd1);
+  inp2 = vec_add(inp2, rnd2);
+  inp3 = vec_add(inp3, rnd3);
+  __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN);
+  __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN);
+  __vector __bool int sel2 = vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN);
+  __vector __bool int sel3 = vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN);
+  inp0 = vec_sel(inp0, nan, sel0);
+  inp1 = vec_sel(inp1, nan, sel1);
+  inp2 = vec_sel(inp2, nan, sel2);
+  inp3 = vec_sel(inp3, nan, sel3);
+  inp0 = vec_sr(inp0, sh16);
+  inp1 = vec_sr(inp1, sh16);
+  inp2 = vec_sr(inp2, sh16);
+  inp3 = vec_sr(inp3, sh16);
+  reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask);
+  reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask);
+#endif
+}
+
+inline void prefetch(const void *addr) {
+  __asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory");
+}
+
+}; // namespace vec_op
+
+#endif
diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp
new file mode 100644
index 000000000..f50620a52
--- /dev/null
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -0,0 +1,515 @@
+
+#ifndef CPU_TYPES_X86_HPP
+#define CPU_TYPES_X86_HPP
+
+#include <immintrin.h>
+#include <torch/all.h>
+
+#ifndef __AVX2__
+static_assert(false, "AVX2 must be supported for the current implementation.");
+#endif
+
+namespace vec_op {
+
+// FIXME: FP16 is not fully supported in Torch-CPU
+#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)                                 \
+  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)                         \
+  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
+
+#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)                          \
+  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
+
+#ifndef CPU_OP_GUARD
+#define CPU_KERNEL_GUARD_IN(NAME)
+#define CPU_KERNEL_GUARD_OUT(NAME)
+#else
+#define CPU_KERNEL_GUARD_IN(NAME)                                              \
+  std::cout << #NAME << " invoked." << std::endl;
+#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
+#endif
+
+#define FORCE_INLINE __attribute__((always_inline)) inline
+
+namespace {
+template <typename T, T... indexes, typename F>
+constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
+  (f(std::integral_constant<T, indexes>{}), ...);
+}
+}; // namespace
+
+template <typename T, T count, typename F,
+          typename = std::enable_if_t<std::is_invocable_v<F, T>>>
+constexpr void unroll_loop(F &&f) {
+  unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
+}
+
+template <typename T> struct Vec {
+  constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
+};
+
+struct FP32Vec8;
+struct FP32Vec16;
+
+#ifdef __AVX512FP16__
+struct FP16Vec8 : public Vec<FP16Vec8> {
+  constexpr static int VEC_ELEM_NUM = 8;
+
+  __m128h reg;
+
+  explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {}
+
+  explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {}
+
+  explicit FP16Vec8(__m128h data) : reg(data) {}
+
+  FP16Vec8 operator*(const FP16Vec8 &b) const {
+    return FP16Vec8(_mm_mul_ph(reg, b.reg));
+  }
+
+  FP16Vec8 operator+(const FP16Vec8 &b) const {
+    return FP16Vec8(_mm_add_ph(reg, b.reg));
+  }
+
+  FP16Vec8 operator-(const FP16Vec8 &b) const {
+    return FP16Vec8(_mm_sub_ph(reg, b.reg));
+  }
+
+  FP16Vec8 operator/(const FP16Vec8 &b) const {
+    return FP16Vec8(_mm_div_ph(reg, b.reg));
+  }
+
+  void save(void *ptr) const { _mm_storeu_ph(ptr, reg); }
+};
+#endif
+
+struct BF16Vec8 : public Vec<BF16Vec8> {
+  constexpr static int VEC_ELEM_NUM = 8;
+
+  __m128i reg;
+
+  explicit BF16Vec8(const void *ptr)
+      : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {}
+
+  explicit BF16Vec8(const FP32Vec8 &);
+
+  void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; }
+};
+
+struct BF16Vec16 : public Vec<BF16Vec16> {
+  constexpr static int VEC_ELEM_NUM = 16;
+
+  __m256i reg;
+
+  explicit BF16Vec16(const void *ptr)
+      : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {}
+
+  explicit BF16Vec16(const FP32Vec16 &);
+
+  void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
+};
+
+#ifdef __AVX512F__
+struct BF16Vec32 : public Vec<BF16Vec32> {
+  constexpr static int VEC_ELEM_NUM = 32;
+
+  __m512i reg;
+
+  explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}
+
+  explicit BF16Vec32(__m512i data) : reg(data) {}
+
+  explicit BF16Vec32(BF16Vec8 &vec8_data)
+      : reg((__m512i)_mm512_inserti32x4(
+            _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
+                                                      (__m128i)vec8_data.reg),
+                                                  (__m128i)vec8_data.reg, 1),
+                               (__m128i)vec8_data.reg, 2),
+            (__m128i)vec8_data.reg, 3)) {}
+
+  void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; }
+};
+#else
+struct BF16Vec32 : public Vec<BF16Vec32> {
+  constexpr static int VEC_ELEM_NUM = 32;
+
+  __m256i reg_low;
+  __m256i reg_high;
+
+  explicit BF16Vec32(const void *ptr)
+      : reg_low(_mm256_loadu_si256((__m256i const *)ptr)),
+        reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {}
+
+  explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low),
+                                                  reg_high(high) {}
+
+  explicit BF16Vec32(BF16Vec8 &vec8_data)
+      : reg_low((__m256i)_mm256_inserti32x4(
+                _mm256_castsi128_si256((__m128i)vec8_data.reg),
+                                       (__m128i)vec8_data.reg, 1)),
+        reg_high((__m256i)_mm256_inserti32x4(
+                _mm256_castsi128_si256((__m128i)vec8_data.reg),
+                                       (__m128i)vec8_data.reg, 1)) {}
+
+  void save(void *ptr) const {
+    *reinterpret_cast<__m256i *>(ptr) = reg_low;
+    *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high;
+  }
+};
+#endif
+
+struct FP32Vec4 : public Vec<FP32Vec4> {
+  constexpr static int VEC_ELEM_NUM = 4;
+  union AliasReg {
+    __m128 reg;
+    float values[VEC_ELEM_NUM];
+  };
+
+  __m128 reg;
+
+  explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {}
+
+  explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {}
+
+  explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {}
+
+  explicit FP32Vec4(__m128 data) : reg(data) {}
+
+  explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}
+};
+
+struct FP32Vec8 : public Vec<FP32Vec8> {
+  constexpr static int VEC_ELEM_NUM = 8;
+  union AliasReg {
+    __m256 reg;
+    float values[VEC_ELEM_NUM];
+  };
+
+  __m256 reg;
+
+  explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {}
+
+  explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {}
+
+  explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {}
+
+  explicit FP32Vec8(__m256 data) : reg(data) {}
+
+  explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}
+
+#ifdef __AVX512FP16__
+  explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {}
+#endif
+
+  explicit FP32Vec8(const BF16Vec8 &v)
+      : reg(_mm256_castsi256_ps(
+            _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {}
+
+  float reduce_sum() const {
+    AliasReg ar;
+    ar.reg = reg;
+    float result = 0;
+    unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; });
+
+    return result;
+  }
+
+  FP32Vec8 exp() const {
+    AliasReg ar;
+    ar.reg = reg;
+    return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]),
+                                  expf(ar.values[5]), expf(ar.values[4]),
+                                  expf(ar.values[3]), expf(ar.values[2]),
+                                  expf(ar.values[1]), expf(ar.values[0])));
+  }
+
+  FP32Vec8 tanh() const {
+    AliasReg ar;
+    ar.reg = reg;
+    return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]),
+                                  tanhf(ar.values[5]), tanhf(ar.values[4]),
+                                  tanhf(ar.values[3]), tanhf(ar.values[2]),
+                                  tanhf(ar.values[1]), tanhf(ar.values[0])));
+  }
+
+  FP32Vec8 er() const {
+    AliasReg ar;
+    ar.reg = reg;
+    return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]),
+                                  erf(ar.values[5]), erf(ar.values[4]),
+                                  erf(ar.values[3]), erf(ar.values[2]),
+                                  erf(ar.values[1]), erf(ar.values[0])));
+  }
+
+  FP32Vec8 operator*(const FP32Vec8 &b) const {
+    return FP32Vec8(_mm256_mul_ps(reg, b.reg));
+  }
+
+  FP32Vec8 operator+(const FP32Vec8 &b) const {
+    return FP32Vec8(_mm256_add_ps(reg, b.reg));
+  }
+
+  FP32Vec8 operator-(const FP32Vec8 &b) const {
+    return FP32Vec8(_mm256_sub_ps(reg, b.reg));
+  }
+
+  FP32Vec8 operator/(const FP32Vec8 &b) const {
+    return FP32Vec8(_mm256_div_ps(reg, b.reg));
+  }
+
+  void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
+};
+
+#ifdef __AVX512F__
+struct FP32Vec16 : public Vec<FP32Vec16> {
+  constexpr static int VEC_ELEM_NUM = 16;
+  union AliasReg {
+    __m512 reg;
+    float values[VEC_ELEM_NUM];
+  };
+
+  __m512 reg;
+
+  explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {}
+
+  explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}
+
+  explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {}
+
+  explicit FP32Vec16(__m512 data) : reg(data) {}
+
+  explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}
+
+  explicit FP32Vec16(const FP32Vec4 &data)
+      : reg((__m512)_mm512_inserti32x4(
+            _mm512_inserti32x4(
+                _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg),
+                                   (__m128i)data.reg, 1),
+                (__m128i)data.reg, 2),
+            (__m128i)data.reg, 3)) {}
+
+  explicit FP32Vec16(const FP32Vec8 &data)
+      : reg((__m512)_mm512_inserti32x8(
+            _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {}
+
+  explicit FP32Vec16(const BF16Vec16 &v)
+      : reg(_mm512_castsi512_ps(
+            _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {}
+
+  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
+
+  FP32Vec16 operator*(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm512_mul_ps(reg, b.reg));
+  }
+
+  FP32Vec16 operator+(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm512_add_ps(reg, b.reg));
+  }
+
+  FP32Vec16 operator-(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm512_sub_ps(reg, b.reg));
+  }
+
+  FP32Vec16 operator/(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm512_div_ps(reg, b.reg));
+  }
+
+  float reduce_sum() const { return _mm512_reduce_add_ps(reg); }
+
+  template <int group_size> float reduce_sub_sum(int idx) {
+    static_assert(VEC_ELEM_NUM % group_size == 0);
+    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
+    __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size));
+    return _mm512_mask_reduce_add_ps(mask, reg);
+  }
+
+  void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
+};
+#else
+struct FP32Vec16 : public Vec<FP32Vec16> {
+  constexpr static int VEC_ELEM_NUM = 16;
+
+  union AliasReg {
+    __m256 reg;
+    float values[8];
+  };
+
+  __m256 reg_low;
+  __m256 reg_high;
+
+  explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)),
+                                reg_high(_mm256_set1_ps(v)) {}
+
+  explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)),
+                         reg_high(_mm256_set1_ps(0.0)) {}
+
+  explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)),
+                                         reg_high(_mm256_loadu_ps(ptr + 8)) {}
+
+  explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {}
+
+  explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low),
+                                              reg_high(data.reg_high) {}
+
+  explicit FP32Vec16(const FP32Vec4 &data)
+      : reg_low((__m256)_mm256_inserti128_si256(
+                _mm256_castsi128_si256((__m128i)data.reg),
+                                       (__m128i)data.reg, 1)),
+        reg_high((__m256)_mm256_inserti128_si256(
+                 _mm256_castsi128_si256((__m128i)data.reg),
+                                       (__m128i)data.reg, 1)) {}
+
+  explicit FP32Vec16(const FP32Vec8 &data)
+      : reg_low(data.reg), reg_high(data.reg) {}
+
+  explicit FP32Vec16(const BF16Vec16 &v) {
+    __m128i low = _mm256_extractf128_si256(v.reg, 0);
+    __m128i high = _mm256_extractf128_si256(v.reg, 1);
+
+    __m256i v_low_epi32 = _mm256_cvtepu16_epi32(low);
+    __m256i v_high_epi32 = _mm256_cvtepu16_epi32(high);
+
+    __m256i v_low_shifted = _mm256_bslli_epi128(v_low_epi32, 2);
+    __m256i v_high_shifted = _mm256_bslli_epi128(v_high_epi32, 2);
+
+    reg_low = _mm256_castsi256_ps(v_low_shifted);
+    reg_high = _mm256_castsi256_ps(v_high_shifted);
+  }
+
+  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
+
+  FP32Vec16 operator*(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low),
+                     _mm256_mul_ps(reg_high, b.reg_high));
+  }
+
+  FP32Vec16 operator+(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low),
+                     _mm256_add_ps(reg_high, b.reg_high));
+  }
+
+  FP32Vec16 operator-(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low),
+                     _mm256_sub_ps(reg_high, b.reg_high));
+  }
+
+  FP32Vec16 operator/(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low),
+                     _mm256_div_ps(reg_high, b.reg_high));
+  }
+
+  float reduce_sum() const {
+    FP32Vec8 low = FP32Vec8(reg_low);
+    FP32Vec8 high = FP32Vec8(reg_high);
+    return low.reduce_sum() + high.reduce_sum();
+  }
+
+  template <int group_size> float reduce_sub_sum(int idx) {
+    float sum = 0.0;
+    static_assert(VEC_ELEM_NUM % group_size == 0);
+    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
+    uint32_t mask = base_mask << (idx * group_size);
+
+    AliasReg ar;
+
+    auto func = [&sum, &mask, &ar](int i) {
+      int flag = mask & 0x1;
+      mask = mask >> 1;
+      if (flag != 0) sum += ar.values[i];
+    };
+
+    ar.reg = reg_low;
+    unroll_loop<int, 8>(func);
+
+    ar.reg = reg_high;
+    unroll_loop<int, 8>(func);
+
+    return sum;
+  }
+
+  void save(float *ptr) const {
+    _mm256_storeu_ps(ptr, reg_low);
+    _mm256_storeu_ps(ptr + 8, reg_high);
+  }
+};
+#endif
+
+template <typename T> struct VecType { using vec_type = void; };
+
+template <typename T> using vec_t = typename VecType<T>::vec_type;
+
+template <> struct VecType<float> { using vec_type = FP32Vec8; };
+
+#ifdef __AVX512FP16__
+template <> struct VecType<c10::Half> { using vec_type = FP16Vec16; };
+#endif
+
+template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
+
+template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
+
+#ifdef __AVX512FP16__
+template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
+  *reinterpret_cast<_Float16 *>(ptr) = v;
+}
+#endif
+
+inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
+  acc = acc + a * b;
+}
+
+#ifdef __AVX512BF16__
+template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
+  *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v);
+}
+
+inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
+    : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {}
+
+inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
+    : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {}
+
+inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
+  acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg);
+}
+#else
+template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
+  c10::BFloat16 __attribute__((__may_alias__)) *v_ptr =
+      reinterpret_cast<c10::BFloat16 *>(&v);
+  *ptr = *(v_ptr + 1);
+}
+
+#ifdef __AVX512F__
+inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
+    : reg(_mm256_cvtepi32_epi16(
+          _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {}
+
+inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
+    : reg(_mm512_cvtepi32_epi16(
+          _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {}
+#else
+namespace{
+__m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) {
+  __m256i ai = _mm256_castps_si256(a);
+  ai = _mm256_srli_epi32(ai, 16);
+  ai = _mm256_packus_epi32(ai, ai);
+  ai = _mm256_permute4x64_epi64(ai, 0b00111001);
+  return _mm256_extracti128_si256(ai, 0);
+}
+}
+
+inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
+    : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {}
+
+inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) {
+  BF16Vec8 low = BF16Vec8(FP32Vec8(v.reg_low));
+  BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high));
+  reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1);
+}
+#endif // __AVX512F__
+#endif // __AVX512BF16__
+
+inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); }
+
+}; // namespace vec_op
+
+#endif
diff --git a/csrc/ops.h b/csrc/ops.h
index ae04150ea..8a92afdc8 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <optional>
 #include <torch/library.h>
 
 void paged_attention_v1(
diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index 21acee91d..754070df2 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -2,6 +2,6 @@
 -r requirements-common.txt
 
 # Dependencies for x86_64 CPUs
-torch == 2.3.1+cpu
-torchvision == 0.18.1+cpu   # required for the image processor of phi3v, this must be updated alongside torch
-triton >= 2.2.0  # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
+torch == 2.3.1+cpu; platform_machine != "ppc64le"
+torchvision == 0.18.1+cpu; platform_machine != "ppc64le"   # required for the image processor of phi3v, this must be updated alongside torch
+triton >= 2.2.0  # FIXME(woosuk): This is a hack to avoid import error.
-- 
GitLab


From 294104c3f9ab471d2d571b6a6eda266af3824d6a Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 26 Jun 2024 14:57:12 -0700
Subject: [PATCH 163/376] [doc] update usage of env var to avoid conflict
 (#5873)

---
 docs/source/serving/env_vars.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/source/serving/env_vars.rst b/docs/source/serving/env_vars.rst
index 0ce1374a3..c665b60b4 100644
--- a/docs/source/serving/env_vars.rst
+++ b/docs/source/serving/env_vars.rst
@@ -3,6 +3,9 @@ Environment Variables
 
 vLLM uses the following environment variables to configure the system:
 
+.. warning::
+    Please note that ``VLLM_PORT`` and ``VLLM_HOST_IP`` set the port and ip for vLLM's **internal usage**. It is not the port and ip for the API server. If you use ``--host $VLLM_HOST_IP`` and ``--port $VLLM_PORT`` to start the API server, it will not work.
+
 .. literalinclude:: ../../../vllm/envs.py
     :language: python
     :start-after: begin-env-vars-definition
-- 
GitLab


From b9e84259e9083fe6a157ac0cf4834d89afcbaa6a Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Wed, 26 Jun 2024 17:57:16 -0700
Subject: [PATCH 164/376] [Misc] Add example for LLaVA-NeXT (#5879)

---
 examples/llava_next_example.py | 38 ++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 examples/llava_next_example.py

diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py
new file mode 100644
index 000000000..e90a86abe
--- /dev/null
+++ b/examples/llava_next_example.py
@@ -0,0 +1,38 @@
+from io import BytesIO
+
+import requests
+from PIL import Image
+
+from vllm import LLM, SamplingParams
+from vllm.multimodal.image import ImagePixelData
+
+# Dynamic image input is currently not supported and therefore
+# a fixed image input shape and its corresponding feature size is required.
+# See https://github.com/vllm-project/vllm/pull/4199 for the complete
+# configuration matrix.
+
+llm = LLM(
+    model="llava-hf/llava-v1.6-mistral-7b-hf",
+    image_input_type="pixel_values",
+    image_token_id=32000,
+    image_input_shape="1,3,336,336",
+    image_feature_size=1176,
+)
+
+prompt = "[INST] " + "<image>" * 1176 + "\nWhat is shown in this image? [/INST]"
+url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
+image = Image.open(BytesIO(requests.get(url).content))
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=100)
+
+outputs = llm.generate(
+    {
+        "prompt": prompt,
+        "multi_modal_data": ImagePixelData(image),
+    },
+    sampling_params=sampling_params)
+
+generated_text = ""
+for o in outputs:
+    generated_text += o.outputs[0].text
+
+print(f"LLM output:{generated_text}")
-- 
GitLab


From 2110557dabe8a18b811116c1ae9fdf75fbe27df6 Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Wed, 26 Jun 2024 21:12:10 -0700
Subject: [PATCH 165/376] [BugFix] Fix cuda graph for MLPSpeculator (#5875)

Co-authored-by: Abhinav Goyal <abhinav.goyal@flipkart.com>
---
 examples/offline_inference_mlpspeculator.py | 1 -
 vllm/worker/model_runner.py                 | 9 ++++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/examples/offline_inference_mlpspeculator.py b/examples/offline_inference_mlpspeculator.py
index 5448ec1f6..5dec4a76a 100644
--- a/examples/offline_inference_mlpspeculator.py
+++ b/examples/offline_inference_mlpspeculator.py
@@ -52,7 +52,6 @@ if __name__ == "__main__":
         speculative_model="ibm-fms/llama-13b-accelerator",
         # These are currently required for MLPSpeculator decoding
         use_v2_block_manager=True,
-        enforce_eager=True,
     )
 
     print("With speculation")
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 9fdb2ea5d..ac820bbcb 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1020,10 +1020,13 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
 
         if self.return_hidden_states:
             # we only need to pass hidden states of most recent token
+            assert model_input.sampling_metadata is not None
+            indices = model_input.sampling_metadata.selected_token_indices
             if model_input.is_prompt:
-                assert model_input.sampling_metadata is not None
-                hidden_states = hidden_states.index_select(
-                    0, model_input.sampling_metadata.selected_token_indices)
+                hidden_states = hidden_states.index_select(0, indices)
+            elif decode_meta.use_cuda_graph:
+                hidden_states = hidden_states[:len(indices)]
+
             output.hidden_states = hidden_states
 
         return output
-- 
GitLab


From 6eabc6cb0e022a3f57fbf115ce53dc8bcba66840 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 27 Jun 2024 14:20:01 +0800
Subject: [PATCH 166/376] [Doc] Add note about context length in Phi-3-Vision
 example (#5887)

---
 examples/phi3v_example.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py
index 4f37c47dd..c068b9a98 100644
--- a/examples/phi3v_example.py
+++ b/examples/phi3v_example.py
@@ -9,6 +9,9 @@ from vllm.multimodal.image import ImagePixelData
 
 def run_phi3v():
     model_path = "microsoft/Phi-3-vision-128k-instruct"
+
+    # Note: The model has 128k context length by default which may cause OOM
+    # If that's the case, override `max_model_len` with a smaller value via args
     llm = LLM(
         model=model_path,
         trust_remote_code=True,
@@ -16,7 +19,6 @@ def run_phi3v():
         image_token_id=32044,
         image_input_shape="1,3,1008,1344",
         image_feature_size=1921,
-        disable_image_processor=False,
     )
 
     image = Image.open("images/cherry_blossom.jpg")
-- 
GitLab


From d12af207d24fda5ba6cea08caa1073f9d4413938 Mon Sep 17 00:00:00 2001
From: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com>
Date: Thu, 27 Jun 2024 00:15:24 -0700
Subject: [PATCH 167/376] [VLM][Bugfix] Make sure that `multi_modal_kwargs` is
 broadcasted properly (#5880)

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
---
 .buildkite/test-pipeline.yaml            |  4 +-
 tests/distributed/test_parallel_state.py | 49 ++++++++++++++++++++++++
 vllm/distributed/parallel_state.py       | 37 ++++++++++++++----
 3 files changed, 81 insertions(+), 9 deletions(-)
 create mode 100644 tests/distributed/test_parallel_state.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 10cfe35d8..fa37d0c75 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -27,7 +27,9 @@ steps:
 
 - label: Core Test
   mirror_hardwares: [amd]
-  command: pytest -v -s core
+  commands: 
+  - pytest -v -s core
+  - pytest -v -s distributed/test_parallel_state.py
 
 - label: Distributed Comm Ops Test
   #mirror_hardwares: [amd]
diff --git a/tests/distributed/test_parallel_state.py b/tests/distributed/test_parallel_state.py
new file mode 100644
index 000000000..5d293b2c1
--- /dev/null
+++ b/tests/distributed/test_parallel_state.py
@@ -0,0 +1,49 @@
+from typing import Any, Dict
+
+import torch
+
+from vllm.distributed.parallel_state import (_split_tensor_dict,
+                                             _update_nested_dict)
+
+
+def test_split_tensor_dict():
+    test_dict = {
+        "key_a": "a",
+        "key_b": torch.arange(8, dtype=torch.float32),
+        "key_c": {
+            "key_1": torch.arange(5, dtype=torch.float32),
+            "key_2": torch.tensor([], dtype=torch.float32),
+            "key_3": 123,
+        },
+        "key_d": {},
+    }
+    metadata_list, tensor_list = _split_tensor_dict(test_dict)
+    assert len(metadata_list) == 6
+    assert torch.allclose(tensor_list[0], test_dict["key_b"])
+    assert torch.allclose(tensor_list[1], test_dict["key_c"]["key_1"])
+    assert torch.allclose(tensor_list[2], test_dict["key_c"]["key_2"])
+
+
+def test_update_nested_dict():
+    flattened_keys_values = [("key1%key2%key3", "value1"),
+                             ("key1%key2%key4", "value2"),
+                             ("key1%key5", "value3"), ("key6%key7", "value4"),
+                             ("key8", "value5")]
+    res: Dict[str, Any] = {}
+
+    # Update the nested dictionary with each flattened key-value pair
+    for flat_key, value in flattened_keys_values:
+        _update_nested_dict(res, flat_key, value)
+    assert res == {
+        "key1": {
+            "key2": {
+                "key3": "value1",
+                "key4": "value2"
+            },
+            "key5": "value3"
+        },
+        "key6": {
+            "key7": "value4"
+        },
+        "key8": "value5"
+    }
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index a7a806b05..1f6b05e86 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -45,14 +45,17 @@ TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])
 
 
 def _split_tensor_dict(
-    tensor_dict: Dict[Any, Union[torch.Tensor, Any]]
-) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]:
+        tensor_dict: Dict[Any, Union[torch.Tensor, Any]],
+        prefix: str = "") -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]:
     """Split the tensor dictionary into two parts:
     1. A list of (key, value) pairs. If the value is a tensor, it is replaced
          by its metadata.
     2. A list of tensors.
+
+    If the Tensor is nested under `tensor_dict["key1"]["key2"]`, the key of its
+    metadata will be "key1%key2".
     """
-    metadata_list = []
+    metadata_list: List[Tuple[str, Any]] = []
     tensor_list = []
     for key, value in tensor_dict.items():
         if isinstance(value, torch.Tensor):
@@ -62,13 +65,31 @@ def _split_tensor_dict(
             # receiving side will set the device index.
             device = value.device.type
             metadata_list.append(
-                (key, TensorMetadata(device, value.dtype, value.size())))
+                (prefix + key, TensorMetadata(device, value.dtype,
+                                              value.size())))
             tensor_list.append(value)
+        elif isinstance(value, dict):
+            if len(value) == 0:
+                metadata_list.append((prefix + key, value))
+            inner_metadata_list, inner_tensor_list = _split_tensor_dict(
+                value, prefix + key + "%")
+            metadata_list.extend(inner_metadata_list)
+            tensor_list.extend(inner_tensor_list)
         else:
-            metadata_list.append((key, value))
+            metadata_list.append((prefix + key, value))
     return metadata_list, tensor_list
 
 
+def _update_nested_dict(nested_dict, flattened_key, value):
+    key_splits = flattened_key.split("%")
+    cur_dict = nested_dict
+    for k in key_splits[:-1]:
+        if k not in cur_dict:
+            cur_dict[k] = {}
+        cur_dict = cur_dict[k]
+    cur_dict[key_splits[-1]] = value
+
+
 class GroupCoordinator:
     """
     PyTorch ProcessGroup wrapper for a group of processes.
@@ -512,7 +533,7 @@ class GroupCoordinator:
                                          device=value.device)
                     if tensor.numel() == 0:
                         # Skip broadcasting empty tensors.
-                        tensor_dict[key] = tensor
+                        _update_nested_dict(tensor_dict, key, tensor)
                         continue
                     if tensor.is_cpu:
                         # use metadata_group for CPU tensors
@@ -528,9 +549,9 @@ class GroupCoordinator:
                                                              group=group,
                                                              async_op=True)
                     async_handles.append(handle)
-                    tensor_dict[key] = tensor
+                    _update_nested_dict(tensor_dict, key, tensor)
                 else:
-                    tensor_dict[key] = value
+                    _update_nested_dict(tensor_dict, key, value)
             for async_handle in async_handles:
                 async_handle.wait()
         return tensor_dict
-- 
GitLab


From 96354d6a2967a63eb5c0e56a2da2ead512ff1067 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 27 Jun 2024 16:03:04 +0800
Subject: [PATCH 168/376] [Model] Add base class for LoRA-supported models
 (#5018)

---
 docs/source/models/lora.rst                |   3 +
 vllm/lora/lora.py                          |   3 +-
 vllm/lora/models.py                        |   6 +-
 vllm/model_executor/model_loader/loader.py |  20 ++--
 vllm/model_executor/models/baichuan.py     |  11 +-
 vllm/model_executor/models/chatglm.py      |  11 +-
 vllm/model_executor/models/decilm.py       |   4 +-
 vllm/model_executor/models/gemma.py        |  10 +-
 vllm/model_executor/models/gpt_bigcode.py  |   9 +-
 vllm/model_executor/models/interfaces.py   | 130 +++++++++++++++++++++
 vllm/model_executor/models/llama.py        |   9 +-
 vllm/model_executor/models/llava.py        |  22 ++--
 vllm/model_executor/models/llava_next.py   |  20 ++--
 vllm/model_executor/models/minicpm.py      |  12 +-
 vllm/model_executor/models/mixtral.py      |   9 +-
 vllm/model_executor/models/phi.py          |  22 ++--
 vllm/model_executor/models/qwen2.py        |  10 +-
 vllm/model_executor/models/vlm_base.py     |  12 --
 vllm/model_executor/models/xverse.py       |  11 +-
 vllm/worker/model_runner.py                |  11 +-
 20 files changed, 270 insertions(+), 75 deletions(-)
 create mode 100644 vllm/model_executor/models/interfaces.py
 delete mode 100644 vllm/model_executor/models/vlm_base.py

diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst
index 227864048..934887a60 100644
--- a/docs/source/models/lora.rst
+++ b/docs/source/models/lora.rst
@@ -4,6 +4,9 @@ Using LoRA adapters
 ===================
 
 This document shows you how to use `LoRA adapters <https://arxiv.org/abs/2106.09685>`_ with vLLM on top of a base model.
+
+LoRA adapters can be used with any vLLM model that implements :class:`~vllm.model_executor.models.interfaces.SupportsLoRA`.
+
 Adapters can be efficiently served on a per request basis with minimal overhead. First we download the adapter(s) and save
 them locally with
 
diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py
index 8f3c7f769..14081b5ba 100644
--- a/vllm/lora/lora.py
+++ b/vllm/lora/lora.py
@@ -2,6 +2,7 @@ from typing import List, Optional
 from typing import Sequence as GenericSequence
 
 import torch
+import torch.types
 
 from vllm.utils import is_pin_memory_available
 
@@ -64,7 +65,7 @@ class LoRALayerWeights:
             output_dim: int,
             rank: int,
             dtype: torch.dtype,
-            device: torch.device,
+            device: torch.types.Device,
             embeddings_tensor_dim: Optional[int] = None) -> "LoRALayerWeights":
         pin_memory = str(device) == "cpu" and is_pin_memory_available()
         lora_a = torch.zeros([input_dim, rank],
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index afb9ba455..0a1fc7c02 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -18,6 +18,7 @@ from vllm.lora.layers import (BaseLayerWithLoRA,
 from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
 from vllm.lora.utils import (from_layer, from_layer_logits_processor,
                              parse_fine_tuned_lora_name, replace_submodule)
+from vllm.model_executor.models.interfaces import SupportsLoRA
 from vllm.utils import LRUCache, is_pin_memory_available
 
 logger = init_logger(__name__)
@@ -363,7 +364,7 @@ class LoRAModelManager:
 
     def __init__(
         self,
-        model: nn.Module,
+        model: SupportsLoRA,
         max_num_seqs: int,
         max_num_batched_tokens: int,
         vocab_size: int,
@@ -411,7 +412,7 @@ class LoRAModelManager:
         # embeddings_indices
         self.indices_len: List[Optional[int]] = [None] * 4
 
-        self.model: nn.Module = model
+        self.model = model
         if hasattr(self.model, "supported_lora_modules"):
             self.supported_lora_modules = copy.deepcopy(
                 self.model.supported_lora_modules)
@@ -428,7 +429,6 @@ class LoRAModelManager:
         self._active_loras: Dict[int, None] = {}
         self._last_mapping: Optional[LoRAMapping] = None
         self._create_lora_modules()
-        self.model.lora_manager = self
 
     @property
     def capacity(self) -> int:
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index d3babcf9c..e91bf7cf3 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -32,7 +32,8 @@ from vllm.model_executor.model_loader.weight_utils import (
     filter_duplicate_safetensors_files, filter_files_not_needed_for_inference,
     get_quant_config, initialize_dummy_weights, np_cache_weights_iterator,
     pt_weights_iterator, safetensors_weights_iterator)
-from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
+from vllm.model_executor.models.interfaces import (supports_lora,
+                                                   supports_vision)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.utils import is_tpu
 
@@ -64,12 +65,15 @@ def _get_quantization_config(
 
 
 def _get_model_initialization_kwargs(
-        model_class: Type[nn.Module], lora_config: Optional[LoRAConfig],
-        vision_language_config: Optional[VisionLanguageConfig]
+    model_class: Type[nn.Module],
+    lora_config: Optional[LoRAConfig],
+    vlm_config: Optional[VisionLanguageConfig],
 ) -> Dict[str, Any]:
     """Get extra kwargs for model initialization."""
     extra_kwargs: Dict[str, Any] = {}
-    if hasattr(model_class, "supported_lora_modules"):
+
+    if supports_lora(model_class):
+        # lora_config=None is used to disable LoRA
         extra_kwargs["lora_config"] = lora_config
     elif lora_config:
         raise ValueError(
@@ -77,13 +81,15 @@ def _get_model_initialization_kwargs(
             "but LoRA is enabled. Support for this model may "
             "be added in the future. If this is important to you, "
             "please open an issue on github.")
-    elif issubclass(model_class, VisionLanguageModelBase):
-        if vision_language_config is None:
+
+    if supports_vision(model_class):
+        if vlm_config is None:
             raise ValueError("Provide `image_input_type` and other vision "
                              "related configurations through LLM entrypoint "
                              "or engine arguments.")
 
-        extra_kwargs["vision_language_config"] = vision_language_config
+        extra_kwargs["vlm_config"] = vlm_config
+
     return extra_kwargs
 
 
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index babb92e7c..abaefa3cf 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -45,6 +45,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import SamplerOutput
 
+from .interfaces import SupportsLoRA
+
 
 def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
     closest_power_of_2 = 2**math.floor(math.log2(total_num_heads))
@@ -292,7 +294,9 @@ class BaiChuanModel(nn.Module):
         return hidden_states
 
 
-class BaiChuanBaseForCausalLM(nn.Module):
+class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA):
+    supports_lora = True
+
     packed_modules_mapping = {
         "W_pack": ["W_pack"],
         "gate_up_proj": [
@@ -312,14 +316,17 @@ class BaiChuanBaseForCausalLM(nn.Module):
 
     def __init__(
         self,
-        config,
+        config: PretrainedConfig,
         position_embedding: str,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         lora_config: Optional[LoRAConfig] = None,
     ):
         super().__init__()
+
         self.config = config
+        self.lora_config = lora_config
+
         self.quant_config = quant_config
         self.model = BaiChuanModel(config, position_embedding, cache_config,
                                    quant_config)
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index e3a5e43e2..bf64538ef 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -28,6 +28,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import SamplerOutput
 from vllm.transformers_utils.configs import ChatGLMConfig
 
+from .interfaces import SupportsLoRA
+
 
 class GLMAttention(nn.Module):
 
@@ -322,7 +324,9 @@ class ChatGLMModel(nn.Module):
         return hidden_states
 
 
-class ChatGLMForCausalLM(nn.Module):
+class ChatGLMForCausalLM(nn.Module, SupportsLoRA):
+    supports_lora = True
+
     packed_modules_mapping = {
         "query_key_value": ["query_key_value"],
         "dense_h_to_4h": ["dense_h_to_4h"]
@@ -345,7 +349,10 @@ class ChatGLMForCausalLM(nn.Module):
         lora_config: Optional[LoRAConfig] = None,
     ):
         super().__init__()
-        self.config: ChatGLMConfig = config
+
+        self.config = config
+        self.lora_config = lora_config
+
         self.quant_config = quant_config
         self.max_position_embeddings = getattr(config, "max_sequence_length",
                                                8192)
diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py
index e293ee491..65b409a2a 100644
--- a/vllm/model_executor/models/decilm.py
+++ b/vllm/model_executor/models/decilm.py
@@ -26,7 +26,7 @@
 from typing import Iterable, Optional, Tuple
 
 import torch
-from transformers import PretrainedConfig
+from transformers import LlamaConfig
 
 from vllm.config import CacheConfig, LoRAConfig
 from vllm.model_executor.layers.quantization.base_config import (
@@ -55,7 +55,7 @@ class DeciLMForCausalLM(LlamaForCausalLM):
 
     def __init__(
         self,
-        config: Optional[PretrainedConfig] = None,
+        config: LlamaConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         lora_config: Optional[LoRAConfig] = None,
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 65f4ebec5..9e071a155 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -41,6 +41,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import SamplerOutput
 
+from .interfaces import SupportsLoRA
+
 logger = init_logger(__name__)
 
 
@@ -288,7 +290,9 @@ class GemmaModel(nn.Module):
         return hidden_states
 
 
-class GemmaForCausalLM(nn.Module):
+class GemmaForCausalLM(nn.Module, SupportsLoRA):
+    supports_lora = True
+
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -319,9 +323,11 @@ class GemmaForCausalLM(nn.Module):
         quant_config: Optional[QuantizationConfig] = None,
         lora_config: Optional[LoRAConfig] = None,
     ) -> None:
-        del lora_config  # Unused.
         super().__init__()
+
         self.config = config
+        self.lora_config = lora_config
+
         self.quant_config = quant_config
         self.model = GemmaModel(config, cache_config, quant_config)
         self.logits_processor = LogitsProcessor(config.vocab_size)
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index b15ed1198..009d7b149 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -41,6 +41,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import SamplerOutput
 
+from .interfaces import SupportsLoRA
+
 
 class GPTBigCodeAttention(nn.Module):
 
@@ -230,7 +232,9 @@ class GPTBigCodeModel(nn.Module):
         return hidden_states
 
 
-class GPTBigCodeForCausalLM(nn.Module):
+class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA):
+    supports_lora = True
+
     packed_modules_mapping = {"c_attn": ["c_attn"]}
 
     supported_lora_modules = ["c_fc", "c_proj", "wte", "lm_head", "c_attn"]
@@ -250,7 +254,10 @@ class GPTBigCodeForCausalLM(nn.Module):
         lora_config: Optional[LoRAConfig] = None,
     ):
         super().__init__()
+
         self.config = config
+        self.lora_config = lora_config
+
         self.quant_config = quant_config
         self.transformer = GPTBigCodeModel(config, cache_config, quant_config,
                                            lora_config)
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
new file mode 100644
index 000000000..a9eb397a5
--- /dev/null
+++ b/vllm/model_executor/models/interfaces.py
@@ -0,0 +1,130 @@
+from typing import (ClassVar, Dict, List, Literal, Optional, Protocol, Type,
+                    Union, overload, runtime_checkable)
+
+from typing_extensions import TypeGuard
+
+from vllm.config import LoRAConfig, VisionLanguageConfig
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+@runtime_checkable
+class SupportsVision(Protocol):
+    """The interface required for all vision language models (VLMs)."""
+
+    supports_vision: ClassVar[Literal[True]]
+
+    def __init__(self, *, vlm_config: VisionLanguageConfig) -> None:
+        ...
+
+
+# We can't use runtime_checkable with ClassVar for issubclass checks
+# so we need to treat the class as an instance and use isinstance instead
+@runtime_checkable
+class _SupportsVisionType(Protocol):
+    supports_vision: Literal[True]
+
+    def __call__(self, *, vlm_config: VisionLanguageConfig) -> None:
+        ...
+
+
+@overload
+def supports_vision(model: Type[object]) -> TypeGuard[Type[SupportsVision]]:
+    ...
+
+
+@overload
+def supports_vision(model: object) -> TypeGuard[SupportsVision]:
+    ...
+
+
+def supports_vision(
+    model: Union[Type[object], object],
+) -> Union[TypeGuard[Type[SupportsVision]], TypeGuard[SupportsVision]]:
+    if isinstance(model, type):
+        return isinstance(model, _SupportsVisionType)
+
+    return isinstance(model, SupportsVision)
+
+
+@runtime_checkable
+class SupportsLoRA(Protocol):
+    """The interface required for all models that support LoRA."""
+
+    supports_lora: ClassVar[Literal[True]]
+
+    packed_modules_mapping: ClassVar[Dict[str, List[str]]]
+    supported_lora_modules: ClassVar[List[str]]
+    embedding_modules: ClassVar[Dict[str, str]]
+    embedding_padding_modules: ClassVar[List[str]]
+
+    # lora_config is None when LoRA is not enabled
+    def __init__(self, *, lora_config: Optional[LoRAConfig] = None) -> None:
+        ...
+
+
+# We can't use runtime_checkable with ClassVar for issubclass checks
+# so we need to treat the class as an instance and use isinstance instead
+@runtime_checkable
+class _SupportsLoRAType(Protocol):
+    supports_lora: Literal[True]
+
+    packed_modules_mapping: Dict[str, List[str]]
+    supported_lora_modules: List[str]
+    embedding_modules: Dict[str, str]
+    embedding_padding_modules: List[str]
+
+    def __call__(self, *, lora_config: Optional[LoRAConfig] = None) -> None:
+        ...
+
+
+@overload
+def supports_lora(model: Type[object]) -> TypeGuard[Type[SupportsLoRA]]:
+    ...
+
+
+@overload
+def supports_lora(model: object) -> TypeGuard[SupportsLoRA]:
+    ...
+
+
+def supports_lora(
+    model: Union[Type[object], object],
+) -> Union[TypeGuard[Type[SupportsLoRA]], TypeGuard[SupportsLoRA]]:
+    result = _supports_lora(model)
+
+    if not result:
+        lora_attrs = (
+            "packed_modules_mapping",
+            "supported_lora_modules",
+            "embedding_modules",
+            "embedding_padding_modules",
+        )
+        missing_attrs = tuple(attr for attr in lora_attrs
+                              if not hasattr(model, attr))
+
+        if getattr(model, "supports_lora", False):
+            if missing_attrs:
+                logger.warning(
+                    "The model (%s) sets `supports_lora=True`, "
+                    "but is missing LoRA-specific attributes: %s",
+                    model,
+                    missing_attrs,
+                )
+        else:
+            if not missing_attrs:
+                logger.warning(
+                    "The model (%s) contains all LoRA-specific attributes, "
+                    "but does not set `supports_lora=True`.", model)
+
+    return result
+
+
+def _supports_lora(
+    model: Union[Type[object], object],
+) -> Union[TypeGuard[Type[SupportsLoRA]], TypeGuard[SupportsLoRA]]:
+    if isinstance(model, type):
+        return isinstance(model, _SupportsLoRAType)
+
+    return isinstance(model, SupportsLoRA)
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index d83ee9a20..f4918cbfe 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -49,6 +49,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import SamplerOutput
 from vllm.utils import is_hip, print_warning_once
 
+from .interfaces import SupportsLoRA
+
 
 class LlamaMLP(nn.Module):
 
@@ -296,7 +298,9 @@ class LlamaModel(nn.Module):
         return hidden_states
 
 
-class LlamaForCausalLM(nn.Module):
+class LlamaForCausalLM(nn.Module, SupportsLoRA):
+    supports_lora = True
+
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -336,7 +340,10 @@ class LlamaForCausalLM(nn.Module):
         lora_config: Optional[LoRAConfig] = None,
     ) -> None:
         super().__init__()
+
         self.config = config
+        self.lora_config = lora_config
+
         self.model = LlamaModel(config,
                                 cache_config,
                                 quant_config,
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 8e36c54b1..8e18b42b7 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -20,7 +20,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import get_dummy_image_data
 from vllm.sequence import SamplerOutput
 
-from .vlm_base import VisionLanguageModelBase
+from .interfaces import SupportsVision
 
 _KEYS_TO_MODIFY_MAPPING = {
     "language_model.lm_head": "lm_head",
@@ -86,18 +86,21 @@ LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageFeatureInputs]
 @MULTIMODAL_REGISTRY.register_image_feature_input()
 @MULTIMODAL_REGISTRY.register_image_pixel_input()
 @MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
-class LlavaForConditionalGeneration(VisionLanguageModelBase):
+class LlavaForConditionalGeneration(nn.Module, SupportsVision):
+
+    supports_vision = True
 
     def __init__(self,
                  config: LlavaConfig,
-                 vision_language_config: VisionLanguageConfig,
+                 vlm_config: VisionLanguageConfig,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None) -> None:
-        super().__init__(vision_language_config)
+        super().__init__()
 
         self.config = config
+        self.vlm_config = vlm_config
 
-        if self.vision_language_config.image_input_type == (
+        if self.vlm_config.image_input_type == (
                 VisionLanguageConfig.ImageInputType.PIXEL_VALUES):
             self.vision_tower = CLIPVisionModel(config.vision_config)
         else:
@@ -122,11 +125,10 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
         self.sampler = Sampler()
 
     def _validate_image_data(self, data: torch.Tensor) -> torch.Tensor:
-        if list(data.shape[1:]) != list(
-                self.vision_language_config.image_input_shape[1:]):
+        if list(data.shape[1:]) != list(self.vlm_config.image_input_shape[1:]):
             raise ValueError(
                 f"The expected image tensor shape is batch dimension plus "
-                f"{self.vision_language_config.image_input_shape[1:]}. "
+                f"{self.vlm_config.image_input_shape[1:]}. "
                 f"You supplied {data.shape}. "
                 f"If you are using vLLM's entrypoint, make sure your "
                 f"supplied image input is consistent with "
@@ -139,7 +141,7 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
         pixel_values = kwargs.pop("pixel_values", None)
         image_features = kwargs.pop("image_features", None)
 
-        expected_input_type = self.vision_language_config.image_input_type
+        expected_input_type = self.vlm_config.image_input_type
         ImageInputType = VisionLanguageConfig.ImageInputType
 
         if expected_input_type == ImageInputType.PIXEL_VALUES:
@@ -273,7 +275,7 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
 
             inputs_embeds = merge_vision_embeddings(
                 input_ids, inputs_embeds, vision_embeddings,
-                self.vision_language_config.image_token_id)
+                self.vlm_config.image_token_id)
 
             input_ids = None
         else:
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index c1158c933..5c03fb370 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -25,8 +25,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
 from vllm.multimodal.image import ImagePixelData, get_dummy_image_data
 from vllm.sequence import SamplerOutput, SequenceData
 
+from .interfaces import SupportsVision
 from .llava import LlavaMultiModalProjector, merge_vision_embeddings
-from .vlm_base import VisionLanguageModelBase
 
 logger = init_logger(__name__)
 
@@ -106,19 +106,21 @@ def _image_pixel_processor(
 
 @MULTIMODAL_REGISTRY.register_image_pixel_input(_image_pixel_processor)
 @MULTIMODAL_REGISTRY.register_dummy_data(_get_dummy_image_data)
-class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
+class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
+
+    supports_vision = True
 
     def __init__(self,
                  config: LlavaNextConfig,
-                 vision_language_config: VisionLanguageConfig,
+                 vlm_config: VisionLanguageConfig,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None) -> None:
-        super().__init__(vision_language_config)
+        super().__init__()
 
-        # Update the type annotation from that of its superclass
         self.config = config
+        self.vlm_config = vlm_config
 
-        if self.vision_language_config.image_input_type == (
+        if self.vlm_config.image_input_type == (
                 VisionLanguageConfig.ImageInputType.PIXEL_VALUES):
             self.vision_tower = CLIPVisionModel(config=config.vision_config)
         else:
@@ -146,7 +148,7 @@ class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
             torch.empty(config.text_config.hidden_size))
 
     def _validate_image_pixels(self, data: torch.Tensor) -> torch.Tensor:
-        _, num_channels, _, _ = self.vision_language_config.image_input_shape
+        _, num_channels, _, _ = self.vlm_config.image_input_shape
 
         # Note that this is different from that of vLLM vision_language_config
         # since the image is resized by the HuggingFace preprocessor
@@ -177,7 +179,7 @@ class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
         image_sizes = kwargs.pop("image_sizes", None)
         image_features = kwargs.pop("image_features", None)
 
-        expected_input_type = self.vision_language_config.image_input_type
+        expected_input_type = self.vlm_config.image_input_type
         ImageInputType = VisionLanguageConfig.ImageInputType
 
         if expected_input_type == ImageInputType.PIXEL_VALUES:
@@ -386,7 +388,7 @@ class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
 
             inputs_embeds = merge_vision_embeddings(
                 input_ids, inputs_embeds, vision_embeddings,
-                self.vision_language_config.image_token_id)
+                self.vlm_config.image_token_id)
 
             input_ids = None
         else:
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 59fbf8e1b..ae17309bd 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -26,6 +26,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
+from transformers import PretrainedConfig
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, LoRAConfig
@@ -51,6 +52,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.sequence import SamplerOutput
 
+from .interfaces import SupportsLoRA
+
 
 class MiniCPMMoE(nn.Module):
     """A tensor-parallel MoE implementation that shards each expert
@@ -388,7 +391,9 @@ class MiniCPMModel(nn.Module):
         return hidden_states
 
 
-class MiniCPMForCausalLM(nn.Module):
+class MiniCPMForCausalLM(nn.Module, SupportsLoRA):
+    supports_lora = True
+
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -418,13 +423,16 @@ class MiniCPMForCausalLM(nn.Module):
 
     def __init__(
         self,
-        config,
+        config: PretrainedConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         lora_config: Optional[LoRAConfig] = None,
     ) -> None:
         super().__init__()
+
         self.config = config
+        self.lora_config = lora_config
+
         self.num_experts = getattr(self.config, "num_experts", 0)
         self.quant_config = quant_config
         self.model = MiniCPMModel(config,
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 3faf54d29..0bdcb21e5 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -54,6 +54,8 @@ from vllm.model_executor.utils import set_weight_attrs
 from vllm.sequence import SamplerOutput
 from vllm.utils import print_warning_once
 
+from .interfaces import SupportsLoRA
+
 
 class MixtralMoE(nn.Module):
     """A tensor-parallel MoE implementation for Mixtral that shards each expert
@@ -472,7 +474,9 @@ class MixtralModel(nn.Module):
         return hidden_states
 
 
-class MixtralForCausalLM(nn.Module):
+class MixtralForCausalLM(nn.Module, SupportsLoRA):
+    supports_lora = True
+
     fall_back_to_pt_during_load = False
 
     packed_modules_mapping = {
@@ -504,7 +508,10 @@ class MixtralForCausalLM(nn.Module):
         lora_config: Optional[LoRAConfig] = None,
     ) -> None:
         super().__init__()
+
         self.config = config
+        self.lora_config = lora_config
+
         self.model = MixtralModel(config,
                                   cache_config,
                                   quant_config,
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index c8e61735a..d288bdd9d 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -39,7 +39,7 @@ from typing import Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
-from transformers import PretrainedConfig
+from transformers import PhiConfig
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, LoRAConfig
@@ -59,11 +59,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import SamplerOutput
 
+from .interfaces import SupportsLoRA
+
 
 class PhiAttention(nn.Module):
 
     def __init__(self,
-                 config: PretrainedConfig,
+                 config: PhiConfig,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None):
         super().__init__()
@@ -131,7 +133,7 @@ class PhiAttention(nn.Module):
 class PhiMLP(nn.Module):
 
     def __init__(self,
-                 config: PretrainedConfig,
+                 config: PhiConfig,
                  quant_config: Optional[QuantizationConfig] = None):
         super().__init__()
 
@@ -160,7 +162,7 @@ class PhiMLP(nn.Module):
 class PhiLayer(nn.Module):
 
     def __init__(self,
-                 config: PretrainedConfig,
+                 config: PhiConfig,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None):
         super().__init__()
@@ -192,7 +194,7 @@ class PhiLayer(nn.Module):
 class PhiModel(nn.Module):
 
     def __init__(self,
-                 config: PretrainedConfig,
+                 config: PhiConfig,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None):
         super().__init__()
@@ -229,7 +231,9 @@ class PhiModel(nn.Module):
         return hidden_states
 
 
-class PhiForCausalLM(nn.Module):
+class PhiForCausalLM(nn.Module, SupportsLoRA):
+    supports_lora = True
+
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -250,14 +254,16 @@ class PhiForCausalLM(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: PhiConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         lora_config: Optional[LoRAConfig] = None,
     ):
-        del lora_config  # Unused.
         super().__init__()
+
         self.config = config
+        self.lora_config = lora_config
+
         self.quant_config = quant_config
 
         self.model = PhiModel(config, cache_config, quant_config)
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index b5d13bb6b..d351adcef 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -48,6 +48,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import SamplerOutput
 from vllm.utils import print_warning_once
 
+from .interfaces import SupportsLoRA
+
 
 class Qwen2MLP(nn.Module):
 
@@ -263,7 +265,9 @@ class Qwen2Model(nn.Module):
         return hidden_states
 
 
-class Qwen2ForCausalLM(nn.Module):
+class Qwen2ForCausalLM(nn.Module, SupportsLoRA):
+    supports_lora = True
+
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -293,7 +297,6 @@ class Qwen2ForCausalLM(nn.Module):
         quant_config: Optional[QuantizationConfig] = None,
         lora_config: Optional[LoRAConfig] = None,
     ) -> None:
-        del lora_config
         # TODO (@robertgshaw2): see if this can be moved out
         if (cache_config.sliding_window is not None
                 and hasattr(config, "max_window_layers")):
@@ -307,7 +310,10 @@ class Qwen2ForCausalLM(nn.Module):
                              ))
 
         super().__init__()
+
         self.config = config
+        self.lora_config = lora_config
+
         self.quant_config = quant_config
         self.model = Qwen2Model(config, cache_config, quant_config)
 
diff --git a/vllm/model_executor/models/vlm_base.py b/vllm/model_executor/models/vlm_base.py
deleted file mode 100644
index eb0aa96e5..000000000
--- a/vllm/model_executor/models/vlm_base.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from torch import nn
-
-from vllm.config import VisionLanguageConfig
-
-
-class VisionLanguageModelBase(nn.Module):
-    """Base class for all vision language models (VLMs)."""
-
-    def __init__(self, vision_language_config: VisionLanguageConfig) -> None:
-        super().__init__()
-
-        self.vision_language_config = vision_language_config
diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py
index 1e5280dde..639c3443b 100644
--- a/vllm/model_executor/models/xverse.py
+++ b/vllm/model_executor/models/xverse.py
@@ -45,6 +45,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import SamplerOutput
 
+from .interfaces import SupportsLoRA
+
 
 class XverseMLP(nn.Module):
 
@@ -266,7 +268,9 @@ class XverseModel(nn.Module):
         return hidden_states
 
 
-class XverseForCausalLM(nn.Module):
+class XverseForCausalLM(nn.Module, SupportsLoRA):
+    supports_lora = True
+
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -299,10 +303,13 @@ class XverseForCausalLM(nn.Module):
         config: PretrainedConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
-        lora_config=None,
+        lora_config: Optional[LoRAConfig] = None,
     ) -> None:
         super().__init__()
+
         self.config = config
+        self.lora_config = lora_config
+
         self.quant_config = quant_config
         self.model = XverseModel(config, cache_config, quant_config)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index ac820bbcb..181442490 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -22,6 +22,7 @@ from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
+from vllm.model_executor.models.interfaces import supports_lora
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
@@ -225,14 +226,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                     self.model_memory_usage / float(2**30))
 
         if self.lora_config:
-            assert hasattr(self.model, "supported_lora_modules"
-                           ) and self.model.supported_lora_modules, (
-                               "Model does not support LoRA")
-            assert hasattr(
-                self.model,
-                "embedding_modules"), "Model does not have embedding_modules"
-            assert hasattr(self.model, "embedding_padding_modules"
-                           ), "Model does not have embedding_padding_modules"
+            assert supports_lora(self.model), "Model does not support LoRA"
+
             self.lora_manager = LRUCacheWorkerLoRAManager(
                 self.scheduler_config.max_num_seqs,
                 self.scheduler_config.max_num_batched_tokens,
-- 
GitLab


From 2061f0b8a7f1a01683c4045096a092eedf6387a4 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Thu, 27 Jun 2024 01:29:24 -0700
Subject: [PATCH 169/376] [Bugfix] Fix img_sizes Parsing in Phi3-Vision (#5888)

---
 vllm/model_executor/models/phi3v.py | 26 ++++++--------------------
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index dac832a68..578e22bea 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -65,12 +65,6 @@ class Phi3ImageEmbeddingBase(nn.Module):
         self.type_feature: str
         self.img_processor: CLIPVisionModel
 
-    def set_img_features(self, img_features: torch.FloatTensor) -> None:
-        self.img_features = img_features
-
-    def set_img_sizes(self, img_sizes: torch.LongTensor) -> None:
-        self.img_sizes = img_sizes
-
     def get_img_features(self,
                          img_embeds: torch.FloatTensor) -> torch.FloatTensor:
         LAYER_IDX = self.layer_idx
@@ -144,21 +138,16 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
         self.layer_idx = config.img_processor.get('layer_idx', -2)
         self.type_feature = config.img_processor.get('type_feature', 'patch')
 
-    def forward(self,
-                input_ids: torch.LongTensor,
+    def forward(self, input_ids: torch.LongTensor,
                 pixel_values: torch.FloatTensor,
-                image_sizes=None) -> torch.FloatTensor:
+                image_sizes: torch.Tensor) -> torch.FloatTensor:
         """process and merge text embeddings with image embeddings."""
 
+        # (batch_size, max_num_crops, 3, height, width)
         img_embeds = pixel_values
-        img_sizes = image_sizes
 
-        if self.img_features is not None:
-            img_embeds = self.img_features.clone()
-            self.img_features = None
-
-        if self.img_sizes is not None:
-            img_sizes = self.img_sizes
+        # (batch_size, 2)
+        img_sizes = image_sizes
 
         input_shape = input_ids.size()
         input_ids = input_ids.view(-1, input_shape[-1])
@@ -190,11 +179,8 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
             output_imgs = []
             output_len = []
 
-            if isinstance(img_sizes, torch.Tensor):
-                img_sizes.squeeze_(0)
-
             for _bs in range(bs):
-                h, w = img_sizes
+                h, w = img_sizes[_bs]
                 h = h // 336
                 w = w // 336
                 B_ = h * w
-- 
GitLab


From e9d32d077da2f914c965a66dd2fdfc77c50d117f Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 27 Jun 2024 20:43:17 +0800
Subject: [PATCH 170/376] [CI/Build] [1/3] Reorganize entrypoints tests (#5526)

---
 tests/entrypoints/test_openai_chat.py   | 875 ++++++++++++++++++++++++
 tests/entrypoints/test_openai_server.py | 791 +--------------------
 tests/entrypoints/test_openai_vision.py |   4 +-
 3 files changed, 896 insertions(+), 774 deletions(-)
 create mode 100644 tests/entrypoints/test_openai_chat.py

diff --git a/tests/entrypoints/test_openai_chat.py b/tests/entrypoints/test_openai_chat.py
new file mode 100644
index 000000000..30455e720
--- /dev/null
+++ b/tests/entrypoints/test_openai_chat.py
@@ -0,0 +1,875 @@
+# imports for guided decoding tests
+import json
+import re
+from typing import List
+
+import jsonschema
+import openai  # use the official client for correctness check
+import pytest
+# using Ray for overall ease of process management, parallel requests,
+# and debugging.
+import ray
+import torch
+# downloading lora to test lora requests
+from huggingface_hub import snapshot_download
+from openai import BadRequestError
+
+from ..utils import VLLM_PATH, RemoteOpenAIServer
+
+# any model with a chat template should work here
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+# technically this needs Mistral-7B-v0.1 as base, but we're not testing
+# generation quality here
+LORA_NAME = "typeof/zephyr-7b-beta-lora"
+
+TEST_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "name": {
+            "type": "string"
+        },
+        "age": {
+            "type": "integer"
+        },
+        "skills": {
+            "type": "array",
+            "items": {
+                "type": "string",
+                "maxLength": 10
+            },
+            "minItems": 3
+        },
+        "work history": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "company": {
+                        "type": "string"
+                    },
+                    "duration": {
+                        "type": "string"
+                    },
+                    "position": {
+                        "type": "string"
+                    }
+                },
+                "required": ["company", "position"]
+            }
+        }
+    },
+    "required": ["name", "age", "skills", "work history"]
+}
+
+TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
+              r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
+
+TEST_CHOICE = [
+    "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby",
+    "Swift", "Kotlin"
+]
+
+pytestmark = pytest.mark.openai
+
+
+@pytest.fixture(scope="session")
+def zephyr_lora_files():
+    return snapshot_download(repo_id=LORA_NAME)
+
+
+@pytest.fixture(scope="module")
+def ray_ctx():
+    ray.init(runtime_env={"working_dir": VLLM_PATH})
+    yield
+    ray.shutdown()
+
+
+@pytest.fixture(scope="module")
+def server(zephyr_lora_files, ray_ctx):
+    return RemoteOpenAIServer([
+        "--model",
+        MODEL_NAME,
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        # lora config below
+        "--enable-lora",
+        "--lora-modules",
+        f"zephyr-lora={zephyr_lora_files}",
+        f"zephyr-lora2={zephyr_lora_files}",
+        "--max-lora-rank",
+        "64",
+        "--max-cpu-loras",
+        "2",
+        "--max-num-seqs",
+        "128",
+    ])
+
+
+@pytest.fixture(scope="module")
+def client(server):
+    return server.get_async_client()
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    # first test base model, then test loras
+    "model_name",
+    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+)
+async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role": "user",
+        "content": "what is 1+1?"
+    }]
+
+    chat_completion = await client.chat.completions.create(model=model_name,
+                                                           messages=messages,
+                                                           max_tokens=5,
+                                                           temperature=0.0,
+                                                           logprobs=False)
+
+    choice = chat_completion.choices[0]
+    assert choice.logprobs is None
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    # just test 1 lora hereafter
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role": "user",
+        "content": "what is 1+1?"
+    }]
+
+    chat_completion = await client.chat.completions.create(model=model_name,
+                                                           messages=messages,
+                                                           max_tokens=5,
+                                                           temperature=0.0,
+                                                           logprobs=True,
+                                                           top_logprobs=0)
+
+    choice = chat_completion.choices[0]
+    assert choice.logprobs is not None
+    assert choice.logprobs.content is not None
+    assert len(choice.logprobs.content[0].top_logprobs) == 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role": "user",
+        "content": "what is 1+1?"
+    }]
+
+    chat_completion = await client.chat.completions.create(model=model_name,
+                                                           messages=messages,
+                                                           max_tokens=5,
+                                                           temperature=0.0,
+                                                           logprobs=True,
+                                                           top_logprobs=5)
+
+    choice = chat_completion.choices[0]
+    assert choice.logprobs is not None
+    assert choice.logprobs.content is not None
+    assert len(choice.logprobs.content[0].top_logprobs) == 5
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
+                                      model_name: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role": "user",
+        "content": "what is 1+1?"
+    }]
+
+    # Default max_logprobs is 20, so this should raise an error
+    with pytest.raises((openai.BadRequestError, openai.APIError)):
+        stream = await client.chat.completions.create(model=model_name,
+                                                      messages=messages,
+                                                      max_tokens=10,
+                                                      logprobs=True,
+                                                      top_logprobs=21,
+                                                      stream=True)
+        async for chunk in stream:
+            ...
+
+    with pytest.raises(openai.BadRequestError):
+        await client.chat.completions.create(model=model_name,
+                                             messages=messages,
+                                             max_tokens=10,
+                                             logprobs=True,
+                                             top_logprobs=30,
+                                             stream=False)
+
+    # the server should still work afterwards
+    chat_completion = await client.chat.completions.create(model=model_name,
+                                                           messages=messages,
+                                                           max_tokens=10,
+                                                           stream=False)
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_single_chat_session(client: openai.AsyncOpenAI,
+                                   model_name: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role": "user",
+        "content": "what is 1+1?"
+    }]
+
+    # test single completion
+    chat_completion = await client.chat.completions.create(model=model_name,
+                                                           messages=messages,
+                                                           max_tokens=10,
+                                                           logprobs=True,
+                                                           top_logprobs=5)
+    assert chat_completion.id is not None
+    assert len(chat_completion.choices) == 1
+
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "length"
+    assert chat_completion.usage == openai.types.CompletionUsage(
+        completion_tokens=10, prompt_tokens=37, total_tokens=47)
+
+    message = choice.message
+    assert message.content is not None and len(message.content) >= 10
+    assert message.role == "assistant"
+    messages.append({"role": "assistant", "content": message.content})
+
+    # test multi-turn dialogue
+    messages.append({"role": "user", "content": "express your result in json"})
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+    )
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    # just test 1 lora hereafter
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role": "user",
+        "content": "what is 1+1?"
+    }]
+
+    # test single completion
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+    )
+    output = chat_completion.choices[0].message.content
+    stop_reason = chat_completion.choices[0].finish_reason
+
+    # test streaming
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+        stream=True,
+    )
+    chunks: List[str] = []
+    finish_reason_count = 0
+    async for chunk in stream:
+        delta = chunk.choices[0].delta
+        if delta.role:
+            assert delta.role == "assistant"
+        if delta.content:
+            chunks.append(delta.content)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
+    assert chunk.choices[0].finish_reason == stop_reason
+    assert delta.content
+    assert "".join(chunks) == output
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
+)
+async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
+                                              model_name: str):
+    messages = [{
+        "role": "system",
+        "content": "You are a helpful assistant."
+    }, {
+        "role": "user",
+        "content": "What is the capital of France?"
+    }]
+
+    # Test stream=True, stream_options={"include_usage": False}
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+        stream=True,
+        stream_options={"include_usage": False})
+    async for chunk in stream:
+        assert chunk.usage is None
+
+    # Test stream=True, stream_options={"include_usage": True}
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+        stream=True,
+        stream_options={"include_usage": True})
+
+    async for chunk in stream:
+        if chunk.choices[0].finish_reason is None:
+            assert chunk.usage is None
+        else:
+            assert chunk.usage is None
+            final_chunk = await stream.__anext__()
+            assert final_chunk.usage is not None
+            assert final_chunk.usage.prompt_tokens > 0
+            assert final_chunk.usage.completion_tokens > 0
+            assert final_chunk.usage.total_tokens == (
+                final_chunk.usage.prompt_tokens +
+                final_chunk.usage.completion_tokens)
+            assert final_chunk.choices == []
+
+    # Test stream=False, stream_options={"include_usage": None}
+    with pytest.raises(BadRequestError):
+        await client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+            max_tokens=10,
+            temperature=0.0,
+            stream=False,
+            stream_options={"include_usage": None})
+
+    # Test stream=False, stream_options={"include_usage": True}
+    with pytest.raises(BadRequestError):
+        await client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+            max_tokens=10,
+            temperature=0.0,
+            stream=False,
+            stream_options={"include_usage": True})
+
+
+# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
+# (i.e. using the same ordering as in the Completions API tests), the test
+# will fail on the second `guided_decoding_backend` even when I swap their order
+# (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256)
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_choice_chat(client: openai.AsyncOpenAI,
+                                  guided_decoding_backend: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        "The best language for type-safe systems programming is "
+    }]
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=10,
+        extra_body=dict(guided_choice=TEST_CHOICE,
+                        guided_decoding_backend=guided_decoding_backend))
+    choice1 = chat_completion.choices[0].message.content
+    assert choice1 in TEST_CHOICE
+
+    messages.append({"role": "assistant", "content": choice1})
+    messages.append({
+        "role": "user",
+        "content": "I disagree, pick another one"
+    })
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=10,
+        extra_body=dict(guided_choice=TEST_CHOICE,
+                        guided_decoding_backend=guided_decoding_backend))
+    choice2 = chat_completion.choices[0].message.content
+    assert choice2 in TEST_CHOICE
+    assert choice1 != choice2
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_json_chat(client: openai.AsyncOpenAI,
+                                guided_decoding_backend: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        f"Give an example JSON for an employee profile that "
+        f"fits this schema: {TEST_SCHEMA}"
+    }]
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=1000,
+        extra_body=dict(guided_json=TEST_SCHEMA,
+                        guided_decoding_backend=guided_decoding_backend))
+    message = chat_completion.choices[0].message
+    assert message.content is not None
+    json1 = json.loads(message.content)
+    jsonschema.validate(instance=json1, schema=TEST_SCHEMA)
+
+    messages.append({"role": "assistant", "content": message.content})
+    messages.append({
+        "role":
+        "user",
+        "content":
+        "Give me another one with a different name and age"
+    })
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=1000,
+        extra_body=dict(guided_json=TEST_SCHEMA,
+                        guided_decoding_backend=guided_decoding_backend))
+    message = chat_completion.choices[0].message
+    assert message.content is not None
+    json2 = json.loads(message.content)
+    jsonschema.validate(instance=json2, schema=TEST_SCHEMA)
+    assert json1["name"] != json2["name"]
+    assert json1["age"] != json2["age"]
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_regex_chat(client: openai.AsyncOpenAI,
+                                 guided_decoding_backend: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        f"Give an example IP address with this regex: {TEST_REGEX}"
+    }]
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=20,
+        extra_body=dict(guided_regex=TEST_REGEX,
+                        guided_decoding_backend=guided_decoding_backend))
+    ip1 = chat_completion.choices[0].message.content
+    assert ip1 is not None
+    assert re.fullmatch(TEST_REGEX, ip1) is not None
+
+    messages.append({"role": "assistant", "content": ip1})
+    messages.append({"role": "user", "content": "Give me a different one"})
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=20,
+        extra_body=dict(guided_regex=TEST_REGEX,
+                        guided_decoding_backend=guided_decoding_backend))
+    ip2 = chat_completion.choices[0].message.content
+    assert ip2 is not None
+    assert re.fullmatch(TEST_REGEX, ip2) is not None
+    assert ip1 != ip2
+
+
+@pytest.mark.asyncio
+async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        "The best language for type-safe systems programming is "
+    }]
+
+    with pytest.raises(openai.BadRequestError):
+        _ = await client.chat.completions.create(model=MODEL_NAME,
+                                                 messages=messages,
+                                                 extra_body=dict(guided_regex={
+                                                     1: "Python",
+                                                     2: "C++"
+                                                 }))
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
+                                           guided_decoding_backend: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        "The best language for type-safe systems programming is "
+    }]
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=10,
+        logprobs=True,
+        top_logprobs=5,
+        extra_body=dict(guided_choice=TEST_CHOICE,
+                        guided_decoding_backend=guided_decoding_backend))
+
+    assert chat_completion.choices[0].logprobs is not None
+    assert chat_completion.choices[0].logprobs.content is not None
+    top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs
+
+    # -9999.0 is the minimum logprob returned by OpenAI
+    for item in top_logprobs:
+        assert item.logprob >= -9999.0, f"Failed (top_logprobs={top_logprobs})"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_named_tool_use(client: openai.AsyncOpenAI,
+                              guided_decoding_backend: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        f"Give an example JSON for an employee profile that "
+        f"fits this schema: {TEST_SCHEMA}"
+    }]
+
+    # non-streaming
+
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=1000,
+        tools=[{
+            "type": "function",
+            "function": {
+                "name": "dummy_function_name",
+                "description": "This is a dummy function",
+                "parameters": TEST_SCHEMA
+            }
+        }],
+        tool_choice={
+            "type": "function",
+            "function": {
+                "name": "dummy_function_name"
+            }
+        })
+    message = chat_completion.choices[0].message
+    assert len(message.content) == 0
+    json_string = message.tool_calls[0].function.arguments
+    json1 = json.loads(json_string)
+    jsonschema.validate(instance=json1, schema=TEST_SCHEMA)
+
+    messages.append({"role": "assistant", "content": json_string})
+    messages.append({
+        "role":
+        "user",
+        "content":
+        "Give me another one with a different name and age"
+    })
+
+    # streaming
+
+    stream = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=1000,
+        tools=[{
+            "type": "function",
+            "function": {
+                "name": "dummy_function_name",
+                "description": "This is a dummy function",
+                "parameters": TEST_SCHEMA
+            }
+        }],
+        tool_choice={
+            "type": "function",
+            "function": {
+                "name": "dummy_function_name"
+            }
+        },
+        stream=True)
+
+    output = []
+    finish_reason_count = 0
+    async for chunk in stream:
+        delta = chunk.choices[0].delta
+        if delta.role:
+            assert delta.role == "assistant"
+        assert delta.content is None or len(delta.content) == 0
+        if delta.tool_calls:
+            output.append(delta.tool_calls[0].function.arguments)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
+    json2 = json.loads("".join(output))
+    jsonschema.validate(instance=json2, schema=TEST_SCHEMA)
+    assert json1["name"] != json2["name"]
+    assert json1["age"] != json2["age"]
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
+async def test_required_tool_use_not_yet_supported(
+        client: openai.AsyncOpenAI, guided_decoding_backend: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        f"Give an example JSON for an employee profile that "
+        f"fits this schema: {TEST_SCHEMA}"
+    }]
+
+    with pytest.raises(openai.BadRequestError):
+        await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=messages,
+            max_tokens=1000,
+            tools=[{
+                "type": "function",
+                "function": {
+                    "name": "dummy_function_name",
+                    "description": "This is a dummy function",
+                    "parameters": TEST_SCHEMA
+                }
+            }],
+            tool_choice="required")
+
+    with pytest.raises(openai.BadRequestError):
+        await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=messages,
+            max_tokens=1000,
+            tools=[{
+                "type": "function",
+                "function": {
+                    "name": "dummy_function_name",
+                    "description": "This is a dummy function",
+                    "parameters": TEST_SCHEMA
+                }
+            }],
+            tool_choice="auto")
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
+async def test_inconsistent_tool_choice_and_tools(
+        client: openai.AsyncOpenAI, guided_decoding_backend: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        f"Give an example JSON for an employee profile that "
+        f"fits this schema: {TEST_SCHEMA}"
+    }]
+
+    with pytest.raises(openai.BadRequestError):
+        await client.chat.completions.create(model=MODEL_NAME,
+                                             messages=messages,
+                                             max_tokens=1000,
+                                             tool_choice={
+                                                 "type": "function",
+                                                 "function": {
+                                                     "name":
+                                                     "dummy_function_name"
+                                                 }
+                                             })
+
+    with pytest.raises(openai.BadRequestError):
+        await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=messages,
+            max_tokens=1000,
+            tools=[{
+                "type": "function",
+                "function": {
+                    "name": "dummy_function_name",
+                    "description": "This is a dummy function",
+                    "parameters": TEST_SCHEMA
+                }
+            }],
+            tool_choice={
+                "type": "function",
+                "function": {
+                    "name": "nondefined_function_name"
+                }
+            })
+
+
+@pytest.mark.asyncio
+async def test_response_format_json_object(client: openai.AsyncOpenAI):
+    for _ in range(2):
+        resp = await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=[{
+                "role":
+                "user",
+                "content": ('what is 1+1? please respond with a JSON object, '
+                            'the format is {"result": 2}')
+            }],
+            response_format={"type": "json_object"})
+
+        content = resp.choices[0].message.content
+        assert content is not None
+
+        loaded = json.loads(content)
+        assert loaded == {"result": 2}, loaded
+
+
+@pytest.mark.asyncio
+async def test_extra_fields(client: openai.AsyncOpenAI):
+    with pytest.raises(BadRequestError) as exc_info:
+        await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=[{
+                "role": "system",
+                "content": "You are a helpful assistant.",
+                "extra_field": "0",
+            }],  # type: ignore
+            temperature=0,
+            seed=0)
+
+    assert "extra_forbidden" in exc_info.value.message
+
+
+@pytest.mark.asyncio
+async def test_complex_message_content(client: openai.AsyncOpenAI):
+    resp = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=[{
+            "role":
+            "user",
+            "content": [{
+                "type":
+                "text",
+                "text":
+                "what is 1+1? please provide the result without any other text."
+            }]
+        }],
+        temperature=0,
+        seed=0)
+    content = resp.choices[0].message.content
+    assert content == "2"
+
+
+@pytest.mark.asyncio
+async def test_custom_role(client: openai.AsyncOpenAI):
+    # Not sure how the model handles custom roles so we just check that
+    # both string and complex message content are handled in the same way
+
+    resp1 = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=[{
+            "role": "my-custom-role",
+            "content": "what is 1+1?",
+        }],  # type: ignore
+        temperature=0,
+        seed=0)
+
+    resp2 = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=[{
+            "role": "my-custom-role",
+            "content": [{
+                "type": "text",
+                "text": "what is 1+1?"
+            }]
+        }],  # type: ignore
+        temperature=0,
+        seed=0)
+
+    content1 = resp1.choices[0].message.content
+    content2 = resp2.choices[0].message.content
+    assert content1 == content2
+
+
+@pytest.mark.asyncio
+async def test_long_seed(client: openai.AsyncOpenAI):
+    for seed in [
+            torch.iinfo(torch.long).min - 1,
+            torch.iinfo(torch.long).max + 1
+    ]:
+        with pytest.raises(BadRequestError) as exc_info:
+            await client.chat.completions.create(
+                model=MODEL_NAME,
+                messages=[{
+                    "role": "system",
+                    "content": "You are a helpful assistant.",
+                }],
+                temperature=0,
+                seed=seed)
+
+        assert ("greater_than_equal" in exc_info.value.message
+                or "less_than_equal" in exc_info.value.message)
diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index 4d9bfb460..14f59ea66 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -10,7 +10,6 @@ import pytest
 # and debugging.
 import ray
 import requests
-import torch
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
 from openai import BadRequestError
@@ -117,6 +116,7 @@ def client(server):
     return server.get_async_client()
 
 
+@pytest.mark.asyncio
 async def test_check_models(client: openai.AsyncOpenAI):
     models = await client.models.list()
     models = models.data
@@ -266,174 +266,6 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
     assert len(completion.choices[0].text) >= 0
 
 
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # first test base model, then test loras
-    "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
-)
-async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
-
-    chat_completion = await client.chat.completions.create(model=model_name,
-                                                           messages=messages,
-                                                           max_tokens=5,
-                                                           temperature=0.0,
-                                                           logprobs=False)
-
-    choice = chat_completion.choices[0]
-    assert choice.logprobs is None
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # just test 1 lora hereafter
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
-
-    chat_completion = await client.chat.completions.create(model=model_name,
-                                                           messages=messages,
-                                                           max_tokens=5,
-                                                           temperature=0.0,
-                                                           logprobs=True,
-                                                           top_logprobs=0)
-
-    choice = chat_completion.choices[0]
-    assert choice.logprobs is not None
-    assert choice.logprobs.content is not None
-    assert len(choice.logprobs.content[0].top_logprobs) == 0
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
-
-    chat_completion = await client.chat.completions.create(model=model_name,
-                                                           messages=messages,
-                                                           max_tokens=5,
-                                                           temperature=0.0,
-                                                           logprobs=True,
-                                                           top_logprobs=5)
-
-    choice = chat_completion.choices[0]
-    assert choice.logprobs is not None
-    assert choice.logprobs.content is not None
-    assert len(choice.logprobs.content[0].top_logprobs) == 5
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
-                                      model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
-
-    # Default max_logprobs is 20, so this should raise an error
-    with pytest.raises((openai.BadRequestError, openai.APIError)):
-        stream = await client.chat.completions.create(model=model_name,
-                                                      messages=messages,
-                                                      max_tokens=10,
-                                                      logprobs=True,
-                                                      top_logprobs=21,
-                                                      stream=True)
-        async for chunk in stream:
-            ...
-
-    with pytest.raises(openai.BadRequestError):
-        await client.chat.completions.create(model=model_name,
-                                             messages=messages,
-                                             max_tokens=10,
-                                             logprobs=True,
-                                             top_logprobs=30,
-                                             stream=False)
-
-    # the server should still work afterwards
-    chat_completion = await client.chat.completions.create(model=model_name,
-                                                           messages=messages,
-                                                           max_tokens=10,
-                                                           stream=False)
-    message = chat_completion.choices[0].message
-    assert message.content is not None and len(message.content) >= 0
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_single_chat_session(client: openai.AsyncOpenAI,
-                                   model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
-
-    # test single completion
-    chat_completion = await client.chat.completions.create(model=model_name,
-                                                           messages=messages,
-                                                           max_tokens=10,
-                                                           logprobs=True,
-                                                           top_logprobs=5)
-    assert chat_completion.id is not None
-    assert len(chat_completion.choices) == 1
-
-    choice = chat_completion.choices[0]
-    assert choice.finish_reason == "length"
-    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=37, total_tokens=47)
-
-    message = choice.message
-    assert message.content is not None and len(message.content) >= 10
-    assert message.role == "assistant"
-    messages.append({"role": "assistant", "content": message.content})
-
-    # test multi-turn dialogue
-    messages.append({"role": "user", "content": "express your result in json"})
-    chat_completion = await client.chat.completions.create(
-        model=model_name,
-        messages=messages,
-        max_tokens=10,
-    )
-    message = chat_completion.choices[0].message
-    assert message.content is not None and len(message.content) >= 0
-
-
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
@@ -468,126 +300,6 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
     assert "".join(chunks) == single_output
 
 
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # just test 1 lora hereafter
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
-
-    # test single completion
-    chat_completion = await client.chat.completions.create(
-        model=model_name,
-        messages=messages,
-        max_tokens=10,
-        temperature=0.0,
-    )
-    output = chat_completion.choices[0].message.content
-    stop_reason = chat_completion.choices[0].finish_reason
-
-    # test streaming
-    stream = await client.chat.completions.create(
-        model=model_name,
-        messages=messages,
-        max_tokens=10,
-        temperature=0.0,
-        stream=True,
-    )
-    chunks: List[str] = []
-    finish_reason_count = 0
-    async for chunk in stream:
-        delta = chunk.choices[0].delta
-        if delta.role:
-            assert delta.role == "assistant"
-        if delta.content:
-            chunks.append(delta.content)
-        if chunk.choices[0].finish_reason is not None:
-            finish_reason_count += 1
-    # finish reason should only return in last block
-    assert finish_reason_count == 1
-    assert chunk.choices[0].finish_reason == stop_reason
-    assert delta.content
-    assert "".join(chunks) == output
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
-)
-async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
-                                              model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "You are a helpful assistant."
-    }, {
-        "role": "user",
-        "content": "What is the capital of France?"
-    }]
-
-    # Test stream=True, stream_options={"include_usage": False}
-    stream = await client.chat.completions.create(
-        model=model_name,
-        messages=messages,
-        max_tokens=10,
-        temperature=0.0,
-        stream=True,
-        stream_options={"include_usage": False})
-    async for chunk in stream:
-        assert chunk.usage is None
-
-    # Test stream=True, stream_options={"include_usage": True}
-    stream = await client.chat.completions.create(
-        model=model_name,
-        messages=messages,
-        max_tokens=10,
-        temperature=0.0,
-        stream=True,
-        stream_options={"include_usage": True})
-
-    async for chunk in stream:
-        if chunk.choices[0].finish_reason is None:
-            assert chunk.usage is None
-        else:
-            assert chunk.usage is None
-            final_chunk = await stream.__anext__()
-            assert final_chunk.usage is not None
-            assert final_chunk.usage.prompt_tokens > 0
-            assert final_chunk.usage.completion_tokens > 0
-            assert final_chunk.usage.total_tokens == (
-                final_chunk.usage.prompt_tokens +
-                final_chunk.usage.completion_tokens)
-            assert final_chunk.choices == []
-
-    # Test stream=False, stream_options={"include_usage": None}
-    with pytest.raises(BadRequestError):
-        await client.chat.completions.create(
-            model=model_name,
-            messages=messages,
-            max_tokens=10,
-            temperature=0.0,
-            stream=False,
-            stream_options={"include_usage": None})
-
-    # Test stream=False, stream_options={"include_usage": True}
-    with pytest.raises(BadRequestError):
-        await client.chat.completions.create(
-            model=model_name,
-            messages=messages,
-            max_tokens=10,
-            temperature=0.0,
-            stream=False,
-            stream_options={"include_usage": True})
-
-
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
@@ -773,53 +485,6 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI,
         jsonschema.validate(instance=output_json, schema=TEST_SCHEMA)
 
 
-@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
-async def test_guided_json_chat(client: openai.AsyncOpenAI,
-                                guided_decoding_backend: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role":
-        "user",
-        "content":
-        f"Give an example JSON for an employee profile that "
-        f"fits this schema: {TEST_SCHEMA}"
-    }]
-    chat_completion = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=messages,
-        max_tokens=1000,
-        extra_body=dict(guided_json=TEST_SCHEMA,
-                        guided_decoding_backend=guided_decoding_backend))
-    message = chat_completion.choices[0].message
-    assert message.content is not None
-    json1 = json.loads(message.content)
-    jsonschema.validate(instance=json1, schema=TEST_SCHEMA)
-
-    messages.append({"role": "assistant", "content": message.content})
-    messages.append({
-        "role":
-        "user",
-        "content":
-        "Give me another one with a different name and age"
-    })
-    chat_completion = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=messages,
-        max_tokens=1000,
-        extra_body=dict(guided_json=TEST_SCHEMA,
-                        guided_decoding_backend=guided_decoding_backend))
-    message = chat_completion.choices[0].message
-    assert message.content is not None
-    json2 = json.loads(message.content)
-    jsonschema.validate(instance=json2, schema=TEST_SCHEMA)
-    assert json1["name"] != json2["name"]
-    assert json1["age"] != json2["age"]
-
-
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
@@ -840,44 +505,6 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI,
         assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None
 
 
-@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
-async def test_guided_regex_chat(client: openai.AsyncOpenAI,
-                                 guided_decoding_backend: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role":
-        "user",
-        "content":
-        f"Give an example IP address with this regex: {TEST_REGEX}"
-    }]
-    chat_completion = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=messages,
-        max_tokens=20,
-        extra_body=dict(guided_regex=TEST_REGEX,
-                        guided_decoding_backend=guided_decoding_backend))
-    ip1 = chat_completion.choices[0].message.content
-    assert ip1 is not None
-    assert re.fullmatch(TEST_REGEX, ip1) is not None
-
-    messages.append({"role": "assistant", "content": ip1})
-    messages.append({"role": "user", "content": "Give me a different one"})
-    chat_completion = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=messages,
-        max_tokens=20,
-        extra_body=dict(guided_regex=TEST_REGEX,
-                        guided_decoding_backend=guided_decoding_backend))
-    ip2 = chat_completion.choices[0].message.content
-    assert ip2 is not None
-    assert re.fullmatch(TEST_REGEX, ip2) is not None
-    assert ip1 != ip2
-
-
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
@@ -898,385 +525,6 @@ async def test_guided_choice_completion(client: openai.AsyncOpenAI,
         assert completion.choices[i].text in TEST_CHOICE
 
 
-@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
-async def test_guided_choice_chat(client: openai.AsyncOpenAI,
-                                  guided_decoding_backend: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role":
-        "user",
-        "content":
-        "The best language for type-safe systems programming is "
-    }]
-    chat_completion = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=messages,
-        max_tokens=10,
-        extra_body=dict(guided_choice=TEST_CHOICE,
-                        guided_decoding_backend=guided_decoding_backend))
-    choice1 = chat_completion.choices[0].message.content
-    assert choice1 in TEST_CHOICE
-
-    messages.append({"role": "assistant", "content": choice1})
-    messages.append({
-        "role": "user",
-        "content": "I disagree, pick another one"
-    })
-    chat_completion = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=messages,
-        max_tokens=10,
-        extra_body=dict(guided_choice=TEST_CHOICE,
-                        guided_decoding_backend=guided_decoding_backend))
-    choice2 = chat_completion.choices[0].message.content
-    assert choice2 in TEST_CHOICE
-    assert choice1 != choice2
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
-async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
-                                          guided_decoding_backend: str):
-    with pytest.raises(openai.BadRequestError):
-        _ = await client.completions.create(
-            model=MODEL_NAME,
-            prompt="Give an example JSON that fits this schema: 42",
-            extra_body=dict(guided_json=42,
-                            guided_decoding_backend=guided_decoding_backend))
-
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role":
-        "user",
-        "content":
-        "The best language for type-safe systems programming is "
-    }]
-    with pytest.raises(openai.BadRequestError):
-        _ = await client.chat.completions.create(model=MODEL_NAME,
-                                                 messages=messages,
-                                                 extra_body=dict(guided_regex={
-                                                     1: "Python",
-                                                     2: "C++"
-                                                 }))
-
-    with pytest.raises(openai.BadRequestError):
-        _ = await client.completions.create(
-            model=MODEL_NAME,
-            prompt="Give an example string that fits this regex",
-            extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA))
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
-async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
-                                           guided_decoding_backend: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role":
-        "user",
-        "content":
-        "The best language for type-safe systems programming is "
-    }]
-    chat_completion = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=messages,
-        max_tokens=10,
-        logprobs=True,
-        top_logprobs=5,
-        extra_body=dict(guided_choice=TEST_CHOICE,
-                        guided_decoding_backend=guided_decoding_backend))
-
-    assert chat_completion.choices[0].logprobs is not None
-    assert chat_completion.choices[0].logprobs.content is not None
-    top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs
-
-    # -9999.0 is the minimum logprob returned by OpenAI
-    for item in top_logprobs:
-        assert item.logprob >= -9999.0, f"Failed (top_logprobs={top_logprobs})"
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
-async def test_named_tool_use(client: openai.AsyncOpenAI,
-                              guided_decoding_backend: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role":
-        "user",
-        "content":
-        f"Give an example JSON for an employee profile that "
-        f"fits this schema: {TEST_SCHEMA}"
-    }]
-
-    # non-streaming
-
-    chat_completion = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=messages,
-        max_tokens=1000,
-        tools=[{
-            "type": "function",
-            "function": {
-                "name": "dummy_function_name",
-                "description": "This is a dummy function",
-                "parameters": TEST_SCHEMA
-            }
-        }],
-        tool_choice={
-            "type": "function",
-            "function": {
-                "name": "dummy_function_name"
-            }
-        })
-    message = chat_completion.choices[0].message
-    assert len(message.content) == 0
-    json_string = message.tool_calls[0].function.arguments
-    json1 = json.loads(json_string)
-    jsonschema.validate(instance=json1, schema=TEST_SCHEMA)
-
-    messages.append({"role": "assistant", "content": json_string})
-    messages.append({
-        "role":
-        "user",
-        "content":
-        "Give me another one with a different name and age"
-    })
-
-    # streaming
-
-    stream = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=messages,
-        max_tokens=1000,
-        tools=[{
-            "type": "function",
-            "function": {
-                "name": "dummy_function_name",
-                "description": "This is a dummy function",
-                "parameters": TEST_SCHEMA
-            }
-        }],
-        tool_choice={
-            "type": "function",
-            "function": {
-                "name": "dummy_function_name"
-            }
-        },
-        stream=True)
-
-    output = []
-    finish_reason_count = 0
-    async for chunk in stream:
-        delta = chunk.choices[0].delta
-        if delta.role:
-            assert delta.role == "assistant"
-        assert delta.content is None or len(delta.content) == 0
-        if delta.tool_calls:
-            output.append(delta.tool_calls[0].function.arguments)
-        if chunk.choices[0].finish_reason is not None:
-            finish_reason_count += 1
-    # finish reason should only return in last block
-    assert finish_reason_count == 1
-    json2 = json.loads("".join(output))
-    jsonschema.validate(instance=json2, schema=TEST_SCHEMA)
-    assert json1["name"] != json2["name"]
-    assert json1["age"] != json2["age"]
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
-async def test_required_tool_use_not_yet_supported(
-        client: openai.AsyncOpenAI, guided_decoding_backend: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role":
-        "user",
-        "content":
-        f"Give an example JSON for an employee profile that "
-        f"fits this schema: {TEST_SCHEMA}"
-    }]
-
-    with pytest.raises(openai.BadRequestError):
-        await client.chat.completions.create(
-            model=MODEL_NAME,
-            messages=messages,
-            max_tokens=1000,
-            tools=[{
-                "type": "function",
-                "function": {
-                    "name": "dummy_function_name",
-                    "description": "This is a dummy function",
-                    "parameters": TEST_SCHEMA
-                }
-            }],
-            tool_choice="required")
-
-    with pytest.raises(openai.BadRequestError):
-        await client.chat.completions.create(
-            model=MODEL_NAME,
-            messages=messages,
-            max_tokens=1000,
-            tools=[{
-                "type": "function",
-                "function": {
-                    "name": "dummy_function_name",
-                    "description": "This is a dummy function",
-                    "parameters": TEST_SCHEMA
-                }
-            }],
-            tool_choice="auto")
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
-async def test_inconsistent_tool_choice_and_tools(
-        client: openai.AsyncOpenAI, guided_decoding_backend: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role":
-        "user",
-        "content":
-        f"Give an example JSON for an employee profile that "
-        f"fits this schema: {TEST_SCHEMA}"
-    }]
-
-    with pytest.raises(openai.BadRequestError):
-        await client.chat.completions.create(model=MODEL_NAME,
-                                             messages=messages,
-                                             max_tokens=1000,
-                                             tool_choice={
-                                                 "type": "function",
-                                                 "function": {
-                                                     "name":
-                                                     "dummy_function_name"
-                                                 }
-                                             })
-
-    with pytest.raises(openai.BadRequestError):
-        await client.chat.completions.create(
-            model=MODEL_NAME,
-            messages=messages,
-            max_tokens=1000,
-            tools=[{
-                "type": "function",
-                "function": {
-                    "name": "dummy_function_name",
-                    "description": "This is a dummy function",
-                    "parameters": TEST_SCHEMA
-                }
-            }],
-            tool_choice={
-                "type": "function",
-                "function": {
-                    "name": "nondefined_function_name"
-                }
-            })
-
-
-@pytest.mark.asyncio
-async def test_response_format_json_object(client: openai.AsyncOpenAI):
-    for _ in range(2):
-        resp = await client.chat.completions.create(
-            model=MODEL_NAME,
-            messages=[{
-                "role":
-                "user",
-                "content": ('what is 1+1? please respond with a JSON object, '
-                            'the format is {"result": 2}')
-            }],
-            response_format={"type": "json_object"})
-
-        content = resp.choices[0].message.content
-        assert content is not None
-
-        loaded = json.loads(content)
-        assert loaded == {"result": 2}, loaded
-
-
-@pytest.mark.asyncio
-async def test_extra_fields(client: openai.AsyncOpenAI):
-    with pytest.raises(BadRequestError) as exc_info:
-        await client.chat.completions.create(
-            model=MODEL_NAME,
-            messages=[{
-                "role": "system",
-                "content": "You are a helpful assistant.",
-                "extra_field": "0",
-            }],  # type: ignore
-            temperature=0,
-            seed=0)
-
-    assert "extra_forbidden" in exc_info.value.message
-
-
-@pytest.mark.asyncio
-async def test_complex_message_content(client: openai.AsyncOpenAI):
-    resp = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=[{
-            "role":
-            "user",
-            "content": [{
-                "type":
-                "text",
-                "text":
-                "what is 1+1? please provide the result without any other text."
-            }]
-        }],
-        temperature=0,
-        seed=0)
-    content = resp.choices[0].message.content
-    assert content == "2"
-
-
-@pytest.mark.asyncio
-async def test_custom_role(client: openai.AsyncOpenAI):
-    # Not sure how the model handles custom roles so we just check that
-    # both string and complex message content are handled in the same way
-
-    resp1 = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=[{
-            "role": "my-custom-role",
-            "content": "what is 1+1?",
-        }],  # type: ignore
-        temperature=0,
-        seed=0)
-
-    resp2 = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=[{
-            "role": "my-custom-role",
-            "content": [{
-                "type": "text",
-                "text": "what is 1+1?"
-            }]
-        }],  # type: ignore
-        temperature=0,
-        seed=0)
-
-    content1 = resp1.choices[0].message.content
-    content2 = resp2.choices[0].message.content
-    assert content1 == content2
-
-
 @pytest.mark.asyncio
 async def test_guided_grammar(client: openai.AsyncOpenAI):
     simple_sql_grammar = """
@@ -1348,23 +596,22 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-async def test_long_seed(client: openai.AsyncOpenAI):
-    for seed in [
-            torch.iinfo(torch.long).min - 1,
-            torch.iinfo(torch.long).max + 1
-    ]:
-        with pytest.raises(BadRequestError) as exc_info:
-            await client.chat.completions.create(
-                model=MODEL_NAME,
-                messages=[{
-                    "role": "system",
-                    "content": "You are a helpful assistant.",
-                }],
-                temperature=0,
-                seed=seed)
-
-        assert ("greater_than_equal" in exc_info.value.message
-                or "less_than_equal" in exc_info.value.message)
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
+                                          guided_decoding_backend: str):
+    with pytest.raises(openai.BadRequestError):
+        _ = await client.completions.create(
+            model=MODEL_NAME,
+            prompt="Give an example JSON that fits this schema: 42",
+            extra_body=dict(guided_json=42,
+                            guided_decoding_backend=guided_decoding_backend))
+
+    with pytest.raises(openai.BadRequestError):
+        _ = await client.completions.create(
+            model=MODEL_NAME,
+            prompt="Give an example string that fits this regex",
+            extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA))
 
 
 @pytest.mark.asyncio
@@ -1372,7 +619,7 @@ async def test_long_seed(client: openai.AsyncOpenAI):
     "model_name",
     [MODEL_NAME],
 )
-async def test_tokenize(server, client: openai.AsyncOpenAI, model_name: str):
+async def test_tokenize(client: openai.AsyncOpenAI, model_name: str):
     base_url = str(client.base_url)[:-3]
     tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")
 
@@ -1399,7 +646,7 @@ async def test_tokenize(server, client: openai.AsyncOpenAI, model_name: str):
     "model_name",
     [MODEL_NAME],
 )
-async def test_detokenize(server, client: openai.AsyncOpenAI, model_name: str):
+async def test_detokenize(client: openai.AsyncOpenAI, model_name: str):
     base_url = str(client.base_url)[:-3]
     tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")
 
diff --git a/tests/entrypoints/test_openai_vision.py b/tests/entrypoints/test_openai_vision.py
index 0e8d88b76..df092680a 100644
--- a/tests/entrypoints/test_openai_vision.py
+++ b/tests/entrypoints/test_openai_vision.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Dict
+from typing import Dict, List
 
 import openai
 import pytest
@@ -216,7 +216,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
         temperature=0.0,
         stream=True,
     )
-    chunks = []
+    chunks: List[str] = []
     finish_reason_count = 0
     async for chunk in stream:
         delta = chunk.choices[0].delta
-- 
GitLab


From 98cf2ed678580326ffc39c987304c61cb0ce4981 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 28 Jun 2024 00:08:10 +0800
Subject: [PATCH 171/376] [Model][Bugfix] Implicit model flags and reenable
 Phi-3-Vision (#5896)

---
 vllm/model_executor/models/baichuan.py    |  2 --
 vllm/model_executor/models/chatglm.py     |  2 --
 vllm/model_executor/models/gemma.py       |  2 --
 vllm/model_executor/models/gpt_bigcode.py |  2 --
 vllm/model_executor/models/interfaces.py  | 18 ++++++++++++++++--
 vllm/model_executor/models/llama.py       |  2 --
 vllm/model_executor/models/llava.py       |  2 --
 vllm/model_executor/models/llava_next.py  |  2 --
 vllm/model_executor/models/minicpm.py     |  2 --
 vllm/model_executor/models/mixtral.py     |  2 --
 vllm/model_executor/models/phi.py         |  2 --
 vllm/model_executor/models/phi3v.py       | 16 ++++++++++------
 vllm/model_executor/models/qwen2.py       |  2 --
 vllm/model_executor/models/xverse.py      |  2 --
 14 files changed, 26 insertions(+), 32 deletions(-)

diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index abaefa3cf..5cf5a199b 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -295,8 +295,6 @@ class BaiChuanModel(nn.Module):
 
 
 class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     packed_modules_mapping = {
         "W_pack": ["W_pack"],
         "gate_up_proj": [
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index bf64538ef..5b5a69447 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -325,8 +325,6 @@ class ChatGLMModel(nn.Module):
 
 
 class ChatGLMForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     packed_modules_mapping = {
         "query_key_value": ["query_key_value"],
         "dense_h_to_4h": ["dense_h_to_4h"]
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 9e071a155..ce97fc808 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -291,8 +291,6 @@ class GemmaModel(nn.Module):
 
 
 class GemmaForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index 009d7b149..17bbe4e31 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -233,8 +233,6 @@ class GPTBigCodeModel(nn.Module):
 
 
 class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     packed_modules_mapping = {"c_attn": ["c_attn"]}
 
     supported_lora_modules = ["c_fc", "c_proj", "wte", "lm_head", "c_attn"]
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index a9eb397a5..cb0fc154a 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -13,7 +13,14 @@ logger = init_logger(__name__)
 class SupportsVision(Protocol):
     """The interface required for all vision language models (VLMs)."""
 
-    supports_vision: ClassVar[Literal[True]]
+    supports_vision: ClassVar[Literal[True]] = True
+    """
+    A flag that indicates this model supports vision inputs.
+
+    Note:
+        There is no need to redefine this flag if this class is in the
+        MRO of your model class.
+    """
 
     def __init__(self, *, vlm_config: VisionLanguageConfig) -> None:
         ...
@@ -52,7 +59,14 @@ def supports_vision(
 class SupportsLoRA(Protocol):
     """The interface required for all models that support LoRA."""
 
-    supports_lora: ClassVar[Literal[True]]
+    supports_lora: ClassVar[Literal[True]] = True
+    """
+    A flag that indicates this model supports LoRA.
+
+    Note:
+        There is no need to redefine this flag if this class is in the
+        MRO of your model class.
+    """
 
     packed_modules_mapping: ClassVar[Dict[str, List[str]]]
     supported_lora_modules: ClassVar[List[str]]
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index f4918cbfe..54d01701f 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -299,8 +299,6 @@ class LlamaModel(nn.Module):
 
 
 class LlamaForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 8e18b42b7..125e3ddea 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -88,8 +88,6 @@ LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageFeatureInputs]
 @MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
 class LlavaForConditionalGeneration(nn.Module, SupportsVision):
 
-    supports_vision = True
-
     def __init__(self,
                  config: LlavaConfig,
                  vlm_config: VisionLanguageConfig,
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 5c03fb370..841818d8d 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -108,8 +108,6 @@ def _image_pixel_processor(
 @MULTIMODAL_REGISTRY.register_dummy_data(_get_dummy_image_data)
 class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
 
-    supports_vision = True
-
     def __init__(self,
                  config: LlavaNextConfig,
                  vlm_config: VisionLanguageConfig,
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index ae17309bd..a76ed0498 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -392,8 +392,6 @@ class MiniCPMModel(nn.Module):
 
 
 class MiniCPMForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 0bdcb21e5..a662db6d2 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -475,8 +475,6 @@ class MixtralModel(nn.Module):
 
 
 class MixtralForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     fall_back_to_pt_during_load = False
 
     packed_modules_mapping = {
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index d288bdd9d..008fceb62 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -232,8 +232,6 @@ class PhiModel(nn.Module):
 
 
 class PhiForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 578e22bea..0bbe93241 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -32,12 +32,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
-from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import ImagePixelData, get_dummy_image_data
 from vllm.sequence import SamplerOutput
 
+from .interfaces import SupportsVision
+
 logger = init_logger(__name__)
 
 _KEYS_TO_MODIFY_MAPPING = {
@@ -317,18 +318,21 @@ def _image_processor(
 
 @MULTIMODAL_REGISTRY.register_image_pixel_input(_image_processor)
 @MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
-class Phi3VForCausalLM(VisionLanguageModelBase):
+class Phi3VForCausalLM(nn.Module, SupportsVision):
 
     def __init__(self,
                  config: PretrainedConfig,
-                 vision_language_config: VisionLanguageConfig,
+                 vlm_config: VisionLanguageConfig,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None) -> None:
-        super().__init__(vision_language_config)
+        super().__init__()
+
         self.config = config
+        self.vlm_config = vlm_config
+
         self.model = LlamaModel(config, cache_config, quant_config)
         self.vision_embed_tokens = Phi3HDImageEmbedding(
-            vision_language_config, config, self.model.embed_tokens)
+            vlm_config, config, self.model.embed_tokens)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
@@ -338,7 +342,7 @@ class Phi3VForCausalLM(VisionLanguageModelBase):
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
 
-        expected_input_type = self.vision_language_config.image_input_type
+        expected_input_type = self.vlm_config.image_input_type
         ImageInputType = VisionLanguageConfig.ImageInputType
 
         if expected_input_type != ImageInputType.PIXEL_VALUES:
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index d351adcef..e2d725af6 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -266,8 +266,6 @@ class Qwen2Model(nn.Module):
 
 
 class Qwen2ForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py
index 639c3443b..b61721999 100644
--- a/vllm/model_executor/models/xverse.py
+++ b/vllm/model_executor/models/xverse.py
@@ -269,8 +269,6 @@ class XverseModel(nn.Module):
 
 
 class XverseForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
-- 
GitLab


From 3fd02bda51ee7cf07e0375994ac1f34b6d1b981b Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 27 Jun 2024 10:07:07 -0700
Subject: [PATCH 172/376] [doc][misc] add note for Kubernetes users (#5916)

---
 docs/source/serving/env_vars.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/serving/env_vars.rst b/docs/source/serving/env_vars.rst
index c665b60b4..ff2259c0d 100644
--- a/docs/source/serving/env_vars.rst
+++ b/docs/source/serving/env_vars.rst
@@ -6,6 +6,8 @@ vLLM uses the following environment variables to configure the system:
 .. warning::
     Please note that ``VLLM_PORT`` and ``VLLM_HOST_IP`` set the port and ip for vLLM's **internal usage**. It is not the port and ip for the API server. If you use ``--host $VLLM_HOST_IP`` and ``--port $VLLM_PORT`` to start the API server, it will not work.
 
+    All environment variables used by vLLM are prefixed with ``VLLM_``. **Special care should be taken for Kubernetes users**: please do not name the service as ``vllm``, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because `Kubernetes sets environment variables for each service with the capitalized service name as the prefix <https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables>`_.
+
 .. literalinclude:: ../../../vllm/envs.py
     :language: python
     :start-after: begin-env-vars-definition
-- 
GitLab


From 691e29ecf356d2646f74c622d957ec43dbf95c3a Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Thu, 27 Jun 2024 10:59:33 -0700
Subject: [PATCH 173/376] [BugFix] Fix `MLPSpeculator` handling of
 `num_speculative_tokens` (#5876)

---
 vllm/config.py                                    | 10 +++++++---
 vllm/model_executor/models/mlp_speculator.py      | 15 ++++++++-------
 vllm/transformers_utils/configs/mlp_speculator.py |  3 +++
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 0c4d770e4..119cb982f 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -920,15 +920,19 @@ class SpeculativeConfig:
                 max_logprobs=target_model_config.max_logprobs,
             )
 
-            if (draft_model_config.hf_config.model_type == "mlp_speculator"
+            draft_hf_config = draft_model_config.hf_config
+            if (draft_hf_config.model_type == "mlp_speculator"
                     and target_parallel_config.world_size != 1):
                 # MLPSpeculator TP support will be added very soon
                 raise ValueError(
                     "Speculative decoding with mlp_speculator models does not "
                     "yet support distributed inferencing (TP > 1).")
 
-            n_predict = getattr(draft_model_config.hf_config, "n_predict",
-                                None)
+            if (num_speculative_tokens is not None
+                    and hasattr(draft_hf_config, "num_lookahead_tokens")):
+                draft_hf_config.num_lookahead_tokens = num_speculative_tokens
+
+            n_predict = getattr(draft_hf_config, "n_predict", None)
             if n_predict is not None:
                 if num_speculative_tokens is None:
                     # Default to max value defined in draft model config.
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py
index b18269777..6e6b2d8a7 100644
--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -11,6 +11,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import SamplerOutput
+from vllm.transformers_utils.configs import MLPSpeculatorConfig
 
 
 class MLPSpeculatorLayerNorm(nn.Module):
@@ -48,7 +49,7 @@ class MLPSpeculatorLayerNorm(nn.Module):
 
 class MLPSpeculator(nn.Module):
 
-    def __init__(self, config, **kwargs) -> None:
+    def __init__(self, config: MLPSpeculatorConfig, **kwargs) -> None:
         super().__init__()
         self.n_predict = config.n_predict
         self.vocab_size = config.vocab_size
@@ -56,8 +57,7 @@ class MLPSpeculator(nn.Module):
         self.inner_dim = config.inner_dim if config.inner_dim != 0 \
             else config.emb_dim
 
-        self.max_speculative_tokens = getattr(config, "max_speculative_tokens",
-                                              self.n_predict)
+        self.max_speculative_tokens = config.num_lookahead_tokens
 
         self.emb = nn.ModuleList([
             VocabParallelEmbedding(config.vocab_size,
@@ -137,7 +137,8 @@ class MLPSpeculator(nn.Module):
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
-            param = params_dict[name.replace("speculator.", "")]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
-            weight_loader(param, loaded_weight)
+            param = params_dict.get(name.replace("speculator.", ""))
+            if param is not None:
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
diff --git a/vllm/transformers_utils/configs/mlp_speculator.py b/vllm/transformers_utils/configs/mlp_speculator.py
index dd1d92b86..e1c1f4a96 100644
--- a/vllm/transformers_utils/configs/mlp_speculator.py
+++ b/vllm/transformers_utils/configs/mlp_speculator.py
@@ -35,6 +35,7 @@ class MLPSpeculatorConfig(PretrainedConfig):
                 candidate tree.
                 For each candidate branch in the tree, head n produces topk[n]
                 additional sub-branches.
+                NOTE: This parameter is currently unused.
             n_candidates: int
                 number of child candidates to create per sequence
         """
@@ -47,4 +48,6 @@ class MLPSpeculatorConfig(PretrainedConfig):
         self.n_predict = n_predict
         self.top_k_tokens_per_head = top_k_tokens_per_head
         self.n_candidates = n_candidates
+        self.num_lookahead_tokens = n_predict
+
         super().__init__(**kwargs)
-- 
GitLab


From 365791ff81181d0d08b719cc6ff976889f6e3288 Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Thu, 27 Jun 2024 11:31:11 -0700
Subject: [PATCH 174/376] [BugFix] Fix `min_tokens` behaviour for multiple eos
 tokens (#5849)

---
 vllm/engine/llm_engine.py |  7 ++-----
 vllm/sampling_params.py   | 29 +++++++++++++++++++++--------
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 0ad957ef9..4b427b1fb 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -606,12 +606,9 @@ class LLMEngine:
         # Defensive copy of SamplingParams, which are used by the sampler,
         # this doesn't deep-copy LogitsProcessor objects
         sampling_params = sampling_params.clone()
-        # Add the eos token id into the sampling_params to support min_tokens
-        # processing
-        if seq.eos_token_id is not None:
-            sampling_params.all_stop_token_ids.add(seq.eos_token_id)
+
         sampling_params.update_from_generation_config(
-            self.generation_config_fields)
+            self.generation_config_fields, seq.eos_token_id)
 
         # Create the sequence group.
         seq_group = SequenceGroup(
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 9d8a36135..a2caae21a 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -280,17 +280,30 @@ class SamplingParams:
                              f"Got {self.best_of}.")
 
     def update_from_generation_config(
-            self, generation_config: Dict[str, Any]) -> None:
+            self,
+            generation_config: Dict[str, Any],
+            model_eos_token_id: Optional[int] = None) -> None:
         """Update if there are non-default values from generation_config"""
+
+        if model_eos_token_id is not None:
+            # Add the eos token id into the sampling_params to support
+            # min_tokens processing.
+            self.all_stop_token_ids.add(model_eos_token_id)
+
         # Update eos_token_id for generation
-        if (not self.ignore_eos) and (eos_ids :=
-                                      generation_config.get("eos_token_id")):
+        if (eos_ids := generation_config.get("eos_token_id")) is not None:
             # it can be either int or list of int
-            if isinstance(eos_ids, int):
-                eos_ids = [eos_ids]
-            original_stop_token_ids = set(self.stop_token_ids)
-            original_stop_token_ids.update(eos_ids)
-            self.stop_token_ids = list(original_stop_token_ids)
+            eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids)
+            if model_eos_token_id is not None:
+                # We don't need to include the primary eos_token_id in
+                # stop_token_ids since it's handled separately for stopping
+                # purposes.
+                eos_ids.discard(model_eos_token_id)
+            if eos_ids:
+                self.all_stop_token_ids.update(eos_ids)
+                if not self.ignore_eos:
+                    eos_ids.update(self.stop_token_ids)
+                    self.stop_token_ids = list(eos_ids)
 
     @cached_property
     def sampling_type(self) -> SamplingType:
-- 
GitLab


From 736ed388492c5c10deb7522637a94c041f163f48 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Thu, 27 Jun 2024 11:43:04 -0700
Subject: [PATCH 175/376] [CI/Build] Fix Args for `_get_logits_warper` in
 Sampler Test (#5922)

---
 tests/samplers/test_sampler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py
index 0aabde6aa..9572588ce 100644
--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -587,7 +587,7 @@ def test_sampler_top_k_top_p(seed: int, device: str):
     generation_config = GenerationConfig(top_k=top_k,
                                          top_p=top_p,
                                          do_sample=True)
-    warpers = generation_model._get_logits_warper(generation_config)
+    warpers = generation_model._get_logits_warper(generation_config, device)
     assert len(warpers) == 2  # top_p and top_k
 
     seq_group_metadata_list: List[SequenceGroupMetadata] = []
-- 
GitLab


From 79c92c7c8aa6a881421e2007ab216a819f61bc9b Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Thu, 27 Jun 2024 13:33:56 -0700
Subject: [PATCH 176/376] [Model] Add Gemma 2 (#5908)

---
 docs/source/models/supported_models.rst       |   4 +
 requirements-common.txt                       |   2 +-
 vllm/config.py                                |  30 +-
 vllm/lora/layers.py                           |   4 +
 vllm/model_executor/layers/layernorm.py       |  46 ++
 .../model_executor/layers/logits_processor.py |  10 +-
 .../model_executor/layers/rotary_embedding.py |  10 +
 vllm/model_executor/models/__init__.py        |   1 +
 vllm/model_executor/models/gemma2.py          | 401 ++++++++++++++++++
 9 files changed, 499 insertions(+), 9 deletions(-)
 create mode 100644 vllm/model_executor/models/gemma2.py

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 47737ae52..544322582 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -55,6 +55,10 @@ Alongside each architecture, we include some popular models that use it.
     - Gemma
     - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc.
     - ✅︎
+  * - :code:`Gemma2ForCausalLM`
+    - Gemma2
+    - :code:`google/gemma-2-9b`, :code:`google/gemma-2-27b`, etc.
+    - ✅︎
   * - :code:`GPT2LMHeadModel`
     - GPT-2
     - :code:`gpt2`, :code:`gpt2-xl`, etc.
diff --git a/requirements-common.txt b/requirements-common.txt
index 05969cfa5..636f85343 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -6,7 +6,7 @@ numpy < 2.0.0
 requests
 tqdm
 py-cpuinfo
-transformers >= 4.40.0  # Required for StarCoder2 & Llava, Llama 3.
+transformers >= 4.42.0  # Required for Gemma 2.
 tokenizers >= 0.19.1  # Required for Llama 3.
 fastapi
 aiohttp
diff --git a/vllm/config.py b/vllm/config.py
index 119cb982f..9a98a7fbc 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -14,7 +14,7 @@ from vllm.model_executor.models import ModelRegistry
 from vllm.tracing import is_otel_installed
 from vllm.transformers_utils.config import get_config, get_hf_text_config
 from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu,
-                        is_hip, is_neuron, is_tpu, is_xpu,
+                        is_hip, is_neuron, is_tpu, is_xpu, print_warning_once,
                         update_environment_variables)
 
 if TYPE_CHECKING:
@@ -141,6 +141,17 @@ class ModelConfig:
                                     code_revision, rope_scaling, rope_theta)
         self.hf_text_config = get_hf_text_config(self.hf_config)
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
+
+        if (not self.disable_sliding_window
+                and self.hf_text_config.model_type == "gemma2"
+                and self.hf_text_config.sliding_window is not None):
+            print_warning_once(
+                "Gemma 2 uses sliding window attention for every odd layer, "
+                "which is currently not supported by vLLM. Disabling sliding "
+                "window and capping the max length to the sliding window size "
+                f"({self.hf_text_config.sliding_window}).")
+            self.disable_sliding_window = True
+
         self.max_model_len = _get_and_verify_max_len(
             hf_config=self.hf_text_config,
             max_model_len=max_model_len,
@@ -257,8 +268,7 @@ class ModelConfig:
                 "BitAndBytes quantization with TP or PP is not supported yet.")
 
     def get_hf_config_sliding_window(self) -> Optional[int]:
-        """Get the sliding window size, or None if disabled.
-        """
+        """Get the sliding window size, or None if disabled."""
 
         # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in
         # addition to sliding window size. We check if that field is present
@@ -1256,10 +1266,16 @@ def _get_and_verify_dtype(
         dtype = dtype.lower()
         if dtype == "auto":
             if config_dtype == torch.float32:
-                # Following the common practice, we use float16 for float32
-                # models.
-                logger.info("Casting torch.float32 to torch.float16.")
-                torch_dtype = torch.float16
+                if config.model_type == "gemma2":
+                    logger.info(
+                        "For Gemma 2, we downcast float32 to bfloat16 instead "
+                        "of float16 by default. Please specify `dtype` if you "
+                        "want to use float16.")
+                    torch_dtype = torch.bfloat16
+                else:
+                    # Following the common practice, we use float16 for float32
+                    # models.
+                    torch_dtype = torch.float16
             else:
                 torch_dtype = config_dtype
         else:
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index e4a23273f..2fddfccaf 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -1069,6 +1069,10 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
     def scale(self):
         return self.base_layer.scale
 
+    @property
+    def soft_cap(self):
+        return self.base_layer.soft_cap
+
     @property
     def org_vocab_size(self):
         return self.base_layer.org_vocab_size
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index 14f5e2378..7a8699e39 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -95,3 +95,49 @@ class RMSNorm(CustomOp):
         s = f"hidden_size={self.weight.data.size(0)}"
         s += f", eps={self.variance_epsilon}"
         return s
+
+
+class GemmaRMSNorm(CustomOp):
+    """RMS normalization for Gemma.
+
+    Two differences from the above RMSNorm:
+        1. x * (1 + w) instead of x * w.
+        2. (x * w).to(orig_dtype) instead of x.to(orig_dtype) * w.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.zeros(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward_native(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """PyTorch-native implementation equivalent to forward()."""
+        orig_dtype = x.dtype
+        if residual is not None:
+            x = x + residual
+            residual = x
+
+        x = x.float()
+        variance = x.pow(2).mean(dim=-1, keepdim=True)
+        x = x * torch.rsqrt(variance + self.variance_epsilon)
+        # Llama does x.to(float16) * w whilst Gemma is (x * w).to(float16)
+        # See https://github.com/huggingface/transformers/pull/29402
+        x = x * (1.0 + self.weight.float())
+        x = x.to(orig_dtype)
+        return x if residual is None else (x, residual)
+
+    def forward_cuda(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        # TODO(woosuk): Implement an optimized kernel for GemmaRMSNorm.
+        return self.forward_native(x, residual)
diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py
index 7eee59947..8062bfb51 100644
--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -22,7 +22,8 @@ class LogitsProcessor(nn.Module):
                  vocab_size: int,
                  org_vocab_size: Optional[int] = None,
                  scale: float = 1.0,
-                 logits_as_input: bool = False) -> None:
+                 logits_as_input: bool = False,
+                 soft_cap: Optional[float] = None) -> None:
         """
         Args:
             scale: A scaling factor to apply to the logits.
@@ -34,6 +35,8 @@ class LogitsProcessor(nn.Module):
         self.logits_as_input = logits_as_input
         # original vocabulary size (without LoRA).
         self.org_vocab_size = org_vocab_size or vocab_size
+        # Soft cap the logits. Used in Gemma 2.
+        self.soft_cap = soft_cap
 
     def forward(
         self,
@@ -52,6 +55,11 @@ class LogitsProcessor(nn.Module):
             logits = self._get_logits(hidden_states, embedding, embedding_bias)
 
         if logits is not None:
+            if self.soft_cap is not None:
+                logits = logits / self.soft_cap
+                logits = torch.tanh(logits)
+                logits = logits * self.soft_cap
+
             if self.scale != 1.0:
                 logits *= self.scale
 
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index a0b19046b..9e53deef0 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -610,6 +610,16 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
         return query.flatten(-2), key.flatten(-2)
 
 
+class GemmaRotaryEmbedding(RotaryEmbedding):
+
+    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+        # https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/gemma/modeling_gemma.py#L107
+        inv_freq = 1.0 / (base**(
+            torch.arange(0, self.rotary_dim, 2, dtype=torch.int64).float() /
+            self.rotary_dim))
+        return inv_freq
+
+
 _ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {}
 
 
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 5afb2e1d4..e7ced618c 100755
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -23,6 +23,7 @@ _GENERATION_MODELS = {
     "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
     "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
     "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
+    "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"),
     "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
     "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"),
     "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"),
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
new file mode 100644
index 000000000..4e35a9ec3
--- /dev/null
+++ b/vllm/model_executor/models/gemma2.py
@@ -0,0 +1,401 @@
+# coding=utf-8
+# Copyright 2024 The vLLM team.
+# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Iterable, List, Optional, Set, Tuple
+
+import torch
+from torch import nn
+from transformers import Gemma2Config
+
+from vllm.attention import Attention, AttentionMetadata
+from vllm.config import CacheConfig, LoRAConfig
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.activation import GeluAndMul
+from vllm.model_executor.layers.layernorm import GemmaRMSNorm
+from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+from vllm.model_executor.layers.rotary_embedding import GemmaRotaryEmbedding
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import SamplerOutput
+from vllm.utils import print_warning_once
+
+from .interfaces import SupportsLoRA
+
+
+class Gemma2MLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        hidden_activation: str,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size, [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config)
+        self.down_proj = RowParallelLinear(intermediate_size,
+                                           hidden_size,
+                                           bias=False,
+                                           quant_config=quant_config)
+        if not (hidden_act == hidden_activation == "gelu_pytorch_tanh"):
+            raise ValueError(
+                "Gemma2 uses `gelu_pytorch_tanh` as the hidden activation "
+                "function. Please set `hidden_act` and `hidden_activation` to "
+                "`gelu_pytorch_tanh`.")
+        self.act_fn = GeluAndMul(approximate="tanh")
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Gemma2Attention(nn.Module):
+
+    def __init__(self,
+                 layer_idx: int,
+                 config: Gemma2Config,
+                 hidden_size: int,
+                 num_heads: int,
+                 num_kv_heads: int,
+                 head_dim: int,
+                 max_position_embeddings: int,
+                 rope_theta: float,
+                 cache_config: Optional[CacheConfig] = None,
+                 quant_config: Optional[QuantizationConfig] = None) -> None:
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.config = config
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = head_dim
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = config.query_pre_attn_scalar**-0.5
+        self.rope_theta = rope_theta
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+        )
+        # TODO(woosuk): Use the `get_rope` interface.
+        self.rotary_emb = GemmaRotaryEmbedding(
+            self.head_dim,
+            self.head_dim,
+            max_position_embeddings,
+            base=self.rope_theta,
+            is_neox_style=True,
+            dtype=torch.get_default_dtype(),
+        )
+
+        if self.config.attn_logit_softcapping is not None:
+            print_warning_once(
+                "Gemma 2 normally uses attention logit soft-capping; "
+                "soft-capping is currently incompatible with the flash "
+                "attention kernels, so vLLM removes it to enable speed and "
+                "efficiency gains of flash attention.")
+        # FIXME(woosuk): While Gemma 2 uses sliding window attention for every
+        # odd layer, vLLM currently ignores it and uses global attention for
+        # all layers.
+        use_sliding_window = (layer_idx % 2 == 1
+                              and config.sliding_window is not None)
+        del use_sliding_window  # Unused.
+        self.attn = Attention(self.num_heads,
+                              self.head_dim,
+                              self.scaling,
+                              num_kv_heads=self.num_kv_heads,
+                              cache_config=cache_config,
+                              quant_config=quant_config)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Gemma2DecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        layer_idx: int,
+        config: Gemma2Config,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Gemma2Attention(
+            layer_idx=layer_idx,
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            head_dim=config.head_dim,
+            max_position_embeddings=config.max_position_embeddings,
+            rope_theta=config.rope_theta,
+            cache_config=cache_config,
+            quant_config=quant_config,
+        )
+        self.hidden_size = config.hidden_size
+        self.mlp = Gemma2MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            hidden_activation=config.hidden_activation,
+            quant_config=quant_config,
+        )
+        self.input_layernorm = GemmaRMSNorm(config.hidden_size,
+                                            eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GemmaRMSNorm(config.hidden_size,
+                                                     eps=config.rms_norm_eps)
+        self.pre_feedforward_layernorm = GemmaRMSNorm(config.hidden_size,
+                                                      eps=config.rms_norm_eps)
+        self.post_feedforward_layernorm = GemmaRMSNorm(config.hidden_size,
+                                                       eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(
+                hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            attn_metadata=attn_metadata,
+        )
+        hidden_states = self.post_attention_layernorm(hidden_states)
+
+        hidden_states, residual = self.pre_feedforward_layernorm(
+            hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        return hidden_states, residual
+
+
+class Gemma2Model(nn.Module):
+
+    def __init__(
+        self,
+        config: Gemma2Config,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = nn.ModuleList([
+            Gemma2DecoderLayer(layer_idx, config, cache_config, quant_config)
+            for layer_idx in range(config.num_hidden_layers)
+        ])
+        self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        # Normalize the embedding by sqrt(hidden_size)
+        # The normalizer's data type should be downcasted to the model's
+        # data type such as bfloat16, not float32.
+        # See https://github.com/huggingface/transformers/pull/29402
+        normalizer = self.config.hidden_size**0.5
+        self.register_buffer("normalizer", torch.tensor(normalizer))
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+        hidden_states *= self.normalizer
+
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                kv_caches[i],
+                attn_metadata,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class Gemma2ForCausalLM(nn.Module, SupportsLoRA):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+    ]
+    # Gemma does not apply LoRA to the embedding layer.
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    def __init__(
+        self,
+        config: Gemma2Config,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        lora_config: Optional[LoRAConfig] = None,
+    ) -> None:
+        del lora_config  # Unused.
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Gemma2Model(config, cache_config, quant_config)
+        self.logits_processor = LogitsProcessor(
+            config.vocab_size, soft_cap=config.final_logit_softcapping)
+        self.sampler = Sampler()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, kv_caches,
+                                   attn_metadata)
+        return hidden_states
+
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        logits = self.logits_processor(self.model.embed_tokens.weight,
+                                       hidden_states, sampling_metadata)
+        return logits
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+        for name, loaded_weight in weights:
+            for (param_name, shard_name, shard_id) in stacked_params_mapping:
+                if shard_name not in name:
+                    continue
+                name = name.replace(shard_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # lm_head is not used in vllm as it is tied with embed_token.
+                # To prevent errors, skip loading lm_head.weight.
+                if "lm_head.weight" in name:
+                    continue
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+
+        unloaded_params = params_dict.keys() - loaded_params
+        if unloaded_params:
+            raise RuntimeError(
+                "Some weights are not initialized from checkpoints: "
+                f"{unloaded_params}")
-- 
GitLab


From 64e8d2a783ac976f1b8e84a795f6a607820d6485 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 27 Jun 2024 13:34:55 -0700
Subject: [PATCH 177/376] [core][misc] remove logical block (#5882)

---
 vllm/block.py                 | 82 +----------------------------------
 vllm/core/block_manager_v1.py | 19 ++++----
 vllm/sequence.py              | 35 +++------------
 3 files changed, 16 insertions(+), 120 deletions(-)

diff --git a/vllm/block.py b/vllm/block.py
index bd00c07ad..0b8ef7d4b 100644
--- a/vllm/block.py
+++ b/vllm/block.py
@@ -1,90 +1,10 @@
 """Token blocks."""
-import weakref
-from collections import defaultdict
-from typing import Dict, List
+from typing import List
 
 from vllm.utils import Device
 
-_BLANK_TOKEN_ID = -1
-
 DEFAULT_LAST_ACCESSED_TIME = -1
 
-TokensBlock = List[int]
-
-
-class BlockPool:
-    """A pool of logical blocks.
-    When requests come, we create a lot of logical blocks;
-    when requests are done, we destroy a lot of logical blocks.
-    It turns out that creating and destroying logical blocks can be expensive,
-    especially for the `token_ids` field, which is a list of integers.
-    To avoid this overhead, we use a pool to manage the logical blocks.
-    When an old request is done and a new request comes, we can reuse the
-    logical blocks from the old request to feed the new request.
-    """
-
-    def __init__(self) -> None:
-        # block size to list of token blocks
-        self.pool: Dict[int, List[TokensBlock]] = defaultdict(list)
-
-    def alloc_block(self, block_size: int) -> TokensBlock:
-        if block_size in self.pool and self.pool[block_size]:
-            return self.pool[block_size].pop()
-        return [_BLANK_TOKEN_ID] * block_size
-
-    def del_block(self, block: TokensBlock) -> None:
-        self.pool[len(block)].append(block)
-
-
-_BLOCK_POOL = BlockPool()
-
-
-class LogicalTokenBlock:
-    """A block that stores a contiguous chunk of tokens from left to right.
-
-    Logical blocks are used to represent the states of the corresponding
-    physical blocks in the KV cache.
-    """
-
-    def __init__(
-        self,
-        block_number: int,
-        block_size: int,
-    ) -> None:
-        self.block_number = block_number
-        self.block_size = block_size
-
-        self.token_ids = _BLOCK_POOL.alloc_block(block_size)
-        # this finalizer is used to return the block to the pool when the object is deleted # noqa
-        # NOTE: don't use __del__ because it cannot guarantee the order of finalization, # noqa
-        # i.e. `self.token_ids` may be deleted before `self`, and we lose
-        #  the opportunity to return the block to the pool
-        self._finalizer = weakref.finalize(self, _BLOCK_POOL.del_block,
-                                           self.token_ids)
-        self.num_tokens = 0
-
-    def is_empty(self) -> bool:
-        return self.num_tokens == 0
-
-    def get_num_empty_slots(self) -> int:
-        return self.block_size - self.num_tokens
-
-    def is_full(self) -> bool:
-        return self.num_tokens == self.block_size
-
-    def append_tokens(self, token_ids: List[int]) -> None:
-        assert len(token_ids) <= self.get_num_empty_slots()
-        curr_idx = self.num_tokens
-        self.token_ids[curr_idx:curr_idx + len(token_ids)] = token_ids
-        self.num_tokens += len(token_ids)
-
-    def get_token_ids(self) -> List[int]:
-        return self.token_ids[:self.num_tokens]
-
-    def get_last_token_id(self) -> int:
-        assert self.num_tokens > 0
-        return self.token_ids[self.num_tokens - 1]
-
 
 class PhysicalTokenBlock:
     """Represents the state of a block in the KV cache."""
diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index 4010aaf02..995ea04a5 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -262,8 +262,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
         self.cross_block_tables: Dict[str, BlockTable] = {}
 
     def _get_seq_num_required_blocks(self, seq: Sequence) -> int:
-        return 0 if seq is None \
-            else len(seq.logical_token_blocks)
+        return 0 if seq is None else seq.n_blocks
 
     def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
         # FIXME(woosuk): Here we assume that all sequences in the group share
@@ -298,7 +297,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
                            ref_count: int, \
                            is_encoder_decoder: bool = True) -> BlockTable:
         # Allocate new physical token blocks that will store the prompt tokens.
-        num_prompt_blocks = len(seq.logical_token_blocks)
+        num_prompt_blocks = seq.n_blocks
 
         block_table: BlockTable = []
         for logical_idx in range(num_prompt_blocks):
@@ -367,7 +366,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
 
         # Compute a new hash for the block so that it can be shared by other
         # Sequences
-        new_hash = seq.hash_of_block(len(seq.logical_token_blocks) - 1)
+        new_hash = seq.hash_of_block(seq.n_blocks - 1)
 
         # if new_hash is already in the cached table, then free last_block
         # and return the cached version
@@ -407,10 +406,10 @@ class BlockSpaceManagerV1(BlockSpaceManager):
         if not self.enable_caching:
             return self.gpu_allocator.allocate()
         block_hash: Optional[int] = None
+        n_blocks = seq.n_blocks
         if (self._is_last_block_full(seq)):
-            block_hash = seq.hash_of_block(len(seq.logical_token_blocks) - 1)
-        num_hashed_tokens = seq.num_hashed_tokens_of_block(
-            len(seq.logical_token_blocks) - 1)
+            block_hash = seq.hash_of_block(n_blocks - 1)
+        num_hashed_tokens = seq.num_hashed_tokens_of_block(n_blocks - 1)
 
         # num_hashed_tokens is used to compute future hashes
         # (e.g. in the hashing function, it is used to ask the sequence for
@@ -429,12 +428,12 @@ class BlockSpaceManagerV1(BlockSpaceManager):
         num_lookahead_slots: int = 0,
     ) -> List[Tuple[int, int]]:
         """Allocate a physical slot for a new token."""
-        logical_blocks = seq.logical_token_blocks
+        n_blocks = seq.n_blocks
         block_table = self.block_tables[seq.seq_id]
         # If we need to allocate a new physical block
-        if len(block_table) < len(logical_blocks):
+        if len(block_table) < n_blocks:
             # Currently this code only supports adding one physical block
-            assert len(block_table) == len(logical_blocks) - 1
+            assert len(block_table) == n_blocks - 1
 
             if (self.block_sliding_window
                     and len(block_table) >= self.block_sliding_window):
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 0925d1546..c618c3692 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -1,13 +1,13 @@
 """Sequence and its related classes."""
 import copy
 import enum
+import math
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import torch
 
-from vllm.block import LogicalTokenBlock
 from vllm.inputs import LLMInputs
 from vllm.lora.request import LoRARequest
 from vllm.pooling_params import PoolingParams
@@ -236,9 +236,6 @@ class Sequence:
         self.output_logprobs: SampleLogprobs = []
         self.output_text = ""
 
-        self.logical_token_blocks: List[LogicalTokenBlock] = []
-        # Initialize the logical token blocks with the prompt token ids.
-        self._append_tokens_to_blocks(self.prompt_token_ids)
         self.status = SequenceStatus.WAITING
         self.stop_reason: Union[int, str, None] = None
 
@@ -248,6 +245,10 @@ class Sequence:
         # Input + output tokens
         self.tokens: Optional[List[str]] = None
 
+    @property
+    def n_blocks(self) -> int:
+        return math.ceil(self.get_len() / self.block_size)
+
     @property
     def prompt(self) -> Optional[str]:
         return self.inputs.get("prompt")
@@ -287,36 +288,12 @@ class Sequence:
         """Reset the sequence states for recomputation."""
         self.data.reset_state_for_recompute()
 
-    def _append_logical_block(self) -> None:
-        block = LogicalTokenBlock(
-            block_number=len(self.logical_token_blocks),
-            block_size=self.block_size,
-        )
-        self.logical_token_blocks.append(block)
-
-    def _append_tokens_to_blocks(self, token_ids: List[int]) -> None:
-        cursor = 0
-        while cursor < len(token_ids):
-            if not self.logical_token_blocks:
-                self._append_logical_block()
-
-            last_block = self.logical_token_blocks[-1]
-            if last_block.is_full():
-                self._append_logical_block()
-                last_block = self.logical_token_blocks[-1]
-
-            num_empty_slots = last_block.get_num_empty_slots()
-            last_block.append_tokens(token_ids[cursor:cursor +
-                                               num_empty_slots])
-            cursor += num_empty_slots
-
     def append_token_id(
         self,
         token_id: int,
         logprobs: Dict[int, Logprob],
     ) -> None:
         assert token_id in logprobs
-        self._append_tokens_to_blocks([token_id])
         self.output_logprobs.append(logprobs)
         self.data.append_token_id(token_id, logprobs[token_id].logprob)
 
@@ -388,7 +365,7 @@ class Sequence:
     def __repr__(self) -> str:
         return (f"Sequence(seq_id={self.seq_id}, "
                 f"status={self.status.name}, "
-                f"num_blocks={len(self.logical_token_blocks)})")
+                f"num_blocks={self.n_blocks}, ")
 
 
 @dataclass
-- 
GitLab


From c3dde367f16111b8968948a1f8e1a26bdac6ffdd Mon Sep 17 00:00:00 2001
From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com>
Date: Thu, 27 Jun 2024 15:41:08 -0500
Subject: [PATCH 178/376] [Kernel][ROCm][AMD] fused_moe Triton configs v2 for
 mi300X (#5932)

---
 ...14336,device_name=AMD_Instinct_MI300X.json | 164 +++++++++++-----
 ...=1792,device_name=AMD_Instinct_MI300X.json | 182 +++++++++++++-----
 ...=3584,device_name=AMD_Instinct_MI300X.json | 172 ++++++++++++-----
 ...=7168,device_name=AMD_Instinct_MI300X.json | 176 ++++++++++++-----
 4 files changed, 500 insertions(+), 194 deletions(-)
 mode change 100644 => 100755 vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json

diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json
index 93472eb08..6a976788f 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json
@@ -1,128 +1,200 @@
 {
     "1": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "2": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "4": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 64,
-        "num_stages": 1
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "8": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 32,
-        "num_stages": 1
+        "GROUP_SIZE_M": 1,
+        "num_warps": 1,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "16": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 1
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "24": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 64,
-        "num_stages": 1
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 1,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "32": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 1
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "48": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 0
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "64": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 0
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "96": {
         "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
-        "num_stages": 0
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "128": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 0
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "256": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 0
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "512": {
-        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 0
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "1024": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 32,
+        "kpack": 2
     },
     "1536": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "2048": {
         "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "3072": {
         "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "4096": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     }
 }
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json
old mode 100644
new mode 100755
index 5bd9d71e8..0a46390b2
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json
@@ -1,110 +1,200 @@
 {
     "1": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "2": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 32
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "4": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 8
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "8": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "16": {
-        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "24": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1
-    },
-    "32": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 16,
         "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 8
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "48": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 8
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "64": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "96": {
         "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 8
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "128": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "256": {
-        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "512": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "1024": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "1536": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "2048": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "3072": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "4096": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     }
 }
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json
index 02e66280c..91011e64c 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json
@@ -1,128 +1,200 @@
 {
     "1": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 0
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "2": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "4": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 32,
-        "num_stages": 1
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "8": {
         "BLOCK_SIZE_M": 16,
         "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 1
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "16": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
-        "num_stages": 1
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "24": {
         "BLOCK_SIZE_M": 16,
         "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 1
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "32": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_stages": 0
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "48": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 16,
-        "num_stages": 1
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "64": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 32,
-        "num_stages": 0
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "96": {
         "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_stages": 0
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "128": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 0
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "256": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 0
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "512": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 32,
+        "kpack": 2
     },
     "1024": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "1536": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "2048": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "3072": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "4096": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     }
 }
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json
index 34c3b593d..f807d4a5a 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json
@@ -1,128 +1,200 @@
 {
     "1": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "2": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
         "GROUP_SIZE_M": 1,
-        "num_stages": 1
+        "num_warps": 4,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "4": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 32,
-        "num_stages": 1
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "8": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 1
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "16": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 1
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "24": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 1
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "32": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 16,
-        "num_stages": 0
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "48": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 16,
-        "num_stages": 0
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "64": {
         "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 1
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "96": {
         "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 0
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "128": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 0
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "256": {
         "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 0
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 32,
+        "kpack": 2
     },
     "512": {
-        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 8,
-        "num_stages": 0
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "1024": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "1536": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "2048": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     },
     "3072": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
     },
     "4096": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
-        "num_stages": 0
+        "num_warps": 8,
+        "num_stages": 0,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
     }
 }
-- 
GitLab


From f136da15e154b25c7eb3221772a85a15811f0318 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Thu, 27 Jun 2024 21:12:13 -0700
Subject: [PATCH 179/376] [Hardware][TPU] Optimize KV cache swapping (#5878)

---
 vllm/attention/backends/pallas.py | 16 +++---------
 vllm/worker/tpu_worker.py         | 42 +++++++++++++++++++++++--------
 2 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index 5dec11e2e..22cb1a1bd 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -28,21 +28,13 @@ class PallasAttentionBackend(AttentionBackend):
     ) -> Tuple[int, ...]:
         return (num_kv_heads, num_blocks, block_size, head_size)
 
-    @torch.compile(backend="openxla")
     @staticmethod
     def swap_blocks(
-        src_kv_cache: Tuple[torch.Tensor, torch.Tensor],
-        dst_kv_cache: Tuple[torch.Tensor, torch.Tensor],
-        src_to_dst: Tuple[torch.Tensor, torch.Tensor],
+        src_kv_cache: torch.Tensor,
+        dst_kv_cache: torch.Tensor,
+        src_to_dst: torch.Tensor,
     ) -> None:
-        src_k_cache, src_v_cache = src_kv_cache
-        dst_k_cache, dst_v_cache = dst_kv_cache
-        src_indices, dst_indices = src_to_dst
-        device = dst_k_cache.device
-        torch.ops.xla.dynamo_set_buffer_donor_(dst_k_cache, True)
-        torch.ops.xla.dynamo_set_buffer_donor_(dst_v_cache, True)
-        dst_k_cache[:, dst_indices] = src_k_cache[:, src_indices].to(device)
-        dst_v_cache[:, dst_indices] = src_v_cache[:, src_indices].to(device)
+        raise RuntimeError("swap_blocks is not used for the TPU backend.")
 
     @torch.compile(backend="openxla")
     @staticmethod
diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index 28f460c31..37d810e83 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -3,6 +3,7 @@ from typing import List, Optional, Tuple, Union
 
 import torch
 import torch_xla.core.xla_model as xm
+import torch_xla.experimental.dynamo_set_buffer_donor  # noqa: F401
 import torch_xla.runtime as xr
 
 import vllm.envs as envs
@@ -152,8 +153,8 @@ class TPUWorker(LoraNotSupportedWorkerBase):
         num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config)
         head_size = self.model_config.get_head_size()
 
-        self.cpu_cache = []
-        self.tpu_cache = []
+        self.cpu_cache: List[Tuple[torch.Tensor, torch.Tensor]] = []
+        self.tpu_cache: List[Tuple[torch.Tensor, torch.Tensor]] = []
         tpu_cache_shape = self.model_runner.attn_backend.get_kv_cache_shape(
             num_gpu_blocks, self.block_size, num_kv_heads, head_size)
         cpu_cache_shape = self.model_runner.attn_backend.get_kv_cache_shape(
@@ -227,18 +228,25 @@ class TPUWorker(LoraNotSupportedWorkerBase):
 
         if blocks_to_swap_in:
             # Swap from CPU to TPU.
-            src_to_dst = _make_src_to_dst(blocks_to_swap_in, "cpu",
-                                          self.device)
+            src_indices, dst_indices = _make_src_to_dst(
+                blocks_to_swap_in, "cpu", self.device)
             for i in range(num_layers):
-                attn_backend.swap_blocks(self.cpu_cache[i], self.tpu_cache[i],
-                                         src_to_dst)
+                tpu_k_cache, tpu_v_cache = self.tpu_cache[i]
+                cpu_k_cache, cpu_v_cache = self.cpu_cache[i]
+                k = cpu_k_cache[:, src_indices].to(self.device)
+                v = cpu_v_cache[:, src_indices].to(self.device)
+                _insert_kv(k, v, dst_indices, tpu_k_cache, tpu_v_cache)
+
         if blocks_to_swap_out:
             # Swap from TPU to CPU.
-            src_to_dst = _make_src_to_dst(blocks_to_swap_out, self.device,
-                                          "cpu")
+            src_indices, dst_indices = _make_src_to_dst(
+                blocks_to_swap_out, self.device, "cpu")
             for i in range(num_layers):
-                attn_backend.swap_blocks(self.tpu_cache[i], self.cpu_cache[i],
-                                         src_to_dst)
+                tpu_k_cache, tpu_v_cache = self.tpu_cache[i]
+                cpu_k_cache, cpu_v_cache = self.cpu_cache[i]
+                cpu_k_cache[:, dst_indices] = tpu_k_cache[:, src_indices].cpu()
+                cpu_v_cache[:, dst_indices] = tpu_v_cache[:, src_indices].cpu()
+
         if blocks_to_copy:
             src_to_dst = _make_src_to_dst(blocks_to_copy, self.device,
                                           self.device)
@@ -267,3 +275,17 @@ def _make_src_to_dst(
                                device=dst_device,
                                dtype=torch.int64)
     return src_indices, dst_indices
+
+
+@torch.compile(backend="openxla")
+def _insert_kv(
+    k: torch.Tensor,
+    v: torch.Tensor,
+    indices: torch.Tensor,
+    tpu_k_cache: torch.Tensor,
+    tpu_v_cache: torch.Tensor,
+) -> None:
+    torch.ops.xla.dynamo_set_buffer_donor_(tpu_k_cache, True)
+    torch.ops.xla.dynamo_set_buffer_donor_(tpu_v_cache, True)
+    tpu_k_cache[:, indices] = k
+    tpu_v_cache[:, indices] = v
-- 
GitLab


From 74d55c065b104f816fca9c177e044415802796a1 Mon Sep 17 00:00:00 2001
From: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com>
Date: Fri, 28 Jun 2024 00:29:13 -0700
Subject: [PATCH 180/376] [VLM][BugFix] Make sure that `multi_modal_kwargs` can
 broadcast properly with ring buffer. (#5905)

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
---
 vllm/distributed/parallel_state.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 1f6b05e86..51616cb0f 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -45,7 +45,7 @@ TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])
 
 
 def _split_tensor_dict(
-        tensor_dict: Dict[Any, Union[torch.Tensor, Any]],
+        tensor_dict: Dict[str, Union[torch.Tensor, Any]],
         prefix: str = "") -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]:
     """Split the tensor dictionary into two parts:
     1. A list of (key, value) pairs. If the value is a tensor, it is replaced
@@ -473,11 +473,11 @@ class GroupCoordinator:
 
     def broadcast_tensor_dict(
         self,
-        tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None,
+        tensor_dict: Optional[Dict[str, Union[torch.Tensor, Any]]] = None,
         src: int = 0,
         group: Optional[ProcessGroup] = None,
         metadata_group: Optional[ProcessGroup] = None
-    ) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]:
+    ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
         """Broadcast the input tensor dictionary.
         NOTE: `src` is the local rank of the source rank.
         """
@@ -558,9 +558,9 @@ class GroupCoordinator:
 
     def send_tensor_dict(
         self,
-        tensor_dict: Dict[Any, Union[torch.Tensor, Any]],
+        tensor_dict: Dict[str, Union[torch.Tensor, Any]],
         dst: Optional[int] = None
-    ) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]:
+    ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
         """Send the input tensor dictionary.
         NOTE: `dst` is the local rank of the source rank.
         """
@@ -599,7 +599,7 @@ class GroupCoordinator:
     def recv_tensor_dict(
         self,
         src: Optional[int] = None
-    ) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]:
+    ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
         """Recv the input tensor dictionary.
         NOTE: `src` is the local rank of the source rank.
         """
@@ -615,7 +615,7 @@ class GroupCoordinator:
         assert src < self.world_size, f"Invalid src rank ({src})"
 
         recv_metadata_list = self.recv_object(src=src)
-        tensor_dict = {}
+        tensor_dict: Dict[str, Any] = {}
         for key, value in recv_metadata_list:
             if isinstance(value, TensorMetadata):
                 tensor = torch.empty(value.size,
@@ -623,7 +623,7 @@ class GroupCoordinator:
                                      device=value.device)
                 if tensor.numel() == 0:
                     # Skip broadcasting empty tensors.
-                    tensor_dict[key] = tensor
+                    _update_nested_dict(tensor_dict, key, tensor)
                     continue
                 if tensor.is_cpu:
                     # use metadata_group for CPU tensors
@@ -633,9 +633,9 @@ class GroupCoordinator:
                 else:
                     # use group for GPU tensors
                     torch.distributed.recv(tensor, src=src, group=group)
-                tensor_dict[key] = tensor
+                _update_nested_dict(tensor_dict, key, tensor)
             else:
-                tensor_dict[key] = value
+                _update_nested_dict(tensor_dict, key, value)
         return tensor_dict
 
     def barrier(self):
-- 
GitLab


From 0d0e3a42ac80eb41fc50f139fe31fde8e0b5bf8e Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Fri, 28 Jun 2024 20:03:41 +0800
Subject: [PATCH 181/376] [Bugfix][Hardware][Intel CPU] Fix unpassed
 multi_modal_kwargs for CPU runner (#5956)

---
 vllm/worker/cpu_model_runner.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index e3464c0d3..148332f34 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -355,6 +355,7 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
             input_positions=input_positions,
             attn_metadata=attn_metadata,
             sampling_metadata=sampling_metadata,
+            multi_modal_kwargs=multi_modal_kwargs,
         )
 
     @torch.inference_mode()
-- 
GitLab


From 5cbe8d155c33a75934782019c06860467a3efa97 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 28 Jun 2024 20:09:56 +0800
Subject: [PATCH 182/376] [Core] Registry for processing model inputs (#5214)

Co-authored-by: ywang96 <ywang@roblox.com>
---
 .../input_processing_pipeline.rst             |  20 ++
 .../input_processing/model_inputs_index.rst   |  39 ++++
 .../dev/multimodal/multimodal_index.rst       |   8 +-
 docs/source/index.rst                         |   1 +
 docs/source/models/adding_model.rst           |   4 +-
 examples/phi3v_example.py                     |   3 +-
 .../{test_processor.py => test_mapper.py}     |  69 +++---
 vllm/config.py                                |   3 +
 vllm/engine/arg_utils.py                      |  64 +++---
 vllm/engine/async_llm_engine.py               |   8 +-
 vllm/engine/llm_engine.py                     |  13 +-
 vllm/inputs/__init__.py                       |  19 ++
 vllm/{inputs.py => inputs/data.py}            |  18 +-
 vllm/inputs/registry.py                       | 207 ++++++++++++++++++
 vllm/model_executor/models/clip.py            |  77 ++++++-
 vllm/model_executor/models/llava.py           |  40 +++-
 vllm/model_executor/models/llava_next.py      | 138 +++++++++---
 vllm/model_executor/models/phi3v.py           |  74 +++++--
 vllm/multimodal/__init__.py                   |  11 +-
 vllm/multimodal/base.py                       |  84 ++++---
 vllm/multimodal/image.py                      |  78 ++-----
 vllm/multimodal/registry.py                   | 133 ++++-------
 vllm/sequence.py                              |   4 +-
 vllm/transformers_utils/image_processor.py    |   4 -
 vllm/worker/cpu_model_runner.py               |  20 +-
 vllm/worker/model_runner.py                   |  31 +--
 26 files changed, 778 insertions(+), 392 deletions(-)
 create mode 100644 docs/source/dev/input_processing/input_processing_pipeline.rst
 create mode 100644 docs/source/dev/input_processing/model_inputs_index.rst
 rename tests/multimodal/{test_processor.py => test_mapper.py} (71%)
 create mode 100644 vllm/inputs/__init__.py
 rename vllm/{inputs.py => inputs/data.py} (90%)
 create mode 100644 vllm/inputs/registry.py

diff --git a/docs/source/dev/input_processing/input_processing_pipeline.rst b/docs/source/dev/input_processing/input_processing_pipeline.rst
new file mode 100644
index 000000000..e0c773781
--- /dev/null
+++ b/docs/source/dev/input_processing/input_processing_pipeline.rst
@@ -0,0 +1,20 @@
+.. _input_processing_pipeline:
+
+Input Processing Pipeline
+=========================
+
+1. Input data is passed to :class:`~vllm.LLMEngine` (or :class:`~vllm.AsyncLLMEngine`).
+
+2. Tokenize the data if necessary.
+
+3. Process the inputs using :meth:`INPUT_REGISTRY.process_input <vllm.inputs.registry.InputRegistry.process_input>`.
+
+   - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings.
+
+4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`.
+
+5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunnerBase`.
+
+6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input <vllm.multimodal.MultiModalRegistry.map_input>`.
+
+   - For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision language model.
diff --git a/docs/source/dev/input_processing/model_inputs_index.rst b/docs/source/dev/input_processing/model_inputs_index.rst
new file mode 100644
index 000000000..594edeb74
--- /dev/null
+++ b/docs/source/dev/input_processing/model_inputs_index.rst
@@ -0,0 +1,39 @@
+.. _input_processing:
+
+Input Processing
+================
+
+.. currentmodule:: vllm.inputs
+
+vLLM provides a mechanism for defining input processors for each model so that the inputs are processed
+in :class:`~vllm.LLMEngine` before they are passed to model executors. 
+
+Currently, this mechanism is only utilized in **multi-modal models** for preprocessing multi-modal input 
+data in addition to input prompt, but it can be extended to text-only language models when needed.
+
+Guides
+++++++
+
+.. toctree::
+   :maxdepth: 1
+
+   input_processing_pipeline
+
+Module Contents
++++++++++++++++
+
+LLM Engine Inputs
+-----------------
+
+.. autoclass:: vllm.inputs.LLMInputs
+    :members:
+    :show-inheritance:
+
+Registry
+--------
+
+.. autodata:: vllm.inputs.INPUT_REGISTRY
+
+.. automodule:: vllm.inputs.registry
+    :members:
+    :show-inheritance:
diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst
index a25eceecc..f6fdfc1de 100644
--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
@@ -12,10 +12,6 @@ By default, vLLM models do not support multi-modal inputs. To enable multi-modal
 you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data <MultiModalRegistry.register_dummy_data>`,
 as well as :meth:`MULTIMODAL_REGISTRY.register_input <MultiModalRegistry.register_input>` for each modality type to support.
 
-.. contents::
-   :local:
-   :backlinks: none
-
 Module Contents
 +++++++++++++++
 
@@ -24,9 +20,7 @@ Module Contents
 Registry
 --------
 
-.. data:: vllm.multimodal.MULTIMODAL_REGISTRY
-
-    The global :class:`MultiModalRegistry` which is used by model runners.
+.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY
 
 .. autoclass:: vllm.multimodal.MultiModalRegistry
     :members:
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 05133eb6d..3a9f5a3d8 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -120,6 +120,7 @@ Documentation
    dev/offline_inference/offline_index
    dev/engine/engine_index
    dev/kernel/paged_attention
+   dev/input_processing/model_inputs_index
    dev/multimodal/multimodal_index
    dev/dockerfile/dockerfile
 
diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst
index cbc8099e6..f282b5945 100644
--- a/docs/source/models/adding_model.rst
+++ b/docs/source/models/adding_model.rst
@@ -37,7 +37,7 @@ For instance, vLLM's `OPT model <https://github.com/vllm-project/vllm/blob/main/
 2. Rewrite the :code:`forward` methods
 --------------------------------------
 
-Next, you need to rewrite the :code:`forward` methods of your model by following these steps:
+Next, you need to rewrite the :meth:`~torch.nn.Module.forward` method of your model by following these steps:
 
 1. Remove any unnecessary code, such as the code only used for training.
 2. Change the input parameters:
@@ -75,7 +75,7 @@ Next, you need to rewrite the :code:`forward` methods of your model by following
 
 If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
 To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
-For the embedding layer, you can simply replace :code:`nn.Embedding` with :code:`VocabParallelEmbedding`. For the output LM head, you can use :code:`ParallelLMHead`.
+For the embedding layer, you can simply replace :class:`torch.nn.Embedding` with :code:`VocabParallelEmbedding`. For the output LM head, you can use :code:`ParallelLMHead`.
 When it comes to the linear layers, we provide the following options to parallelize them:
 
 * :code:`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py
index c068b9a98..46b7be5cd 100644
--- a/examples/phi3v_example.py
+++ b/examples/phi3v_example.py
@@ -11,7 +11,7 @@ def run_phi3v():
     model_path = "microsoft/Phi-3-vision-128k-instruct"
 
     # Note: The model has 128k context length by default which may cause OOM
-    # If that's the case, override `max_model_len` with a smaller value via args
+    # In this example, we override max_model_len to 2048.
     llm = LLM(
         model=model_path,
         trust_remote_code=True,
@@ -19,6 +19,7 @@ def run_phi3v():
         image_token_id=32044,
         image_input_shape="1,3,1008,1344",
         image_feature_size=1921,
+        max_model_len=2048,
     )
 
     image = Image.open("images/cherry_blossom.jpg")
diff --git a/tests/multimodal/test_processor.py b/tests/multimodal/test_mapper.py
similarity index 71%
rename from tests/multimodal/test_processor.py
rename to tests/multimodal/test_mapper.py
index 9ac48dfab..2c05b0edb 100644
--- a/tests/multimodal/test_processor.py
+++ b/tests/multimodal/test_mapper.py
@@ -25,14 +25,14 @@ def test_clip_image_processor(image_assets, dtype):
         seed=0,
         dtype=dtype,
         revision=None,
-    )
-    vlm_config = VisionLanguageConfig(
-        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-        image_token_id=32000,
-        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
-        image_feature_size=576,
-        image_processor=MODEL_NAME,
-        image_processor_revision=None,
+        multimodal_config=VisionLanguageConfig(
+            image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
+            image_token_id=32000,
+            image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
+            image_feature_size=576,
+            image_processor=MODEL_NAME,
+            image_processor_revision=None,
+        ),
     )
 
     for asset in image_assets:
@@ -40,10 +40,9 @@ def test_clip_image_processor(image_assets, dtype):
             asset.pil_image,
             return_tensors="pt",
         ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
-        vllm_result = MULTIMODAL_REGISTRY.process_input(
+        vllm_result = MULTIMODAL_REGISTRY.map_input(
+            model_config,
             ImagePixelData(asset.pil_image),
-            model_config=model_config,
-            vlm_config=vlm_config,
         )
 
         assert hf_result.keys() == vllm_result.keys()
@@ -74,14 +73,14 @@ def test_llava_next_image_processor(image_assets, dtype):
         seed=0,
         dtype=dtype,
         revision=None,
-    )
-    vlm_config = VisionLanguageConfig(
-        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-        image_token_id=64000,
-        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
-        image_feature_size=2928,
-        image_processor=MODEL_NAME,
-        image_processor_revision=None,
+        multimodal_config=VisionLanguageConfig(
+            image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
+            image_token_id=64000,
+            image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
+            image_feature_size=2928,
+            image_processor=MODEL_NAME,
+            image_processor_revision=None,
+        ),
     )
 
     for asset in image_assets:
@@ -89,10 +88,9 @@ def test_llava_next_image_processor(image_assets, dtype):
             asset.pil_image,
             return_tensors="pt",
         ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
-        vllm_result = MULTIMODAL_REGISTRY.process_input(
+        vllm_result = MULTIMODAL_REGISTRY.map_input(
+            model_config,
             ImagePixelData(asset.pil_image),
-            model_config=model_config,
-            vlm_config=vlm_config,
         )
 
         assert hf_result.keys() == vllm_result.keys()
@@ -119,26 +117,23 @@ def test_image_pixel_types(image_assets, dtype):
         seed=0,
         dtype=dtype,
         revision=None,
-    )
-    vlm_config = VisionLanguageConfig(
-        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-        image_token_id=32000,
-        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
-        image_feature_size=576,
-        image_processor=MODEL_NAME,
-        image_processor_revision=None,
-    )
+        multimodal_config=VisionLanguageConfig(
+            image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
+            image_token_id=32000,
+            image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
+            image_feature_size=576,
+            image_processor=MODEL_NAME,
+            image_processor_revision=None,
+        ))
 
     for asset in image_assets:
-        image_result = MULTIMODAL_REGISTRY.process_input(
+        image_result = MULTIMODAL_REGISTRY.map_input(
+            model_config,
             ImagePixelData(asset.pil_image),
-            model_config=model_config,
-            vlm_config=vlm_config,
         )
-        tensor_result = MULTIMODAL_REGISTRY.process_input(
+        tensor_result = MULTIMODAL_REGISTRY.map_input(
+            model_config,
             ImagePixelData(asset.pixel_values),
-            model_config=model_config,
-            vlm_config=vlm_config,
         )
 
         assert image_result.keys() == tensor_result.keys()
diff --git a/vllm/config.py b/vllm/config.py
index 9a98a7fbc..6adeaf420 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -109,6 +109,7 @@ class ModelConfig:
         disable_sliding_window: bool = False,
         skip_tokenizer_init: bool = False,
         served_model_name: Optional[Union[str, List[str]]] = None,
+        multimodal_config: Optional["VisionLanguageConfig"] = None,
     ) -> None:
         self.model = model
         self.tokenizer = tokenizer
@@ -159,6 +160,8 @@ class ModelConfig:
             sliding_window_len=self.get_hf_config_sliding_window())
         self.served_model_name = get_served_model_name(model,
                                                        served_model_name)
+        self.multimodal_config = multimodal_config
+
         if not self.skip_tokenizer_init:
             self._verify_tokenizer_mode()
         self._verify_embedding_mode()
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 16374098b..c392155e8 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -643,6 +643,36 @@ class EngineArgs:
             raise ValueError(
                 "BitsAndBytes load format and QLoRA adapter only support "
                 f"'bitsandbytes' quantization, but got {self.quantization}")
+        if self.image_input_type:
+            if (not self.image_token_id or not self.image_input_shape
+                    or not self.image_feature_size):
+                raise ValueError(
+                    'Specify `image_token_id`, `image_input_shape` and '
+                    '`image_feature_size` together with `image_input_type`.')
+
+            if self.image_processor is None:
+                self.image_processor = self.model
+            if self.disable_image_processor:
+                if self.image_processor != self.model:
+                    warnings.warn(
+                        "You've specified an image processor "
+                        f"({self.image_processor}) but also disabled "
+                        "it via `--disable-image-processor`.",
+                        stacklevel=2)
+
+                self.image_processor = None
+
+            vision_language_config = VisionLanguageConfig(
+                image_input_type=VisionLanguageConfig.
+                get_image_input_enum_type(self.image_input_type),
+                image_token_id=self.image_token_id,
+                image_input_shape=str_to_int_tuple(self.image_input_shape),
+                image_feature_size=self.image_feature_size,
+                image_processor=self.image_processor,
+                image_processor_revision=self.image_processor_revision,
+            )
+        else:
+            vision_language_config = None
 
         device_config = DeviceConfig(device=self.device)
         model_config = ModelConfig(
@@ -666,7 +696,8 @@ class EngineArgs:
             max_logprobs=self.max_logprobs,
             disable_sliding_window=self.disable_sliding_window,
             skip_tokenizer_init=self.skip_tokenizer_init,
-            served_model_name=self.served_model_name)
+            served_model_name=self.served_model_name,
+            multimodal_config=vision_language_config)
         cache_config = CacheConfig(
             block_size=self.block_size,
             gpu_memory_utilization=self.gpu_memory_utilization,
@@ -742,37 +773,6 @@ class EngineArgs:
             model_loader_extra_config=self.model_loader_extra_config,
         )
 
-        if self.image_input_type:
-            if (not self.image_token_id or not self.image_input_shape
-                    or not self.image_feature_size):
-                raise ValueError(
-                    'Specify `image_token_id`, `image_input_shape` and '
-                    '`image_feature_size` together with `image_input_type`.')
-
-            if self.image_processor is None:
-                self.image_processor = self.model
-            if self.disable_image_processor:
-                if self.image_processor != self.model:
-                    warnings.warn(
-                        "You've specified an image processor "
-                        f"({self.image_processor}) but also disabled "
-                        "it via `--disable-image-processor`.",
-                        stacklevel=2)
-
-                self.image_processor = None
-
-            vision_language_config = VisionLanguageConfig(
-                image_input_type=VisionLanguageConfig.
-                get_image_input_enum_type(self.image_input_type),
-                image_token_id=self.image_token_id,
-                image_input_shape=str_to_int_tuple(self.image_input_shape),
-                image_feature_size=self.image_feature_size,
-                image_processor=self.image_processor,
-                image_processor_revision=self.image_processor_revision,
-            )
-        else:
-            vision_language_config = None
-
         decoding_config = DecodingConfig(
             guided_decoding_backend=self.guided_decoding_backend)
 
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 7994b873f..848e05f03 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -278,9 +278,11 @@ class _AsyncLLMEngine(LLMEngine):
         else:
             prompt_token_ids = inputs["prompt_token_ids"]
 
-        return LLMInputs(prompt_token_ids=prompt_token_ids,
-                         prompt=inputs.get("prompt"),
-                         multi_modal_data=inputs.get("multi_modal_data"))
+        llm_inputs = LLMInputs(prompt_token_ids=prompt_token_ids,
+                               prompt=inputs.get("prompt"),
+                               multi_modal_data=inputs.get("multi_modal_data"))
+
+        return self.input_processor(llm_inputs)
 
     async def add_request_async(
         self,
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 4b427b1fb..9b720d613 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -20,7 +20,7 @@ from vllm.engine.output_processor.stop_checker import StopChecker
 from vllm.engine.output_processor.util import create_output_by_sequence_group
 from vllm.executor.executor_base import ExecutorBase
 from vllm.executor.ray_utils import initialize_ray_cluster
-from vllm.inputs import LLMInputs, PromptInputs
+from vllm.inputs import INPUT_REGISTRY, LLMInputs, PromptInputs
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.outputs import (EmbeddingRequestOutput, RequestOutput,
@@ -227,6 +227,9 @@ class LLMEngine:
         self.generation_config_fields = _load_generation_config_dict(
             model_config)
 
+        self.input_processor = INPUT_REGISTRY.create_input_processor(
+            self.model_config)
+
         self.model_executor = executor_class(
             model_config=model_config,
             cache_config=cache_config,
@@ -511,9 +514,11 @@ class LLMEngine:
         else:
             prompt_token_ids = inputs["prompt_token_ids"]
 
-        return LLMInputs(prompt_token_ids=prompt_token_ids,
-                         prompt=inputs.get("prompt"),
-                         multi_modal_data=inputs.get("multi_modal_data"))
+        llm_inputs = LLMInputs(prompt_token_ids=prompt_token_ids,
+                               prompt=inputs.get("prompt"),
+                               multi_modal_data=inputs.get("multi_modal_data"))
+
+        return self.input_processor(llm_inputs)
 
     def add_request(
         self,
diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
new file mode 100644
index 000000000..d09415696
--- /dev/null
+++ b/vllm/inputs/__init__.py
@@ -0,0 +1,19 @@
+from .data import (LLMInputs, ParsedText, ParsedTokens, PromptInputs,
+                   PromptStrictInputs, TextPrompt, TextTokensPrompt,
+                   TokensPrompt, parse_and_batch_prompt)
+from .registry import InputContext, InputRegistry
+
+INPUT_REGISTRY = InputRegistry()
+"""
+The global :class:`~InputRegistry` which is used by :class:`~vllm.LLMEngine`
+to dispatch data processing according to the target model.
+
+See also:
+    :ref:`input_processing_pipeline`
+"""
+
+__all__ = [
+    "ParsedText", "ParsedTokens", "parse_and_batch_prompt", "TextPrompt",
+    "TokensPrompt", "TextTokensPrompt", "PromptStrictInputs", "PromptInputs",
+    "LLMInputs", "INPUT_REGISTRY", "InputContext", "InputRegistry"
+]
diff --git a/vllm/inputs.py b/vllm/inputs/data.py
similarity index 90%
rename from vllm/inputs.py
rename to vllm/inputs/data.py
index 026903e19..9b163b9cf 100644
--- a/vllm/inputs.py
+++ b/vllm/inputs/data.py
@@ -101,8 +101,7 @@ class TextTokensPrompt(TypedDict):
     """The prompt text."""
 
     prompt_token_ids: List[int]
-    """The token IDs of the prompt. If None, we use the
-    tokenizer to convert the prompts to token IDs."""
+    """The token IDs of the prompt."""
 
     multi_modal_data: NotRequired["MultiModalData"]
     """
@@ -125,6 +124,21 @@ PromptInputs = Union[str, TextPrompt, TokensPrompt, TextTokensPrompt]
 
 
 class LLMInputs(TypedDict):
+    """
+    The inputs in :class:`~vllm.LLMEngine` before they are
+    passed to the model executor.
+    """
+
     prompt_token_ids: List[int]
+    """The token IDs of the prompt."""
+
     prompt: NotRequired[Optional[str]]
+    """
+    The original prompt text corresponding to the token IDs, if available.
+    """
+
     multi_modal_data: NotRequired[Optional["MultiModalData"]]
+    """
+    Optional multi-modal data to pass to the model,
+    if the model supports it.
+    """
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
new file mode 100644
index 000000000..8f4e108b8
--- /dev/null
+++ b/vllm/inputs/registry.py
@@ -0,0 +1,207 @@
+import functools
+from dataclasses import dataclass
+from typing import (TYPE_CHECKING, Callable, Dict, Optional, Tuple, Type,
+                    TypeVar)
+
+from torch import nn
+from transformers import PretrainedConfig
+
+from vllm.logger import init_logger
+
+from .data import LLMInputs
+
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig, VisionLanguageConfig
+    from vllm.multimodal import MultiModalData
+    from vllm.sequence import SequenceData
+
+logger = init_logger(__name__)
+
+C = TypeVar("C", bound=PretrainedConfig)
+
+
+@dataclass(frozen=True)
+class InputContext:
+    """
+    Contains information about the model which may be used to
+    modify the inputs.
+    """
+
+    model_config: "ModelConfig"
+    """The configuration of the model."""
+
+    def get_multimodal_config(self) -> "VisionLanguageConfig":
+        """
+        Get the multimodal configuration of the model.
+
+        Raises:
+            ValueError: If the model is not multimodal.
+        """
+
+        multimodal_config = self.model_config.multimodal_config
+        if multimodal_config is None:
+            raise ValueError("No multimodal config found")
+
+        return multimodal_config
+
+    def get_hf_config(self, hf_config_type: Type[C]) -> C:
+        """
+        Get the HuggingFace configuration
+        (:class:`transformers.PretrainedConfig`) of the model,
+        additionally checking its type.
+
+        Raises:
+            ValueError: If the model is not of the specified type.
+        """
+
+        hf_config = self.model_config.hf_config
+        if not isinstance(hf_config, hf_config_type):
+            raise TypeError("Invalid type of HuggingFace config. "
+                            f"Expected type: {hf_config_type}, but "
+                            f"found type: {type(hf_config)}")
+
+        return hf_config
+
+
+N = TypeVar("N", bound=Type[nn.Module])
+
+DummyDataFactory = Callable[[InputContext, int],
+                            Tuple["SequenceData", Optional["MultiModalData"]]]
+"""
+Create dummy data to be inputted into the model.
+
+Note:
+    :data:`InputProcessor` is not applied to the dummy data.
+"""
+
+InputProcessor = Callable[[InputContext, LLMInputs], LLMInputs]
+"""Preprocess the inputs to the model."""
+
+
+class InputRegistry:
+    """
+    A registry to dispatch data processing
+    according to the target model.
+    """
+
+    def __init__(self) -> None:
+        self._dummy_factories_by_model_type: Dict[Type[nn.Module],
+                                                  DummyDataFactory] = {}
+        self._input_processors_by_model_type: Dict[Type[nn.Module],
+                                                   InputProcessor] = {}
+
+    def _default_dummy_data_factory(
+        self,
+        ctx: InputContext,
+        seq_len: int,
+    ) -> Tuple["SequenceData", Optional["MultiModalData"]]:
+        """
+        The default dummy data factory represents the longest possible text
+        that can be inputted to the model.
+
+        Note:
+            :data:`InputProcessor` is not applied to the dummy data.
+        """
+        # Avoid circular import
+        from vllm.sequence import SequenceData
+
+        dummy_seq_data = SequenceData([0] * seq_len)
+        dummy_multi_modal_data = None
+
+        return dummy_seq_data, dummy_multi_modal_data
+
+    def register_dummy_data(self, factory: DummyDataFactory):
+        """
+        Register a dummy data factory to a model class.
+
+        During memory profiling, the provided function is invoked to create
+        dummy data to be inputted into the model. The resulting memory usage
+        should be an upper bound of what the model would use at inference time.
+        """
+
+        def wrapper(model_cls: N) -> N:
+            if model_cls in self._dummy_factories_by_model_type:
+                logger.warning(
+                    "Model class %s already has dummy data "
+                    "registered to %s. It is overwritten by the new one.",
+                    model_cls, self)
+
+            self._dummy_factories_by_model_type[model_cls] = factory
+
+            return model_cls
+
+        return wrapper
+
+    def dummy_data_for_profiling(self, model_config: "ModelConfig",
+                                 seq_len: int):
+        """
+        Create dummy data for profiling the memory usage of a model.
+
+        The model is identified by ``model_config``.
+
+        TODO: Add guide [ref: PR #5276]
+        """
+        # Avoid circular import
+        from vllm.model_executor.model_loader import get_model_architecture
+
+        model_cls, _ = get_model_architecture(model_config)
+        dummy_factory = self._dummy_factories_by_model_type \
+            .get(model_cls, self._default_dummy_data_factory)
+
+        return dummy_factory(InputContext(model_config), seq_len)
+
+    def _default_input_processor(self, ctx: InputContext,
+                                 inputs: LLMInputs) -> LLMInputs:
+        """The default input processor is a no-op."""
+        return inputs
+
+    def register_input_processor(self, processor: InputProcessor):
+        """
+        Register an input processor to a model class.
+
+        The provided function is invoked on each input to the model. This
+        happens before :meth:`~vllm.multimodal.MultiModalRegistry.map_input`.
+
+        See also:
+            :ref:`input_processing_pipeline`
+        """
+
+        def wrapper(model_cls: N) -> N:
+            if model_cls in self._input_processors_by_model_type:
+                logger.warning(
+                    "Model class %s already has input processor "
+                    "registered to %s. It is overwritten by the new one.",
+                    model_cls, self)
+
+            self._input_processors_by_model_type[model_cls] = processor
+
+            return model_cls
+
+        return wrapper
+
+    def process_input(self, model_config: "ModelConfig",
+                      inputs: LLMInputs) -> LLMInputs:
+        """
+        Apply an input processor to an instance of model inputs.
+
+        The model is identified by ``model_config``.
+
+        See also:
+            :ref:`input_processing_pipeline`
+        """
+        # Avoid circular import
+        from vllm.model_executor.model_loader import get_model_architecture
+
+        model_cls, _ = get_model_architecture(model_config)
+
+        processor = self._input_processors_by_model_type \
+            .get(model_cls, self._default_input_processor)
+
+        return processor(InputContext(model_config), inputs)
+
+    def create_input_processor(self, model_config: "ModelConfig"):
+        """
+        Create an input processor (see :meth:`process_input`) for a
+        specific model.
+        """
+        return functools.partial(self.process_input, model_config)
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index aa4e87228..77fbade05 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -1,22 +1,83 @@
 """Minimal implementation of CLIPVisionModel intended to be only used 
 within a vision language model."""
-from typing import Optional, Tuple
+from typing import Optional
 
 import torch
 import torch.nn as nn
+from PIL import Image
 from transformers import CLIPVisionConfig
 from transformers.models.clip.modeling_clip import CLIPAttention
 
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.multimodal.image import ImageFeatureData, ImagePixelData
+from vllm.sequence import SequenceData
 
 
-def get_clip_num_patches(image_size: int, patch_size: int) -> int:
+def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
     assert image_size % patch_size == 0
-    return (image_size // patch_size)**2
+    return image_size // patch_size
+
+
+def get_clip_num_patches(*, image_size: int, patch_size: int) -> int:
+    grid_length = get_clip_patch_grid_length(image_size=image_size,
+                                             patch_size=patch_size)
+    return grid_length * grid_length
+
+
+def get_clip_image_feature_size(hf_config: CLIPVisionConfig) -> int:
+    return get_clip_num_patches(image_size=hf_config.image_size,
+                                patch_size=hf_config.patch_size)
+
+
+def dummy_seq_data_for_clip(
+    hf_config: CLIPVisionConfig,
+    seq_len: int,
+    *,
+    image_token_id: int,
+    image_feature_size_override: Optional[int] = None,
+):
+    if image_feature_size_override is None:
+        image_feature_size = get_clip_image_feature_size(hf_config)
+    else:
+        image_feature_size = image_feature_size_override
+
+    token_ids = [image_token_id] * image_feature_size
+    token_ids += [0] * (seq_len - image_feature_size)
+    return SequenceData(token_ids)
+
+
+def dummy_pixel_data_for_clip(
+    hf_config: CLIPVisionConfig,
+    *,
+    image_width_override: Optional[int] = None,
+    image_height_override: Optional[int] = None,
+):
+    width = height = hf_config.image_size
+    if image_width_override is not None:
+        width = image_width_override
+    if image_height_override is not None:
+        height = image_height_override
+
+    image = Image.new("RGB", (width, height), color=0)
+    return ImagePixelData(image)
+
+
+def dummy_feature_data_for_clip(
+    hf_config: CLIPVisionConfig,
+    *,
+    image_feature_size_override: Optional[int] = None,
+):
+    if image_feature_size_override is None:
+        image_feature_size = get_clip_image_feature_size(hf_config)
+    else:
+        image_feature_size = image_feature_size_override
+
+    values = torch.zeros((1, image_feature_size, hf_config.hidden_size),
+                         dtype=torch.float16)
+    return ImageFeatureData(values)
 
 
 # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa
@@ -39,8 +100,8 @@ class CLIPVisionEmbeddings(nn.Module):
             bias=False,
         )
 
-        self.num_patches = get_clip_num_patches(self.image_size,
-                                                self.patch_size)
+        self.num_patches = get_clip_num_patches(image_size=self.image_size,
+                                                patch_size=self.patch_size)
         self.num_positions = self.num_patches + 1
         self.position_embedding = nn.Embedding(self.num_positions,
                                                self.embed_dim)
@@ -101,7 +162,7 @@ class CLIPEncoderLayer(nn.Module):
         self.layer_norm2 = nn.LayerNorm(config.hidden_size,
                                         eps=config.layer_norm_eps)
 
-    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor]:
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
         residual = hidden_states
 
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 125e3ddea..bdcb63317 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -2,10 +2,11 @@ from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union
 
 import torch
 import torch.nn as nn
-from transformers import LlavaConfig
+from transformers import CLIPVisionConfig, LlavaConfig
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, VisionLanguageConfig
+from vllm.inputs import INPUT_REGISTRY, InputContext
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
@@ -16,10 +17,11 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import get_dummy_image_data
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
 from vllm.sequence import SamplerOutput
 
+from .clip import (dummy_feature_data_for_clip, dummy_pixel_data_for_clip,
+                   dummy_seq_data_for_clip)
 from .interfaces import SupportsVision
 
 _KEYS_TO_MODIFY_MAPPING = {
@@ -83,9 +85,35 @@ class LlavaImageFeatureInputs(TypedDict):
 LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageFeatureInputs]
 
 
-@MULTIMODAL_REGISTRY.register_image_feature_input()
-@MULTIMODAL_REGISTRY.register_image_pixel_input()
-@MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
+def dummy_data_for_llava(ctx: InputContext, seq_len: int):
+    multimodal_config = ctx.get_multimodal_config()
+    hf_config = ctx.get_hf_config(LlavaConfig)
+    vision_config = hf_config.vision_config
+
+    if isinstance(vision_config, CLIPVisionConfig):
+        seq_data = dummy_seq_data_for_clip(
+            vision_config,
+            seq_len,
+            image_token_id=hf_config.image_token_index,
+        )
+
+        image_input_type = multimodal_config.image_input_type
+        ImageInputType = VisionLanguageConfig.ImageInputType
+        mm_data: MultiModalData
+        if image_input_type == ImageInputType.PIXEL_VALUES:
+            mm_data = dummy_pixel_data_for_clip(vision_config)
+        elif image_input_type == ImageInputType.IMAGE_FEATURES:
+            mm_data = dummy_feature_data_for_clip(vision_config)
+
+        return seq_data, mm_data
+
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
+
+
+@MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
+@MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
+@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava)
 class LlavaForConditionalGeneration(nn.Module, SupportsVision):
 
     def __init__(self,
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 841818d8d..cebc82816 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -3,14 +3,14 @@ from typing import (Dict, Iterable, List, Literal, Optional, Tuple, TypedDict,
 
 import torch
 import torch.nn as nn
-from PIL import Image
-from transformers import LlavaNextConfig
+from transformers import CLIPVisionConfig, LlavaNextConfig
 from transformers.models.llava_next.modeling_llava_next import (
     get_anyres_image_grid_shape, unpad_image)
 from typing_extensions import NotRequired
 
 from vllm.attention import AttentionMetadata
-from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig
+from vllm.config import CacheConfig, VisionLanguageConfig
+from vllm.inputs import INPUT_REGISTRY, InputContext
 from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
@@ -22,9 +22,11 @@ from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
-from vllm.multimodal.image import ImagePixelData, get_dummy_image_data
-from vllm.sequence import SamplerOutput, SequenceData
+from vllm.multimodal.image import ImagePixelData
+from vllm.sequence import SamplerOutput
 
+from .clip import (dummy_feature_data_for_clip, dummy_pixel_data_for_clip,
+                   dummy_seq_data_for_clip, get_clip_patch_grid_length)
 from .interfaces import SupportsVision
 from .llava import LlavaMultiModalProjector, merge_vision_embeddings
 
@@ -58,41 +60,118 @@ LlavaNextImageInputs = Union[LlavaNextImagePixelInputs,
                              LlavaNextImageFeatureInputs]
 
 
-def _get_dummy_image_data(
-    seq_len: int,
-    model_config: ModelConfig,
-    vlm_config: VisionLanguageConfig,
-) -> Tuple[SequenceData, MultiModalData]:
-    seq_data, fake_mm_data = get_dummy_image_data(seq_len, model_config,
-                                                  vlm_config)
+def _get_llava_next_num_unpadded_features(
+    height: int,
+    width: int,
+    npatches: int,
+    num_patch_height: int,
+    num_patch_width: int,
+) -> Tuple[int, int]:
+    # Taken from: https://github.com/huggingface/text-generation-inference/blob/799a193b109662743bed1b18a09af1fdcd508c8b/server/text_generation_server/models/vlm_causal_lm.py#L111
+    current_height = npatches * num_patch_height
+    current_width = npatches * num_patch_width
+
+    aspect_ratio: float = width / height
+    current_aspect_ratio: float = current_width / current_height
+    if aspect_ratio > current_aspect_ratio:
+        new_height = (height * current_width) // width
+        current_height = new_height
+    else:
+        new_width = (width * current_height) // height
+        current_width = new_width
+
+    unpadded_features = current_height * current_width
+    newline_features = current_height
+    return (unpadded_features, newline_features)
+
+
+def _get_llava_next_image_feature_size(
+    hf_config: LlavaNextConfig,
+    *,
+    input_height: int,
+    input_width: int,
+) -> int:
+    vision_config = hf_config.vision_config
+
+    if isinstance(vision_config, CLIPVisionConfig):
+        num_patches = get_clip_patch_grid_length(
+            image_size=vision_config.image_size,
+            patch_size=vision_config.patch_size,
+        )
+        base_feature_size = num_patches * num_patches
+
+        num_patch_height, num_patch_width = get_anyres_image_grid_shape(
+            image_size=(input_height, input_width),
+            grid_pinpoints=hf_config.image_grid_pinpoints,
+            patch_size=vision_config.image_size,
+        )
+
+        (
+            unpadded_feature_size,
+            newline_feature_size,
+        ) = _get_llava_next_num_unpadded_features(input_height, input_width,
+                                                  num_patches,
+                                                  num_patch_height,
+                                                  num_patch_width)
+
+        return unpadded_feature_size + newline_feature_size + base_feature_size
+
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
+
 
-    config_input_type = vlm_config.image_input_type
-    ImageInputType = VisionLanguageConfig.ImageInputType
+def dummy_data_for_llava_next(ctx: InputContext, seq_len: int):
+    multimodal_config = ctx.get_multimodal_config()
+    hf_config = ctx.get_hf_config(LlavaNextConfig)
+    vision_config = hf_config.vision_config
+
+    #TODO: change the logic for dummy data to support dynamic shape
+    _, _, dummy_height, dummy_width = multimodal_config.image_input_shape
+    image_feature_size = _get_llava_next_image_feature_size(
+        hf_config, input_height=dummy_height, input_width=dummy_width)
+
+    if isinstance(vision_config, CLIPVisionConfig):
+        seq_data = dummy_seq_data_for_clip(
+            vision_config,
+            seq_len,
+            image_token_id=hf_config.image_token_index,
+            image_feature_size_override=image_feature_size,
+        )
+
+        image_input_type = multimodal_config.image_input_type
+        ImageInputType = VisionLanguageConfig.ImageInputType
+        mm_data: MultiModalData
+        if image_input_type == ImageInputType.PIXEL_VALUES:
+            mm_data = dummy_pixel_data_for_clip(
+                vision_config,
+                image_width_override=dummy_width,
+                image_height_override=dummy_height,
+            )
+        elif image_input_type == ImageInputType.IMAGE_FEATURES:
+            mm_data = dummy_feature_data_for_clip(
+                vision_config,
+                image_feature_size_override=image_feature_size,
+            )
 
-    if config_input_type == ImageInputType.PIXEL_VALUES:
-        _, c, h, w = vlm_config.image_input_shape
-        mode = {1: "L", 3: "RGB"}[c]
-        fake_mm_data = ImagePixelData(Image.new(mode, (w, h), color=0))
+        return seq_data, mm_data
 
-    return seq_data, fake_mm_data
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
 
 
-def _image_pixel_processor(
-    data: ImagePixelData,
-    model_config: ModelConfig,
-    vlm_config: VisionLanguageConfig,
-) -> Dict[str, torch.Tensor]:
+def _pixel_mapper(ctx: InputContext,
+                  data: ImagePixelData) -> Dict[str, torch.Tensor]:
     image = data.image
 
     if isinstance(image, torch.Tensor):
-        pixel_values = image.to(model_config.dtype)
+        pixel_values = image.to(ctx.model_config.dtype)
         batch_size, _, _, h, w = pixel_values.shape
         image_sizes = torch.tensor([(w, h) for _ in range(batch_size)])
 
         return {"pixel_values": pixel_values, "image_sizes": image_sizes}
 
     # Temporary patch before dynamic number of image tokens is supported
-    _, _, h, w = vlm_config.image_input_shape
+    _, _, h, w = ctx.get_multimodal_config().image_input_shape
     if (w, h) != (image.width, image.height):
         logger.warning(
             "Dynamic image shape is currently not supported. "
@@ -101,11 +180,12 @@ def _image_pixel_processor(
         data.image = image.resize((w, h))
 
     return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \
-        ._default_input_processor(data, model_config, vlm_config)
+        ._default_input_mapper(ctx, data)
 
 
-@MULTIMODAL_REGISTRY.register_image_pixel_input(_image_pixel_processor)
-@MULTIMODAL_REGISTRY.register_dummy_data(_get_dummy_image_data)
+@MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
+@MULTIMODAL_REGISTRY.register_image_pixel_input_mapper(_pixel_mapper)
+@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next)
 class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
 
     def __init__(self,
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 0bbe93241..5d8ffd521 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -22,7 +22,8 @@ from PIL import Image
 from transformers import CLIPVisionConfig, PretrainedConfig
 
 from vllm.attention import AttentionMetadata
-from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig
+from vllm.config import CacheConfig, VisionLanguageConfig
+from vllm.inputs import INPUT_REGISTRY, InputContext
 from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
@@ -34,9 +35,10 @@ from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import ImagePixelData, get_dummy_image_data
+from vllm.multimodal.image import ImagePixelData
 from vllm.sequence import SamplerOutput
 
+from .clip import dummy_pixel_data_for_clip, dummy_seq_data_for_clip
 from .interfaces import SupportsVision
 
 logger = init_logger(__name__)
@@ -107,7 +109,6 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
         self.num_img_tokens = config.img_processor['num_img_tokens']
 
         self.image_dim_out = image_dim_out
-        self.img_sizes = None
 
         # global_gn and sub_gn for hd transform, serves as line separator
         self.use_hd_transform = config.embd_layer.get('use_hd_transform',
@@ -134,7 +135,6 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
         self.img_projection = nn.Sequential(*layers)
 
         self.vocab_size = config.vocab_size
-        self.img_features = None
 
         self.layer_idx = config.img_processor.get('layer_idx', -2)
         self.type_feature = config.img_processor.get('type_feature', 'patch')
@@ -260,9 +260,44 @@ class Phi3VImagePixelInputs(TypedDict):
     """Shape: (batch_size, 2)"""
 
 
-# FIXME(Isotr0py): Remove these after dynamic num_img_tokens is supported
-# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
-def calc_padded_size(width, height, padding_unit=336):
+def _get_phi3v_image_feature_size(
+    *,
+    input_height: int,
+    input_width: int,
+) -> int:
+    h, w = input_height, input_width
+
+    # https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L178
+    return (h // 336 * w // 336 + 1) * 144 + 1 + (h // 336 + 1) * 12
+
+
+def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
+    multimodal_config = ctx.get_multimodal_config()
+
+    #TODO: change the logic for dummy data to support dynamic shape
+    _, _, dummy_height, dummy_width = multimodal_config.image_input_shape
+    image_feature_size = _get_phi3v_image_feature_size(
+        input_height=dummy_height,
+        input_width=dummy_width,
+    )
+
+    seq_data = dummy_seq_data_for_clip(
+        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
+        seq_len,
+        image_token_id=32044,
+        image_feature_size_override=image_feature_size,
+    )
+    mm_data = dummy_pixel_data_for_clip(
+        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
+        image_width_override=dummy_width,
+        image_height_override=dummy_height,
+    )
+
+    return seq_data, mm_data
+
+
+# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
+def _calc_padded_size(*, width: int, height: int, padding_unit: int = 336):
     target_height = int(np.ceil(height / padding_unit) * padding_unit)
     top_padding = int((target_height - height) / 2)
     bottom_padding = target_height - height - top_padding
@@ -271,8 +306,8 @@ def calc_padded_size(width, height, padding_unit=336):
     return padded_width, padded_height
 
 
-# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
-def calc_hd_transform_size(width, height, hd_num=16):
+# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
+def _calc_hd_transform_size(*, width: int, height: int, hd_num: int = 16):
     transposed = False
     if width < height:
         width, height = height, width
@@ -287,7 +322,8 @@ def calc_hd_transform_size(width, height, hd_num=16):
     new_width = int(scale * 336)
     new_height = int(new_width / ratio)
 
-    padded_width, padded_height = calc_padded_size(new_width, new_height)
+    padded_width, padded_height = _calc_padded_size(width=new_width,
+                                                    height=new_height)
 
     if transposed:
         padded_width, padded_height = padded_height, padded_width
@@ -295,17 +331,15 @@ def calc_hd_transform_size(width, height, hd_num=16):
     return padded_width, padded_height
 
 
-def _image_processor(
-    data: ImagePixelData,
-    model_config: ModelConfig,
-    vlm_config: VisionLanguageConfig,
-) -> Dict[str, torch.Tensor]:
+def _image_processor(ctx: InputContext,
+                     data: ImagePixelData) -> Dict[str, torch.Tensor]:
     image = data.image
 
     if isinstance(image, Image.Image):
         # Temporary patch before dynamic number of image tokens is supported
-        _, _, h, w = vlm_config.image_input_shape
-        if (w, h) != calc_hd_transform_size(image.width, image.height):
+        _, _, h, w = ctx.get_multimodal_config().image_input_shape
+        if (w, h) != _calc_hd_transform_size(width=image.width,
+                                             height=image.height):
             logger.warning(
                 "Dynamic image shape is currently not supported. "
                 "Resizing input image to (%d, %d).", w, h)
@@ -313,11 +347,11 @@ def _image_processor(
             data.image = image.resize((w, h))
 
     return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \
-            ._default_input_processor(data, model_config, vlm_config)
+            ._default_input_mapper(ctx, data)
 
 
-@MULTIMODAL_REGISTRY.register_image_pixel_input(_image_processor)
-@MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
+@MULTIMODAL_REGISTRY.register_image_pixel_input_mapper(_image_processor)
+@INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi3v)
 class Phi3VForCausalLM(nn.Module, SupportsVision):
 
     def __init__(self,
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 270012e7d..20bd87b8c 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,5 +1,14 @@
 from .base import MultiModalData, MultiModalPlugin
-from .registry import MULTIMODAL_REGISTRY, MultiModalRegistry
+from .registry import MultiModalRegistry
+
+MULTIMODAL_REGISTRY = MultiModalRegistry()
+"""
+The global :class:`~MultiModalRegistry` is used by model runners to
+dispatch data processing according to its modality and the target model.
+
+See also:
+    :ref:`input_processing_pipeline`
+"""
 
 __all__ = [
     "MultiModalData", "MultiModalPlugin", "MULTIMODAL_REGISTRY",
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 847752449..d47cdd559 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -2,7 +2,8 @@ from abc import ABC, abstractmethod
 from typing import (TYPE_CHECKING, Callable, Dict, Generic, Optional, Type,
                     TypeVar)
 
-from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.config import ModelConfig
+from vllm.inputs import InputContext
 from vllm.logger import init_logger
 
 if TYPE_CHECKING:
@@ -23,7 +24,7 @@ class MultiModalData:
 
     Finally, register the new plugin to
     :const:`vllm.multimodal.MULTIMODAL_REGISTRY`.
-    This enables models to call :meth:`MultiModalRegistry.register_input` for
+    This enables models to call :meth:`MultiModalRegistry.map_input` for
     the new modality.
     """
     pass
@@ -32,10 +33,9 @@ class MultiModalData:
 D = TypeVar("D", bound=MultiModalData)
 N = TypeVar("N", bound=Type["nn.Module"])
 
-MultiModalInputProcessor = Callable[[D, ModelConfig, VisionLanguageConfig],
-                                    Dict[str, "torch.Tensor"]]
+MultiModalInputMapper = Callable[[InputContext, D], Dict[str, "torch.Tensor"]]
 """Return a dictionary to be passed as keyword arguments to
-:meth:`torch.nn.Module.forward`. This is similar in concept to tokenizers
+:meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers
 and processors in HuggingFace Transformers."""
 
 
@@ -50,16 +50,9 @@ class MultiModalPlugin(ABC, Generic[D]):
     (i.e., the modality of the data).
     """
 
-    @classmethod
-    def get_model_cls(cls, model_config: ModelConfig) -> Type["nn.Module"]:
-        # Avoid circular import
-        from vllm.model_executor.model_loader import get_model_architecture
-
-        return get_model_architecture(model_config)[0]
-
     def __init__(self) -> None:
-        self._input_processors: Dict[Type["nn.Module"],
-                                     MultiModalInputProcessor[D]] = {}
+        self._input_mappers: Dict[Type["nn.Module"],
+                                  MultiModalInputMapper[D]] = {}
 
     @abstractmethod
     def get_data_type(self) -> Type[D]:
@@ -70,57 +63,62 @@ class MultiModalPlugin(ABC, Generic[D]):
         raise NotImplementedError
 
     @abstractmethod
-    def _default_input_processor(
-            self, data: D, model_config: ModelConfig,
-            vlm_config: VisionLanguageConfig) -> Dict[str, "torch.Tensor"]:
+    def _default_input_mapper(self, ctx: InputContext,
+                              data: D) -> Dict[str, "torch.Tensor"]:
         """Return a dictionary to be passed as keyword arguments to
-        :meth:`torch.nn.Module.forward`. This is similar in concept to
+        :meth:`~torch.nn.Module.forward`. This is similar in concept to
         tokenizers and processors in HuggingFace Transformers.
         """
         raise NotImplementedError
 
-    def register_input_processor(self,
-                                 processor: Optional[
-                                     MultiModalInputProcessor[D]] = None):
+    def register_input_mapper(
+        self,
+        mapper: Optional[MultiModalInputMapper[D]] = None,
+    ):
         """
-        Register an input processor to a model class.
+        Register an input mapper to a model class.
         
         When the model receives input data that matches the modality served by
-        this plugin (see :meth:`get_data_type`), the provided input processor is
-        applied to preprocess the data. If `None` is provided, then the default
-        input processor is applied instead.
+        this plugin (see :meth:`get_data_type`), the provided function is
+        invoked to transform the data into a dictionary of model inputs.
+        If `None` is provided, then the default input mapper is used instead.
+
+        See also:
+            :ref:`input_processing_pipeline`
         """
 
         def wrapper(model_cls: N) -> N:
-            if model_cls in self._input_processors:
+            if model_cls in self._input_mappers:
                 logger.warning(
-                    "Model class %s already has an input processor "
+                    "Model class %s already has an input mapper "
                     "registered to %s. It is overwritten by the new one.",
                     model_cls, self)
 
-            self._input_processors[model_cls] = processor \
-                or self._default_input_processor
+            self._input_mappers[model_cls] = mapper \
+                or self._default_input_mapper
 
             return model_cls
 
         return wrapper
 
-    def process_input(
-            self, data: D, model_config: ModelConfig,
-            vlm_config: VisionLanguageConfig) -> Dict[str, "torch.Tensor"]:
+    def map_input(self, model_config: ModelConfig,
+                  data: D) -> Dict[str, "torch.Tensor"]:
         """
-        Apply an input processor to a :class:`~MultiModalData` instance passed
-        to the model.
-        
-        The model is identified by ``model_config``. ``vlm_config`` is
-        for compatibility purposes and may be merged into ``model_config``
-        in the near future.
+        Apply an input mapper to a :class:`~MultiModalData` instance passed
+        to the model, transforming the data into a dictionary of model inputs.
+
+        The model is identified by ``model_config``.
+
+        TODO: Add guide [ref: PR #5276]
         """
-        model_cls = self.get_model_cls(model_config)
+        # Avoid circular import
+        from vllm.model_executor.model_loader import get_model_architecture
+
+        model_cls, _ = get_model_architecture(model_config)
 
-        processor = self._input_processors.get(model_cls)
-        if processor is None:
-            raise KeyError(f"No input processor in {self} is registered for "
+        mapper = self._input_mappers.get(model_cls)
+        if mapper is None:
+            raise KeyError(f"No input mapper in {self} is registered for "
                            f"model class {model_cls.__name__}.")
 
-        return processor(data, model_config, vlm_config)
+        return mapper(InputContext(model_config), data)
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 08fb09d11..a9691575c 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -1,70 +1,28 @@
-from typing import Dict, Tuple, Type, Union
+from functools import lru_cache
+from typing import Dict, Type, Union
 
 import torch
 from PIL import Image
 
-from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.config import ModelConfig
+from vllm.inputs.registry import InputContext
 from vllm.logger import init_logger
-from vllm.sequence import SequenceData
-from vllm.transformers_utils.image_processor import cached_get_image_processor
+from vllm.transformers_utils.image_processor import get_image_processor
 
 from .base import MultiModalData, MultiModalPlugin
 
 logger = init_logger(__name__)
 
-
-def _get_dummy_seq_data(seq_len: int,
-                        vlm_config: VisionLanguageConfig) -> SequenceData:
-    # NOTE: We assume that <image> token is repeated `image_feature_size` times
-    # and then concatenated with the text prompt
-    # TODO: Enable other ways of inserting the image into the prompt
-
-    token_ids = [vlm_config.image_token_id] * vlm_config.image_feature_size
-    token_ids += [0] * (seq_len - vlm_config.image_feature_size)
-
-    return SequenceData(token_ids)
-
-
-def _get_dummy_values(vlm_config: VisionLanguageConfig) -> torch.Tensor:
-    if vlm_config.image_processor is None:
-        values_dtype = torch.float16
-    else:
-        values_dtype = torch.uint8
-
-    return torch.zeros(vlm_config.image_input_shape, dtype=values_dtype)
-
-
-def get_dummy_image_data(
-    seq_len: int,
-    model_config: ModelConfig,
-    vlm_config: VisionLanguageConfig,
-) -> Tuple[SequenceData, MultiModalData]:
-    """Standard dummy data factory for image data (to be used in
-    :meth:`vlm.multimodal.MultiModalRegistry.register_dummy_data`)."""
-    seq_data = _get_dummy_seq_data(seq_len, vlm_config)
-    values = _get_dummy_values(vlm_config)
-
-    config_input_type = vlm_config.image_input_type
-    ImageInputType = VisionLanguageConfig.ImageInputType
-
-    fake_mm_data: MultiModalData
-    if config_input_type == ImageInputType.PIXEL_VALUES:
-        fake_mm_data = ImagePixelData(values)
-    elif config_input_type == ImageInputType.IMAGE_FEATURES:
-        fake_mm_data = ImageFeatureData(values)
-    else:
-        raise NotImplementedError
-
-    return seq_data, fake_mm_data
+cached_get_image_processor = lru_cache(get_image_processor)
 
 
 class ImagePixelData(MultiModalData):
     """
     The pixel data of an image. Can be one of:
 
-    - :class:``PIL.Image``: An image object. Requires that a HuggingFace
+    - :class:`PIL.Image.Image`: An image object. Requires that a HuggingFace
       processor is available to the model.
-    - :class:``torch.Tensor``: The raw pixel data which is passed to the model
+    - :class:`torch.Tensor`: The raw pixel data which is passed to the model
       without additional pre-processing.
     """
 
@@ -89,8 +47,8 @@ class ImagePixelPlugin(MultiModalPlugin[ImagePixelData]):
     def get_data_type(self) -> Type[ImagePixelData]:
         return ImagePixelData
 
-    def _get_hf_image_processor(self, model_config: ModelConfig,
-                                vlm_config: VisionLanguageConfig):
+    def _get_hf_image_processor(self, model_config: ModelConfig):
+        vlm_config = model_config.multimodal_config
         if vlm_config is None or vlm_config.image_processor is None:
             return None
 
@@ -100,14 +58,13 @@ class ImagePixelPlugin(MultiModalPlugin[ImagePixelData]):
             revision=vlm_config.image_processor_revision,
         )
 
-    def _default_input_processor(
-            self, data: ImagePixelData, model_config: ModelConfig,
-            vlm_config: VisionLanguageConfig) -> Dict[str, torch.Tensor]:
+    def _default_input_mapper(self, ctx: InputContext,
+                              data: ImagePixelData) -> Dict[str, torch.Tensor]:
+        model_config = ctx.model_config
         image = data.image
 
         if isinstance(image, Image.Image):
-            image_processor = self._get_hf_image_processor(
-                model_config, vlm_config)
+            image_processor = self._get_hf_image_processor(model_config)
             if image_processor is None:
                 raise RuntimeError("No HuggingFace processor is available"
                                    "to process the image object")
@@ -147,9 +104,10 @@ class ImageFeaturePlugin(MultiModalPlugin[ImageFeatureData]):
     def get_data_type(self) -> Type[ImageFeatureData]:
         return ImageFeatureData
 
-    def _default_input_processor(
-            self, data: ImageFeatureData, model_config: ModelConfig,
-            vlm_config: VisionLanguageConfig) -> Dict[str, torch.Tensor]:
+    def _default_input_mapper(
+            self, ctx: InputContext,
+            data: ImageFeatureData) -> Dict[str, torch.Tensor]:
+        model_config = ctx.model_config
         image_features = data.image_features.to(model_config.dtype)
 
         return {"image_features": image_features}
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 4789ce5ce..abc88e4f9 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,46 +1,35 @@
 import functools
-from typing import (TYPE_CHECKING, Any, Callable, Dict, Optional, Sequence,
-                    Tuple, Type, TypeVar)
+from typing import Any, Optional, Sequence, Type, TypeVar
 
-from vllm.config import ModelConfig, VisionLanguageConfig
+from torch import nn
+
+from vllm.config import ModelConfig
 from vllm.logger import init_logger
 
-from .base import MultiModalData, MultiModalPlugin
+from .base import MultiModalData, MultiModalInputMapper, MultiModalPlugin
 from .image import (ImageFeatureData, ImageFeaturePlugin, ImagePixelData,
                     ImagePixelPlugin)
 
-if TYPE_CHECKING:
-    import torch
-    from torch import nn
-
-    from vllm.sequence import SequenceData
-
 logger = init_logger(__name__)
 
 D = TypeVar("D", bound=MultiModalData)
-N = TypeVar("N", bound=Type["nn.Module"])
-
-MultiModalInputProcessor = Callable[[D, ModelConfig, VisionLanguageConfig],
-                                    Dict[str, "torch.Tensor"]]
-MultiModalDummyFactory = Callable[[int, ModelConfig, VisionLanguageConfig],
-                                  Tuple["SequenceData", MultiModalData]]
+N = TypeVar("N", bound=Type[nn.Module])
 
 
 class MultiModalRegistry:
     """
-    This registry is used by model runners to dispatch data processing
+    A registry to dispatch data processing
     according to its modality and the target model.
     """
 
     DEFAULT_PLUGINS = (ImageFeaturePlugin(), ImagePixelPlugin())
 
-    def __init__(self,
-                 *,
-                 plugins: Sequence[MultiModalPlugin[Any]] = DEFAULT_PLUGINS
-                 ) -> None:
+    def __init__(
+        self,
+        *,
+        plugins: Sequence[MultiModalPlugin[Any]] = DEFAULT_PLUGINS,
+    ) -> None:
         self._plugins_by_data_type = {p.get_data_type(): p for p in plugins}
-        self._dummy_factories_by_model_type: Dict[Type["nn.Module"],
-                                                  MultiModalDummyFactory] = {}
 
     def register_plugin(self, plugin: MultiModalPlugin[Any]) -> None:
         data_type = plugin.get_data_type()
@@ -62,95 +51,53 @@ class MultiModalRegistry:
         msg = f"Unknown multi-modal data type: {data_type}"
         raise NotImplementedError(msg)
 
-    def register_dummy_data(self, factory: MultiModalDummyFactory):
-        """
-        Register a dummy data factory to a model class.
-
-        During memory profiling, the provided function is invoked to create
-        dummy data to be inputted into the model. The modality and shape of
-        the dummy data should be an upper bound of what the model would receive
-        at inference time.
-        """
-
-        def wrapper(model_cls: N) -> N:
-            if model_cls in self._dummy_factories_by_model_type:
-                logger.warning(
-                    "Model class %s already has dummy data "
-                    "registered to %s. It is overwritten by the new one.",
-                    model_cls, self)
-
-            self._dummy_factories_by_model_type[model_cls] = factory
-
-            return model_cls
-
-        return wrapper
-
-    def dummy_data_for_profiling(self, seq_len: int, model_config: ModelConfig,
-                                 vlm_config: VisionLanguageConfig):
-        """Create dummy data for memory profiling."""
-        model_cls = MultiModalPlugin.get_model_cls(model_config)
-        dummy_factory = self._dummy_factories_by_model_type.get(model_cls)
-        if dummy_factory is None:
-            msg = f"No dummy data defined for model class: {model_cls}"
-            raise NotImplementedError(msg)
-
-        return dummy_factory(seq_len, model_config, vlm_config)
-
-    def register_input(
-            self,
-            data_type: Type[D],
-            processor: Optional[MultiModalInputProcessor[D]] = None):
+    def register_input_mapper(
+        self,
+        data_type: Type[D],
+        mapper: Optional[MultiModalInputMapper[D]] = None,
+    ):
         """
-        Register an input processor for a specific modality to a model class.
+        Register an input mapper for a specific modality to a model class.
 
-        See :meth:`MultiModalPlugin.register_input_processor` for more details.
+        See :meth:`MultiModalPlugin.register_input_mapper` for more details.
         """
         return self._get_plugin_for_data_type(data_type) \
-            .register_input_processor(processor)
+            .register_input_mapper(mapper)
 
-    def register_image_pixel_input(
-            self,
-            processor: Optional[
-                MultiModalInputProcessor[ImagePixelData]] = None):
+    def register_image_pixel_input_mapper(
+        self,
+        mapper: Optional[MultiModalInputMapper[ImagePixelData]] = None,
+    ):
         """
-        Register an input processor for image pixel data to a model class.
+        Register an input mapper for image pixel data to a model class.
 
-        See :meth:`MultiModalPlugin.register_input_processor` for more details.
+        See :meth:`MultiModalPlugin.register_input_mapper` for more details.
         """
-        return self.register_input(ImagePixelData, processor)
+        return self.register_input_mapper(ImagePixelData, mapper)
 
-    def register_image_feature_input(
+    def register_image_feature_input_mapper(
         self,
-        processor: Optional[
-            MultiModalInputProcessor[ImageFeatureData]] = None):
+        mapper: Optional[MultiModalInputMapper[ImageFeatureData]] = None,
+    ):
         """
-        Register an input processor for image feature data to a model class.
+        Register an input mapper for image feature data to a model class.
 
-        See :meth:`MultiModalPlugin.register_input_processor` for more details.
+        See :meth:`MultiModalPlugin.register_input_mapper` for more details.
         """
-        return self.register_input(ImageFeatureData, processor)
+        return self.register_input_mapper(ImageFeatureData, mapper)
 
-    def process_input(self, data: MultiModalData, model_config: ModelConfig,
-                      vlm_config: VisionLanguageConfig):
+    def map_input(self, model_config: ModelConfig, data: MultiModalData):
         """
-        Apply an input processor to a :class:`~MultiModalData` instance passed
+        Apply an input mapper to a :class:`~MultiModalData` instance passed
         to the model.
         
-        See :meth:`MultiModalPlugin.process_input` for more details.
+        See :meth:`MultiModalPlugin.map_input` for more details.
         """
         return self._get_plugin_for_data_type(type(data)) \
-            .process_input(data, model_config, vlm_config)
+            .map_input(model_config, data)
 
-    def create_input_processor(self, model_config: ModelConfig,
-                               vlm_config: VisionLanguageConfig):
+    def create_input_mapper(self, model_config: ModelConfig):
         """
-        Create an input processor (see :meth:`process_input`) for a
-        specific model.
+        Create an input mapper (see :meth:`map_input`) for a specific model.
         """
-        return functools.partial(self.process_input,
-                                 model_config=model_config,
-                                 vlm_config=vlm_config)
-
-
-MULTIMODAL_REGISTRY = MultiModalRegistry()
-"""The global :class:`~MultiModalRegistry` which is used by model runners."""
+        return functools.partial(self.map_input, model_config)
diff --git a/vllm/sequence.py b/vllm/sequence.py
index c618c3692..a50aaf420 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -8,12 +8,12 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import torch
 
-from vllm.inputs import LLMInputs
 from vllm.lora.request import LoRARequest
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 
 if TYPE_CHECKING:
+    from vllm.inputs import LLMInputs
     from vllm.multimodal import MultiModalData
     from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
 
@@ -221,7 +221,7 @@ class Sequence:
     def __init__(
         self,
         seq_id: int,
-        inputs: LLMInputs,
+        inputs: "LLMInputs",
         block_size: int,
         eos_token_id: Optional[int] = None,
         lora_request: Optional[LoRARequest] = None,
diff --git a/vllm/transformers_utils/image_processor.py b/vllm/transformers_utils/image_processor.py
index 3239b1d0c..2bb5215d4 100644
--- a/vllm/transformers_utils/image_processor.py
+++ b/vllm/transformers_utils/image_processor.py
@@ -1,4 +1,3 @@
-from functools import lru_cache
 from typing import Optional
 
 from transformers import AutoImageProcessor
@@ -40,6 +39,3 @@ def get_image_processor(
             raise e
 
     return processor
-
-
-cached_get_image_processor = lru_cache(get_image_processor)
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 148332f34..e689f485e 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -110,15 +110,9 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
             self.block_size,
         )
 
-        # Create processor for multi-modal data
-        if self.vision_language_config is not None:
-            self.multi_modal_input_processor = MULTIMODAL_REGISTRY \
-                .create_input_processor(
-                    self.model_config,
-                    self.vision_language_config,
-                )
-        else:
-            self.multi_modal_input_processor = None
+        # Multi-modal data support
+        self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
+            .create_input_mapper(self.model_config)
 
         # Lazy initialization.
         self.model: nn.Module  # Set after init_Model
@@ -168,13 +162,7 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
 
             mm_data = seq_group_metadata.multi_modal_data
             if mm_data is not None:
-                # Process multi-modal data
-                if self.multi_modal_input_processor is None:
-                    raise ValueError(
-                        "Multi-modal inputs are only supported by "
-                        "vision language models.")
-
-                mm_kwargs = self.multi_modal_input_processor(mm_data)
+                mm_kwargs = self.multi_modal_input_mapper(mm_data)
                 for k, v in mm_kwargs.items():
                     multi_modal_kwargs_list[k].append(v)
 
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 181442490..93a10070d 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -15,6 +15,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, ParallelConfig, SchedulerConfig,
                          VisionLanguageConfig)
 from vllm.distributed.parallel_state import graph_capture
+from vllm.inputs import INPUT_REGISTRY
 from vllm.logger import init_logger
 from vllm.lora.layers import LoRAMapping
 from vllm.lora.request import LoRARequest
@@ -25,7 +26,7 @@ from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 from vllm.model_executor.models.interfaces import supports_lora
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
+from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.utils import (CudaMemoryProfiler, get_kv_cache_torch_dtype, is_hip,
                         is_pin_memory_available, make_tensor_with_pad)
 from vllm.worker.model_runner_base import (
@@ -191,15 +192,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
             self.block_size,
         ) if num_attn_heads else None
 
-        # Create processor for multi-modal data
-        if self.vision_language_config is not None:
-            self.multi_modal_input_processor = MULTIMODAL_REGISTRY \
-                .create_input_processor(
-                    self.model_config,
-                    self.vision_language_config,
-                )
-        else:
-            self.multi_modal_input_processor = None
+        # Multi-modal data support
+        self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
+            .create_input_mapper(self.model_config)
 
         # Lazy initialization
         self.model: nn.Module  # Set after load_model
@@ -506,12 +501,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                 mm_data = seq_group_metadata.multi_modal_data
                 if mm_data is not None:
                     # Process multi-modal data
-                    if self.multi_modal_input_processor is None:
-                        raise ValueError(
-                            "Multi-modal inputs are only supported by "
-                            "vision language models.")
-
-                    mm_kwargs = self.multi_modal_input_processor(mm_data)
+                    mm_kwargs = self.multi_modal_input_mapper(mm_data)
                     for k, v in mm_kwargs.items():
                         multi_modal_kwargs_list[k].append(v)
 
@@ -764,12 +754,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
             seq_len = (max_num_batched_tokens // max_num_seqs +
                        (group_id < max_num_batched_tokens % max_num_seqs))
 
-            if vlm_config is None:
-                seq_data = SequenceData([0] * seq_len)
-                dummy_multi_modal_data = None
-            else:
-                seq_data, dummy_multi_modal_data = MULTIMODAL_REGISTRY \
-                    .dummy_data_for_profiling(seq_len, model_config, vlm_config)
+            seq_data, dummy_multi_modal_data = INPUT_REGISTRY \
+                .dummy_data_for_profiling(model_config, seq_len)
+            assert len(seq_data.prompt_token_ids) == seq_len
 
             seq = SequenceGroupMetadata(
                 request_id=str(group_id),
-- 
GitLab


From 593263440967a8065d528acb3ff88274fc22c778 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Fri, 28 Jun 2024 09:36:12 -0400
Subject: [PATCH 183/376] Unmark fused_moe config json file as executable
 (#5960)

---
 .../configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json       | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json

diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json
old mode 100755
new mode 100644
-- 
GitLab


From 57f09a419c04ecec4718ea9d5be1e6f4a8cc336e Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Fri, 28 Jun 2024 17:50:16 +0400
Subject: [PATCH 184/376] [Hardware][Intel] OpenVINO vLLM backend (#5379)

---
 .buildkite/run-openvino-test.sh               |  14 +
 Dockerfile.openvino                           |  26 ++
 benchmarks/benchmark_latency.py               |   7 +-
 benchmarks/benchmark_throughput.py            |   7 +-
 .../getting_started/openvino-installation.rst |  95 +++++
 docs/source/index.rst                         |   1 +
 requirements-openvino.txt                     |   9 +
 setup.py                                      |  11 +-
 tests/kernels/test_attention_selector.py      |   9 +-
 vllm/attention/backends/openvino.py           | 101 +++++
 vllm/attention/selector.py                    |  12 +-
 vllm/config.py                                |   8 +-
 vllm/engine/arg_utils.py                      |  14 +-
 vllm/engine/async_llm_engine.py               |   6 +
 vllm/engine/llm_engine.py                     |   3 +
 vllm/envs.py                                  |  22 +-
 vllm/executor/openvino_executor.py            | 163 ++++++++
 vllm/model_executor/layers/sampler.py         |   4 +-
 vllm/model_executor/model_loader/openvino.py  | 210 +++++++++++
 vllm/utils.py                                 |  11 +-
 vllm/worker/openvino_model_runner.py          | 330 ++++++++++++++++
 vllm/worker/openvino_worker.py                | 353 ++++++++++++++++++
 22 files changed, 1393 insertions(+), 23 deletions(-)
 create mode 100755 .buildkite/run-openvino-test.sh
 create mode 100644 Dockerfile.openvino
 create mode 100644 docs/source/getting_started/openvino-installation.rst
 create mode 100644 requirements-openvino.txt
 create mode 100644 vllm/attention/backends/openvino.py
 create mode 100644 vllm/executor/openvino_executor.py
 create mode 100644 vllm/model_executor/model_loader/openvino.py
 create mode 100644 vllm/worker/openvino_model_runner.py
 create mode 100644 vllm/worker/openvino_worker.py

diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh
new file mode 100755
index 000000000..70e56596c
--- /dev/null
+++ b/.buildkite/run-openvino-test.sh
@@ -0,0 +1,14 @@
+# This script build the OpenVINO docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t openvino-test -f Dockerfile.openvino .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f openvino-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image and launch offline inference
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
diff --git a/Dockerfile.openvino b/Dockerfile.openvino
new file mode 100644
index 000000000..9861997b4
--- /dev/null
+++ b/Dockerfile.openvino
@@ -0,0 +1,26 @@
+# The vLLM Dockerfile is used to construct vLLM image that can be directly used
+# to run the OpenAI compatible server.
+
+FROM ubuntu:22.04 AS dev
+
+RUN apt-get update -y && \
+    apt-get install -y python3-pip git
+WORKDIR /workspace
+
+# copy requirements
+COPY requirements-build.txt /workspace/vllm/
+COPY requirements-common.txt /workspace/vllm/
+COPY requirements-openvino.txt /workspace/vllm/
+
+COPY vllm/ /workspace/vllm/vllm
+COPY setup.py /workspace/vllm/
+
+# install build requirements
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
+# build vLLM with OpenVINO backend
+RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
+
+COPY examples/ /workspace/vllm/examples
+COPY benchmarks/ /workspace/vllm/benchmarks
+
+CMD ["/bin/bash"]
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index f3d00e456..a46ee1581 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -207,9 +207,10 @@ if __name__ == '__main__':
     parser.add_argument(
         "--device",
         type=str,
-        default="cuda",
-        choices=["cuda", "cpu", "tpu", "xpu"],
-        help='device type for vLLM execution, supporting CUDA and CPU.')
+        default="auto",
+        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
+        help='device type for vLLM execution, supporting CUDA, OpenVINO and '
+        'CPU.')
     parser.add_argument('--block-size',
                         type=int,
                         default=16,
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 2c6beb4e8..a52e67bbb 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -349,9 +349,10 @@ if __name__ == "__main__":
     parser.add_argument(
         "--device",
         type=str,
-        default="cuda",
-        choices=["cuda", "cpu", "tpu", "xpu"],
-        help='device type for vLLM execution, supporting CUDA and CPU.')
+        default="auto",
+        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
+        help='device type for vLLM execution, supporting CUDA, OpenVINO and '
+        'CPU.')
     parser.add_argument(
         "--enable-prefix-caching",
         action='store_true',
diff --git a/docs/source/getting_started/openvino-installation.rst b/docs/source/getting_started/openvino-installation.rst
new file mode 100644
index 000000000..0d8e0b680
--- /dev/null
+++ b/docs/source/getting_started/openvino-installation.rst
@@ -0,0 +1,95 @@
+.. _installation_openvino:
+
+Installation with OpenVINO
+==========================
+
+vLLM powered by OpenVINO supports all LLM models from :doc:`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support. OpenVINO vLLM backend supports the following advanced vLLM features:
+
+- Prefix caching (``--enable-prefix-caching``)
+- Chunked prefill (``--enable-chunked-prefill``)
+
+**Table of contents**:
+
+- :ref:`Requirements <openvino_backend_requirements>`
+- :ref:`Quick start using Dockerfile <openvino_backend_quick_start_dockerfile>`
+- :ref:`Build from source <install_openvino_backend_from_source>`
+- :ref:`Performance tips <openvino_backend_performance_tips>`
+- :ref:`Limitations <openvino_backend_limitations>`
+
+.. _openvino_backend_requirements:
+
+Requirements
+------------
+
+* OS: Linux
+* Instruction set architecture (ISA) requirement: at least AVX2.
+
+.. _openvino_backend_quick_start_dockerfile:
+
+Quick start using Dockerfile
+----------------------------
+
+.. code-block:: console
+
+    $ docker build -f Dockerfile.openvino -t vllm-openvino-env .
+    $ docker run -it --rm vllm-openvino-env
+
+.. _install_openvino_backend_from_source:
+
+Install from source
+-------------------
+
+- First, install Python. For example, on Ubuntu 22.04, you can run:
+
+  .. code-block:: console
+
+      $ sudo apt-get update  -y
+      $ sudo apt-get install python3
+
+- Second, install prerequisites vLLM OpenVINO backend installation:
+
+  .. code-block:: console
+
+      $ pip install --upgrade pip
+      $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
+
+- Finally, install vLLM with OpenVINO backend: 
+
+  .. code-block:: console
+
+      $ PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
+
+.. _openvino_backend_performance_tips:
+
+Performance tips
+----------------
+
+vLLM OpenVINO backend uses the following environment variables to control behavior:
+
+- ``VLLM_OPENVINO_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_OPENVINO_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
+
+- ``VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8`` to control KV cache precision. By default, FP16 / BF16 is used depending on platform.
+
+- ``VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON`` to enable U8 weights compression during model loading stage. By default, compression is turned off.
+
+To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (``--enable-chunked-prefill``). Based on the experiments, the recommended batch size is ``256`` (``--max-num-batched-tokens``)
+
+OpenVINO best known configuration is:
+
+.. code-block:: console
+
+    $ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
+        python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256
+
+.. _openvino_backend_limitations:
+
+Limitations
+-----------
+
+- LoRA serving is not supported.
+
+- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration.
+
+- Tensor and pipeline parallelism are not currently enabled in vLLM integration.
+
+- Speculative sampling is not tested within vLLM integration.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 3a9f5a3d8..8fd25ce82 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -63,6 +63,7 @@ Documentation
 
    getting_started/installation
    getting_started/amd-installation
+   getting_started/openvino-installation
    getting_started/cpu-installation
    getting_started/neuron-installation
    getting_started/tpu-installation
diff --git a/requirements-openvino.txt b/requirements-openvino.txt
new file mode 100644
index 000000000..e555d5257
--- /dev/null
+++ b/requirements-openvino.txt
@@ -0,0 +1,9 @@
+# Common dependencies
+-r requirements-common.txt
+
+# OpenVINO dependencies
+torch >= 2.1.2
+openvino ~= 2024.3.0.dev
+optimum-intel[openvino] >= 1.17.2
+
+triton >= 2.2.0  # FIXME(woosuk): This is a hack to avoid import error.
diff --git a/setup.py b/setup.py
index b2ae6def8..067ad13fe 100644
--- a/setup.py
+++ b/setup.py
@@ -233,6 +233,10 @@ def _is_cpu() -> bool:
     return VLLM_TARGET_DEVICE == "cpu"
 
 
+def _is_openvino() -> bool:
+    return VLLM_TARGET_DEVICE == "openvino"
+
+
 def _is_xpu() -> bool:
     return VLLM_TARGET_DEVICE == "xpu"
 
@@ -337,6 +341,8 @@ def get_vllm_version() -> str:
         if neuron_version != MAIN_CUDA_VERSION:
             neuron_version_str = neuron_version.replace(".", "")[:3]
             version += f"+neuron{neuron_version_str}"
+    elif _is_openvino():
+        version += "+openvino"
     elif _is_tpu():
         version += "+tpu"
     elif _is_cpu():
@@ -388,6 +394,8 @@ def get_requirements() -> List[str]:
         requirements = _read_requirements("requirements-rocm.txt")
     elif _is_neuron():
         requirements = _read_requirements("requirements-neuron.txt")
+    elif _is_openvino():
+        requirements = _read_requirements("requirements-openvino.txt")
     elif _is_tpu():
         requirements = _read_requirements("requirements-tpu.txt")
     elif _is_cpu():
@@ -396,7 +404,8 @@ def get_requirements() -> List[str]:
         requirements = _read_requirements("requirements-xpu.txt")
     else:
         raise ValueError(
-            "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
+            "Unsupported platform, please use CUDA, ROCm, Neuron, "
+            "OpenVINO, or CPU.")
     return requirements
 
 
diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py
index 79e03c747..8e6c50666 100644
--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -9,8 +9,8 @@ from vllm.attention.selector import which_attn_to_use
 
 
 @pytest.mark.parametrize(
-    "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"])
-@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
+    "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
+@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
 def test_env(name: str, device: str, monkeypatch):
     """Test that the attention selector can be set via environment variable.
     Note that we do not test FlashAttn because it is the default backend.
@@ -28,6 +28,11 @@ def test_env(name: str, device: str, monkeypatch):
             backend = which_attn_to_use(8, 16, 8, None, torch.float16,
                                         torch.float16, 16)
         assert backend.name == "ROCM_FLASH"
+    elif device == "openvino":
+        with patch("vllm.attention.selector.is_openvino", return_value=True):
+            backend = which_attn_to_use(8, 16, 8, None, torch.float16,
+                                        torch.float16, 16)
+        assert backend.name == "OPENVINO"
     else:
         backend = which_attn_to_use(8, 16, 8, None, torch.float16,
                                     torch.float16, 16)
diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py
new file mode 100644
index 000000000..0f21b50ad
--- /dev/null
+++ b/vllm/attention/backends/openvino.py
@@ -0,0 +1,101 @@
+from dataclasses import dataclass
+from typing import List, Tuple
+
+import openvino as ov
+import torch
+
+from vllm.attention.backends.abstract import (AttentionBackend,
+                                              AttentionMetadata)
+
+
+class OpenVINOAttentionBackend(AttentionBackend):
+
+    @staticmethod
+    def get_name() -> str:
+        return "openvino"
+
+    @staticmethod
+    def get_impl_cls():
+        # OpenVINO implements PagedAttention as part of the Optimum
+        # exported model
+        raise NotImplementedError
+
+    @staticmethod
+    def make_metadata(*args, **kwargs) -> "AttentionMetadata":
+        raise NotImplementedError
+
+    @staticmethod
+    def make_openvino_metadata(*args, **kwargs) -> "OpenVINOAttentionMetadata":
+        return OpenVINOAttentionMetadata(*args, **kwargs)
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> Tuple[int, ...]:
+        return (2, num_blocks, num_kv_heads, block_size, head_size)
+
+    @staticmethod
+    def swap_blocks(
+        src_kv_cache: ov.Tensor,
+        dst_kv_cache: ov.Tensor,
+        src_to_dst: torch.Tensor,
+    ) -> None:
+        # OpenVINO currently supports only CPU, which does not require
+        # swap of KV cache blocks
+        raise NotImplementedError
+
+    @staticmethod
+    def copy_blocks(
+        kv_caches: List[Tuple[ov.Tensor, ov.Tensor]],
+        src_to_dists: List[Tuple[int, int]],
+    ) -> None:
+        for src, dst in src_to_dists:
+            for key_cache, value_cache in kv_caches:
+                key_cache.data[dst, :] = key_cache.data[src, :]
+                value_cache.data[dst, :] = value_cache.data[src, :]
+
+
+@dataclass
+class OpenVINOAttentionMetadata:
+    """Metadata for OpenVINOAttentionBackend.
+
+    Basic terms used below:
+    - batch_size_in_sequences - total number of sequences to execute​
+    - prompt_lens – per sequence size number of scheduled tokens​
+    - batch_size_in_tokens = sum(prompt_lens)​
+    - max_context_len = max(context_lens)​
+    - max_num_blocks = div_up(max_context_len / BLOCK_SIZE)​
+    - num_blocks – total number of blocks in block_indices​
+    """
+
+    # Describes past KV cache size for each sequence within a batch
+    # Shape: [batch_size_in_sequences]
+    # Type: i32​
+    past_lens: torch.Tensor
+
+    # Describes start indices of input / speculative tokens from
+    # current sequences within a batch sequence​
+    # Shape: [batch_size_in_sequences + 1]​
+    # Type: i32
+    subsequence_begins: torch.Tensor
+
+    # Describes block tables for each sequence within a batch​ -
+    # indices along 0th dimension in key_cache and value_cache inputs​
+    # Shape: [num_blocks]
+    # Type: i32​
+    block_indices: torch.Tensor
+
+    # Describes block tables for each sequence within a batch​ -
+    # for i-th element, it is an index in block_indices with the
+    # first block belonging to i-th sequence​
+    # Shape: [batch_size_in_sequences + 1]
+    # Type: i32​
+    block_indices_begins: torch.Tensor
+
+    # Describes max context length
+    # Shape: scalar
+    # Type: i32
+    max_context_len: torch.Tensor
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 1d56d87cc..96f88bbf4 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -7,7 +7,7 @@ import torch
 import vllm.envs as envs
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.logger import init_logger
-from vllm.utils import is_cpu, is_hip, is_tpu, is_xpu
+from vllm.utils import is_cpu, is_hip, is_openvino, is_tpu, is_xpu
 
 logger = init_logger(__name__)
 
@@ -17,6 +17,7 @@ class _Backend(enum.Enum):
     XFORMERS = enum.auto()
     ROCM_FLASH = enum.auto()
     TORCH_SDPA = enum.auto()
+    OPENVINO = enum.auto()
     FLASHINFER = enum.auto()
     PALLAS = enum.auto()
     IPEX = enum.auto()
@@ -64,6 +65,10 @@ def get_attn_backend(
         logger.info("Using Torch SDPA backend.")
         from vllm.attention.backends.torch_sdpa import TorchSDPABackend
         return TorchSDPABackend
+    elif backend == _Backend.OPENVINO:
+        logger.info("Using OpenVINO Attention backend.")
+        from vllm.attention.backends.openvino import OpenVINOAttentionBackend
+        return OpenVINOAttentionBackend
     elif backend == _Backend.IPEX:
         assert is_xpu(), RuntimeError(
             "IPEX attention backend is only used for the XPU device.")
@@ -113,6 +118,11 @@ def which_attn_to_use(
             logger.info("Cannot use %s backend on CPU.", selected_backend)
         return _Backend.TORCH_SDPA
 
+    if is_openvino():
+        if selected_backend != _Backend.OPENVINO:
+            logger.info("Cannot use %s backend on OpenVINO.", selected_backend)
+        return _Backend.OPENVINO
+
     if is_xpu():
         if selected_backend != _Backend.IPEX:
             logger.info("Cannot use %s backend on XPU.", selected_backend)
diff --git a/vllm/config.py b/vllm/config.py
index 6adeaf420..31d30cfa7 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -14,8 +14,8 @@ from vllm.model_executor.models import ModelRegistry
 from vllm.tracing import is_otel_installed
 from vllm.transformers_utils.config import get_config, get_hf_text_config
 from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu,
-                        is_hip, is_neuron, is_tpu, is_xpu, print_warning_once,
-                        update_environment_variables)
+                        is_hip, is_neuron, is_openvino, is_tpu, is_xpu,
+                        print_warning_once, update_environment_variables)
 
 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
@@ -781,6 +781,8 @@ class DeviceConfig:
             # Automated device type detection
             if is_neuron():
                 self.device_type = "neuron"
+            elif is_openvino():
+                self.device_type = "openvino"
             elif is_tpu():
                 self.device_type = "tpu"
             elif is_cpu():
@@ -796,7 +798,7 @@ class DeviceConfig:
             self.device_type = device
 
         # Some device types require processing inputs on CPU
-        if self.device_type in ["neuron"]:
+        if self.device_type in ["neuron", "openvino"]:
             self.device = torch.device("cpu")
         elif self.device_type in ["tpu"]:
             self.device = None
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index c392155e8..f9d089091 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -504,12 +504,14 @@ class EngineArgs:
                   'Enabling this will use the fully sharded layers. '
                   'At high sequence length, max rank or '
                   'tensor parallel size, this is likely faster.'))
-        parser.add_argument(
-            "--device",
-            type=str,
-            default=EngineArgs.device,
-            choices=["auto", "cuda", "neuron", "cpu", "tpu", "xpu"],
-            help='Device type for vLLM execution.')
+        parser.add_argument("--device",
+                            type=str,
+                            default=EngineArgs.device,
+                            choices=[
+                                "auto", "cuda", "neuron", "cpu", "openvino",
+                                "tpu", "xpu"
+                            ],
+                            help='Device type for vLLM execution.')
 
         # Related to Vision-language models such as llava
         parser = EngineArgs.add_cli_args_for_vlm(parser)
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 848e05f03..7db3bb28c 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -393,6 +393,12 @@ class AsyncLLMEngine:
                 "Distributed execution is not supported with the CPU backend.")
             from vllm.executor.cpu_executor import CPUExecutorAsync
             executor_class = CPUExecutorAsync
+        elif engine_config.device_config.device_type == "openvino":
+            assert distributed_executor_backend is None, (
+                "Distributed execution is not supported with "
+                "the OpenVINO backend.")
+            from vllm.executor.openvino_executor import OpenVINOExecutorAsync
+            executor_class = OpenVINOExecutorAsync
         elif engine_config.device_config.device_type == "xpu":
             if distributed_executor_backend is None:
                 from vllm.executor.xpu_executor import XPUExecutorAsync
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 9b720d613..fde18f60e 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -363,6 +363,9 @@ class LLMEngine:
         elif engine_config.device_config.device_type == "cpu":
             from vllm.executor.cpu_executor import CPUExecutor
             executor_class = CPUExecutor
+        elif engine_config.device_config.device_type == "openvino":
+            from vllm.executor.openvino_executor import OpenVINOExecutor
+            executor_class = OpenVINOExecutor
         elif engine_config.device_config.device_type == "xpu":
             if distributed_executor_backend == "ray":
                 initialize_ray_cluster(engine_config.parallel_config)
diff --git a/vllm/envs.py b/vllm/envs.py
index 49277e2d3..e8257535f 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -28,6 +28,9 @@ if TYPE_CHECKING:
     VLLM_TRACE_FUNCTION: int = 0
     VLLM_ATTENTION_BACKEND: Optional[str] = None
     VLLM_CPU_KVCACHE_SPACE: int = 0
+    VLLM_OPENVINO_KVCACHE_SPACE: int = 0
+    VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None
+    VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
     VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/"
     VLLM_USE_RAY_COMPILED_DAG: bool = False
     VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
@@ -49,7 +52,8 @@ environment_variables: Dict[str, Callable[[], Any]] = {
 
     # ================== Installation Time Env Vars ==================
 
-    # Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
+    # Target device of vLLM, supporting [cuda (by default),
+    # rocm, neuron, cpu, openvino]
     "VLLM_TARGET_DEVICE":
     lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),
 
@@ -208,6 +212,22 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     "VLLM_CPU_KVCACHE_SPACE":
     lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),
 
+    # OpenVINO key-value cache space
+    # default is 4GB
+    "VLLM_OPENVINO_KVCACHE_SPACE":
+    lambda: int(os.getenv("VLLM_OPENVINO_KVCACHE_SPACE", "0")),
+
+    # OpenVINO KV cache precision
+    # default is bf16 if natively supported by platform, otherwise f16
+    # To enable KV cache compression, please, explicitly specify u8
+    "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION":
+    lambda: os.getenv("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION", None),
+
+    # Enables weights compression during model export via HF Optimum
+    # default is False
+    "VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS":
+    lambda: bool(os.getenv("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False)),
+
     # If the env var is set, it uses the Ray's compiled DAG API
     # which optimizes the control plane overhead.
     # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py
new file mode 100644
index 000000000..8af375371
--- /dev/null
+++ b/vllm/executor/openvino_executor.py
@@ -0,0 +1,163 @@
+from typing import List, Set, Tuple
+
+import openvino as ov
+import openvino.properties.hint as hints
+import torch
+
+import vllm.envs as envs
+from vllm.config import CacheConfig, ModelConfig
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        make_async)
+
+logger = init_logger(__name__)
+
+
+class OpenVINOExecutor(ExecutorBase):
+
+    def _init_executor(self) -> None:
+        assert self.device_config.device_type == "openvino"
+        assert self.lora_config is None, "OpenVINO backend doesn't support LoRA"
+        self.model_config = _verify_and_get_model_config(self.model_config)
+        self.cache_config = _verify_and_get_cache_config(self.cache_config)
+
+        # Instantiate the worker and load the model to CPU.
+        self._init_worker()
+
+    def _init_worker(self):
+        from vllm.worker.openvino_worker import OpenVINOWorker
+
+        assert (
+            self.parallel_config.world_size == 1
+        ), "OpenVINOExecutor only supports single CPU socket currently."
+
+        distributed_init_method = get_distributed_init_method(
+            get_ip(), get_open_port())
+        self.driver_worker = OpenVINOWorker(
+            model_config=self.model_config,
+            parallel_config=self.parallel_config,
+            scheduler_config=self.scheduler_config,
+            device_config=self.device_config,
+            cache_config=self.cache_config,
+            load_config=self.load_config,
+            local_rank=0,
+            rank=0,
+            distributed_init_method=distributed_init_method,
+            lora_config=self.lora_config,
+            vision_language_config=self.vision_language_config,
+            kv_cache_dtype=self.cache_config.cache_dtype,
+            is_driver_worker=True,
+        )
+        self.driver_worker.init_device()
+        self.driver_worker.load_model()
+
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Determine the number of available KV blocks by invoking the
+        underlying worker.
+        """
+        return self.driver_worker.determine_num_available_blocks()
+
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        """Initialize the KV cache by invoking the underlying worker."""
+        # NOTE: We log here to avoid multiple logs when number of workers is
+        # greater than one. We could log in the engine, but not all executors
+        # have GPUs.
+        # NOTE: `cpu block` for OpenVINO backend is located on CPU memory but is
+        # referred as `gpu block`. Because we want to reuse the existing block
+        # management procedure.
+        logger.info("# CPU blocks: %d", num_gpu_blocks)
+        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
+
+    def execute_model(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        output = self.driver_worker.execute_model(execute_model_req)
+        return output
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        return self.driver_worker.add_lora(lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        return self.driver_worker.remove_lora(lora_id)
+
+    def pin_lora(self, lora_id: int) -> bool:
+        return self.driver_worker.pin_lora(lora_id)
+
+    def list_loras(self) -> Set[int]:
+        return self.driver_worker.list_loras()
+
+    def check_health(self) -> None:
+        # OpenVINOExecutor will always be healthy as long as
+        # it's running.
+        return
+
+
+class OpenVINOExecutorAsync(OpenVINOExecutor, ExecutorAsyncBase):
+
+    async def execute_model_async(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        output = await make_async(self.driver_worker.execute_model
+                                  )(execute_model_req=execute_model_req, )
+        return output
+
+    async def check_health_async(self) -> None:
+        # OpenVINOExecutor will always be healthy as long as
+        # it's running.
+        return
+
+
+def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
+    if config.dtype != torch.float32:
+        logger.warning(
+            f"Only float32 dtype is supported on OpenVINO, casting from {config.dtype}."  # noqa: G004, E501
+        )
+        config.dtype = torch.float32
+    if not config.enforce_eager:
+        logger.warning(
+            "CUDA graph is not supported on OpenVINO backend, fallback to the "
+            "eager mode.")
+        config.enforce_eager = True
+    return config
+
+
+def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
+    if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
+        logger.info("KV cache type is overried to u8 via "
+                    "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.")
+        config.cache_dtype = ov.Type.u8
+    else:
+        core = ov.Core()
+        inference_precision = core.get_property("CPU",
+                                                hints.inference_precision)
+        if inference_precision == ov.Type.bf16:
+            config.cache_dtype = ov.Type.bf16
+        else:
+            config.cache_dtype = ov.Type.f16
+
+    if config.block_size != 32:
+        logger.info(
+            f"OpenVINO optimal block size is 32, overriding currently set {config.block_size}"  # noqa: G004, E501
+        )
+        config.block_size = 32
+
+    kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE
+    if kv_cache_space >= 0:
+        _GB = 1 << 30
+        if kv_cache_space == 0:
+            config.openvino_kvcache_space_bytes = 4 * _GB  # type: ignore
+            logger.warning(
+                "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
+                "for OpenVINO backend is not set, using 4 by default.")
+        else:
+            config.openvino_kvcache_space_bytes = kv_cache_space * _GB  # type: ignore
+    else:
+        raise RuntimeError(
+            "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
+            f" {kv_cache_space}, expect a positive integer value.")
+
+    return config
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py
index e07360a2f..6d00ea64f 100644
--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -679,7 +679,7 @@ def _get_ranks(x: torch.Tensor, indices: torch.Tensor) -> torch.Tensor:
 
     Returns:
         torch.Tensor: 1D tensor of shape (N,) where N is the no. of tokens.
-                    Each element in the returned tensor represents the rank 
+                    Each element in the returned tensor represents the rank
                     of the chosen token in the input logprob tensor.
     """
     vals = x[torch.arange(0, len(x), device=x.device, dtype=indices.dtype),
@@ -965,7 +965,7 @@ def _modify_greedy_probs_inplace(logprobs: torch.Tensor, probs: torch.Tensor,
                 distribution.
             - Greedy sampling performs `argmax` to obtain the token with the
                 highest likelihood.
-    
+
     Ignoring greedy sampling for a moment, we find that the computed probability
     distribution has the following property: we can sample from it independently
     and find that the token sampled by the Sampler has a frequency corresponding
diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py
new file mode 100644
index 000000000..5c522a617
--- /dev/null
+++ b/vllm/model_executor/model_loader/openvino.py
@@ -0,0 +1,210 @@
+# ruff: noqa: SIM117
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+import openvino as ov
+import torch
+from huggingface_hub import HfApi
+from openvino._offline_transformations import paged_attention_transformation
+from optimum.intel import OVModelForCausalLM
+from torch import nn
+
+import vllm.envs as envs
+from vllm.attention.backends.openvino import OpenVINOAttentionMetadata
+from vllm.config import DeviceConfig, ModelConfig
+from vllm.logger import init_logger
+from vllm.model_executor.layers.logits_processor import (LogitsProcessor,
+                                                         _prune_hidden_states)
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import SamplerOutput
+
+logger = init_logger(__name__)
+
+
+def _flattenize_inputs(inputs):
+    """
+    Helper function for making nested inputs flattens
+    """
+    flatten_inputs = []
+    for input_data in inputs:
+        if input_data is None:
+            continue
+        if isinstance(input_data, (list, tuple)):
+            flatten_inputs.extend(_flattenize_inputs(input_data))
+        elif isinstance(input_data, dict):
+            flatten_inputs.extend(_flattenize_inputs(list(
+                input_data.values())))
+        else:
+            flatten_inputs.append(input_data)
+    return flatten_inputs
+
+
+def _modify_cache_parameters(model: ov.Model, kv_cache_dtype: ov.Type,
+                             is_cpu: bool):
+    # Apply hardware dependent modifications to KV tensors
+    for parameter in model.get_parameters():
+        input = parameter.get_output_tensor(0)
+        input_names = input.get_names()
+        if len(input_names) != 1:
+            continue
+        input_name = next(iter(input_names))
+        shape = parameter.get_partial_shape()
+        # use real block size if available, just a placeholder
+        # to provide the expected rank
+        x_size = 1
+        num_blocks = ov.Dimension()
+        block_size = ov.Dimension()
+        head_size = ov.Dimension()
+        # TODO: Negotiate required layout with plugins (CPU is ~OK, GPU is TBD),
+        # pass more parameters to this function to set more static dimensions
+        if input_name.startswith("key_cache."):
+            cpu_shape = [num_blocks, shape[1], block_size, head_size]
+            gpu_shape = [
+                num_blocks,
+                shape[1],
+                shape[2].get_length() //
+                x_size if shape[2].is_static else ov.Dimension(),
+                block_size,
+                x_size,
+            ]
+        elif input_name.startswith("value_cache."):
+            cpu_shape = [num_blocks, shape[1], block_size, head_size]
+            gpu_shape = [num_blocks, shape[1], shape[2], block_size]
+        else:
+            continue
+        parameter.set_partial_shape(
+            ov.PartialShape(cpu_shape if is_cpu else gpu_shape))
+        parameter.set_element_type(kv_cache_dtype)
+    model.validate_nodes_and_infer_types()
+
+
+def _require_model_export(model_id, revision=None, subfolder=None):
+    model_dir = Path(model_id)
+    if subfolder is not None:
+        model_dir = model_dir / subfolder
+    if model_dir.is_dir():
+        return (not (model_dir / "openvino_model.xml").exists()
+                or not (model_dir / "openvino_model.bin").exists())
+
+    hf_api = HfApi()
+    try:
+        model_info = hf_api.model_info(model_id, revision=revision or "main")
+        normalized_subfolder = (None if subfolder is None else
+                                Path(subfolder).as_posix())
+        model_files = [
+            file.rfilename for file in model_info.siblings
+            if normalized_subfolder is None
+            or file.rfilename.startswith(normalized_subfolder)
+        ]
+        ov_model_path = ("openvino_model.xml" if normalized_subfolder is None
+                         else f"{normalized_subfolder}/openvino_model.xml")
+        return (ov_model_path not in model_files
+                or ov_model_path.replace(".xml", ".bin") not in model_files)
+    except Exception:
+        return True
+
+
+class OpenVINOCasualLM(nn.Module):
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+        kv_cache_dtype: ov.Type,
+    ) -> None:
+        super().__init__()
+        self.logits_processor = LogitsProcessor(
+            model_config.hf_config.vocab_size, logits_as_input=True)
+        self.sampler = Sampler()
+
+        export = _require_model_export(model_config.model)
+        if export:
+            logger.warning(
+                f"Provided model id {model_config.model} does not "  # noqa: G004
+                "contain OpenVINO IR, the model will be converted to IR with "
+                "default options. If you need to use specific options for "
+                "model conversion, use optimum-cli export openvino with "
+                "desired options.")
+        else:
+            logger.warning(
+                "OpenVINO IR is available for provided model id "  # noqa: G004
+                f"{model_config.model}. This IR will be used for inference "
+                "as-is, all possible options that may affect model conversion "
+                "are ignored.")
+
+        load_in_8bit = envs.VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS
+        pt_model = OVModelForCausalLM.from_pretrained(
+            model_config.model,
+            export=export,
+            compile=False,
+            load_in_8bit=load_in_8bit,
+            trust_remote_code=model_config.trust_remote_code,
+        )
+
+        paged_attention_transformation(pt_model.model)
+        _modify_cache_parameters(pt_model.model, kv_cache_dtype,
+                                 device_config.device.type == "cpu")
+
+        core = ov.Core()
+        ov_compiled = core.compile_model(pt_model.model, "CPU")
+        self.ov_request = ov_compiled.create_infer_request()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[Tuple[ov.Tensor, ov.Tensor]],
+        attn_metadata: OpenVINOAttentionMetadata,
+    ) -> torch.Tensor:
+        flatten_kv_cache = _flattenize_inputs(kv_caches)
+
+        inputs = [
+            input_ids,
+            positions,
+            *flatten_kv_cache,
+            attn_metadata.past_lens,
+            attn_metadata.subsequence_begins,
+            attn_metadata.block_indices,
+            attn_metadata.block_indices_begins,
+            attn_metadata.max_context_len,
+        ]
+
+        self.ov_request.start_async(inputs, share_inputs=True)
+        self.ov_request.wait()
+
+        logits = torch.from_numpy(self.ov_request.get_tensor("logits").data)
+
+        # TODO: remove 'view' once OpenVINO PA will drop 'seq_len' dimension
+        return logits.view(-1, logits.shape[-1])
+
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        hidden_states = _prune_hidden_states(hidden_states, sampling_metadata)
+        logits = self.logits_processor(None, hidden_states, sampling_metadata)
+        return logits
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+
+def get_model(
+    model_config: ModelConfig,
+    device_config: DeviceConfig,
+    kv_cache_dtype: ov.Type,
+    **kwargs,
+) -> torch.nn.Module:
+    lora_config = kwargs.get("lora_config", None)
+    if lora_config:
+        raise ValueError(
+            "OpenVINO modeling does not support LoRA, "
+            "but LoRA is enabled. Support for this model may "
+            "be added in the future. If this is important to you, "
+            "please open an issue on github.")
+
+    return OpenVINOCasualLM(model_config, device_config, kv_cache_dtype)
diff --git a/vllm/utils.py b/vllm/utils.py
index 92abdb3fb..6e8d4624c 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -176,6 +176,15 @@ def is_cpu() -> bool:
         return False
 
 
+@lru_cache(maxsize=None)
+def is_openvino() -> bool:
+    from importlib.metadata import PackageNotFoundError, version
+    try:
+        return "openvino" in version("vllm")
+    except PackageNotFoundError:
+        return False
+
+
 @lru_cache(maxsize=None)
 def is_neuron() -> bool:
     try:
@@ -546,7 +555,7 @@ def is_pin_memory_available() -> bool:
     elif is_neuron():
         print_warning_once("Pin memory is not supported on Neuron.")
         return False
-    elif is_cpu():
+    elif is_cpu() or is_openvino():
         return False
     return True
 
diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py
new file mode 100644
index 000000000..336eaf814
--- /dev/null
+++ b/vllm/worker/openvino_model_runner.py
@@ -0,0 +1,330 @@
+from typing import List, NamedTuple, Optional, Tuple
+
+import openvino as ov
+import torch
+from torch import nn
+
+from vllm.attention import get_attn_backend
+from vllm.attention.backends.openvino import OpenVINOAttentionMetadata
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+                         ModelConfig, ParallelConfig, SchedulerConfig,
+                         VisionLanguageConfig)
+from vllm.logger import init_logger
+from vllm.model_executor import SamplingMetadata
+from vllm.model_executor.model_loader.openvino import get_model
+from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+
+logger = init_logger(__name__)
+
+
+class ModelInput(NamedTuple):
+    input_tokens: torch.Tensor
+    input_positions: torch.Tensor
+    attn_metadata: Optional[OpenVINOAttentionMetadata]
+    seq_lens: List[int]
+    query_lens: List[int]
+    multi_modal_input: Optional[torch.Tensor]
+
+    @classmethod
+    def empty(cls, device):
+        return ModelInput(input_tokens=torch.empty(0, device=device),
+                          input_positions=torch.empty(0, device=device),
+                          attn_metadata=None,
+                          seq_lens=[],
+                          query_lens=[],
+                          multi_modal_input=None)
+
+
+class OpenVINOModelRunner:
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        cache_config: CacheConfig,
+        load_config: LoadConfig,
+        lora_config: Optional[LoRAConfig],
+        vision_language_config: Optional[VisionLanguageConfig],
+        kv_cache_dtype: Optional[str] = "auto",
+        is_driver_worker: bool = False,
+        *args,
+        **kwargs,
+    ):
+        self.model_config = model_config
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+        self.device_config = device_config
+        self.cache_config = cache_config
+        self.lora_config = lora_config
+        self.vision_language_config = vision_language_config
+        self.load_config = load_config
+        self.is_driver_worker = is_driver_worker
+
+        self.device = self.device_config.device
+
+        self.kv_cache_dtype = kv_cache_dtype
+        self.sliding_window = model_config.get_sliding_window()
+        self.block_size = cache_config.block_size
+
+        self.attn_backend = get_attn_backend(
+            self.model_config.get_num_attention_heads(self.parallel_config),
+            self.model_config.get_head_size(),
+            self.model_config.get_num_kv_heads(self.parallel_config),
+            self.model_config.get_sliding_window(),
+            self.model_config.dtype,
+            self.kv_cache_dtype,
+            self.block_size,
+        )
+
+        # Lazy initialization.
+        self.model: nn.Module  # Set after init_Model
+
+    def load_model(self) -> None:
+        self.model = get_model(
+            model_config=self.model_config,
+            device_config=self.device_config,
+            kv_cache_dtype=self.kv_cache_dtype,
+        )
+
+    def _prepare_model_input(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+    ) -> ModelInput:
+        """Prepare the model input based on a given sequence group.
+
+        The API assumes seq_group_metadata_list is sorted by prefill -> decode.
+
+        The result tensors and data structure also batches input in prefill
+        -> decode order. For example,
+
+        - input_tokens[:num_prefill_tokens] contains prefill tokens.
+        - input_tokens[num_prefill_tokens:] contains decode tokens.
+        """
+        input_tokens: List[int] = []
+        input_positions: List[int] = []
+
+        seq_lens: List[int] = []
+        past_lens: List[int] = []
+        query_lens: List[int] = []
+        subsequence_begins: List[int] = []
+        block_indices: List[int] = []
+        block_indices_begins: List[int] = []
+
+        # initialize beginning of prefix sums
+        subsequence_begins.append(0)
+        block_indices_begins.append(0)
+
+        if len(seq_group_metadata_list) == 0:
+            return ModelInput.empty(self.device)
+
+        for seq_group_metadata in seq_group_metadata_list:
+            seq_ids = list(seq_group_metadata.seq_data.keys())
+            is_prompt = seq_group_metadata.is_prompt
+
+            for seq_id in seq_ids:
+                computed_block_nums = seq_group_metadata.computed_block_nums
+                if (self.scheduler_config is not None
+                        and self.scheduler_config.chunked_prefill_enabled
+                        and not (computed_block_nums is None
+                                 or computed_block_nums == [])):
+                    raise RuntimeError(
+                        "chunked prefill cannot be used with prefix caching "
+                        "now.")
+
+                seq_data = seq_group_metadata.seq_data[seq_id]
+                if is_prompt:
+                    computed_len = seq_data.get_num_computed_tokens()
+                else:
+                    # get_num_computed_tokens is incorrect for spec decoding.
+                    # So, we should have a special logic here.
+                    # TODO(sang): Fix it.
+                    computed_len = seq_data.get_len() - 1
+
+                seq_len = min(
+                    seq_data.get_len(),
+                    computed_len + seq_group_metadata.token_chunk_size,
+                )
+                if is_prompt:
+                    tokens = seq_data.get_token_ids()[computed_len:seq_len]
+                else:
+                    # Optimization. get_token_ids requires the entire copy of
+                    # tokens.
+                    tokens = [seq_data.get_last_token_id()]
+
+                # Prefix cache was hit.
+                # Prefix is not supported with sliding_window
+                prefix_cache_hit = (computed_block_nums is not None
+                                    and len(computed_block_nums) > 0
+                                    and self.sliding_window is None
+                                    and is_prompt)
+
+                block_table = seq_group_metadata.block_tables[seq_id]
+                # TODO(sang): Combine chunked prefill and prefix caching by
+                # only allowing multiple of block_size chunk size.
+                # NOTE: This only works for oooooooxxx style attention.
+                if prefix_cache_hit:
+                    assert computed_block_nums is not None
+                    computed_len = len(computed_block_nums) * self.block_size
+                    tokens = tokens[computed_len:]
+                elif (self.scheduler_config.chunked_prefill_enabled
+                      or not is_prompt):
+                    if seq_group_metadata.block_tables is not None:
+                        # chunked prefill or decode
+                        block_table = seq_group_metadata.block_tables[seq_id]
+                        if self.sliding_window is not None:
+                            # chunked prefill doesn't support sliding window.
+                            assert not self.scheduler_config.chunked_prefill_enabled  # noqa: E501
+                            sliding_window_blocks = (self.sliding_window //
+                                                     self.block_size)
+                            block_table = block_table[-sliding_window_blocks:]
+                    else:
+                        # Only happens when memory profiling runs.
+                        block_table = []
+                else:
+                    # prompt phase w/o prefix_caching, chunked_prefill
+                    pass
+
+                block_indices.extend(block_table)
+                block_indices_begins.append(block_indices_begins[-1] +
+                                            len(block_table))
+
+                # TODO(sang): This is a hack to make sliding window work with
+                # paged attn. We can remove it if we make paged attn kernel
+                # to properly handle slinding window attn.
+                if self.sliding_window is not None and not is_prompt:
+                    seq_len = min(seq_len, self.sliding_window)
+                    computed_len = seq_len - 1
+
+                seq_lens.append(seq_len)
+
+                query_len = seq_len - computed_len
+                query_lens.append(query_len)
+
+                input_tokens.extend(tokens)
+                input_positions.extend(list(range(computed_len, seq_len)))
+
+                past_lens.append(computed_len)
+                subsequence_begins.append(subsequence_begins[-1] + query_len)
+
+                if is_prompt:
+                    assert len(seq_ids) == 1
+                else:
+                    assert (
+                        query_len == 1
+                    ), "seq_len: {}, computed_len: {}, query_len: {}".format(
+                        seq_len, computed_len, query_len)
+
+        max_query_len = max(query_lens)
+        assert max_query_len > 0, "query_lens: {}".format(query_lens)
+
+        input_tokens = torch.tensor(input_tokens,
+                                    dtype=torch.long,
+                                    device=self.device)  # type: ignore
+        input_positions = torch.tensor(input_positions,
+                                       dtype=torch.long,
+                                       device=self.device)  # type: ignore
+
+        past_lens_tensor = torch.tensor(past_lens,
+                                        dtype=torch.int32,
+                                        device=self.device)  # type: ignore
+        subsequence_begins_tensor = torch.tensor(
+            subsequence_begins, dtype=torch.int32,
+            device=self.device)  # type: ignore
+        block_indices_tensor = torch.tensor(block_indices,
+                                            dtype=torch.int32,
+                                            device=self.device)  # type: ignore
+        block_indices_begins_tensor = torch.tensor(
+            block_indices_begins, dtype=torch.int32,
+            device=self.device)  # type: ignore
+
+        max_context_len = max(seq_lens)
+        max_context_len_tensor = torch.tensor(
+            max_context_len, dtype=torch.int32,
+            device=self.device)  # type: ignore
+
+        attn_metadata = self.attn_backend.make_openvino_metadata(
+            past_lens=past_lens_tensor,
+            subsequence_begins=subsequence_begins_tensor,
+            block_indices=block_indices_tensor,
+            block_indices_begins=block_indices_begins_tensor,
+            max_context_len=max_context_len_tensor,
+        )
+        return ModelInput(
+            input_tokens,
+            input_positions,
+            attn_metadata,
+            seq_lens,
+            query_lens,
+            None,
+        )
+
+    def prepare_input_tensors(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+    ) -> Tuple[torch.Tensor, torch.Tensor, OpenVINOAttentionMetadata,
+               SamplingMetadata, Optional[torch.Tensor], ]:
+        multi_modal_input = None
+
+        # Prepare input tensors.
+        (
+            input_tokens,
+            input_positions,
+            attn_metadata,
+            seq_lens,
+            query_lens,
+            multi_modal_input,
+        ) = self._prepare_model_input(seq_group_metadata_list)
+
+        sampling_metadata = SamplingMetadata.prepare(
+            seq_group_metadata_list,
+            seq_lens,
+            query_lens,
+            self.device,
+            pin_memory=False,
+        )
+
+        return (
+            input_tokens,
+            input_positions,
+            attn_metadata,
+            sampling_metadata,
+            multi_modal_input,
+        )
+
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        kv_caches: List[Tuple["ov.Tensor", "ov.Tensor"]],
+    ) -> Optional[SamplerOutput]:
+        (
+            input_tokens,
+            input_positions,
+            attn_metadata,
+            sampling_metadata,
+            multi_modal_input,
+        ) = self.prepare_input_tensors(seq_group_metadata_list)
+
+        model_executable = self.model
+        execute_model_kwargs = {
+            "input_ids": input_tokens,
+            "positions": input_positions,
+            "kv_caches": kv_caches,
+            "attn_metadata": attn_metadata,
+        }
+        if self.vision_language_config:
+            execute_model_kwargs.update({"image_input": multi_modal_input})
+
+        hidden_states = model_executable(**execute_model_kwargs)
+
+        # Compute the logits.
+        logits = self.model.compute_logits(hidden_states, sampling_metadata)
+
+        # Sample the next token.
+        output = self.model.sample(
+            logits=logits,
+            sampling_metadata=sampling_metadata,
+        )
+        return output
diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py
new file mode 100644
index 000000000..7a462ce5d
--- /dev/null
+++ b/vllm/worker/openvino_worker.py
@@ -0,0 +1,353 @@
+"""An OpenVINO worker class."""
+from typing import Any, Dict, List, Optional, Tuple
+
+import openvino as ov
+import torch
+import torch.distributed
+
+from vllm.attention import get_attn_backend
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+                         ModelConfig, ParallelConfig, SchedulerConfig,
+                         VisionLanguageConfig)
+from vllm.distributed import (broadcast_tensor_dict,
+                              ensure_model_parallel_initialized,
+                              init_distributed_environment)
+from vllm.logger import init_logger
+from vllm.model_executor import set_random_seed
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.worker.openvino_model_runner import OpenVINOModelRunner
+from vllm.worker.worker_base import LoraNotSupportedWorkerBase
+
+logger = init_logger(__name__)
+
+
+class OpenVINOCacheEngine:
+    """Manages the KV cache for OpenVINO backend.
+
+    This class is responsible for initializing and managing CPU KV
+    caches. It also provides methods for performing KV cache operations, such
+    as copying.
+    """
+
+    def __init__(
+        self,
+        cache_config: CacheConfig,
+        model_config: ModelConfig,
+        parallel_config: ParallelConfig,
+        device_config: DeviceConfig,
+    ) -> None:
+        assert device_config.device_type == "openvino"
+        self.cache_config = cache_config
+        self.model_config = model_config
+        self.parallel_config = parallel_config
+
+        self.head_size = model_config.get_head_size()
+        if device_config.device.type == "cpu" and \
+            cache_config.cache_dtype == ov.Type.u8:
+            # Scale, zero point and quantized data will be stored together.
+            # The layout for per token per head:
+            # |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)| # noqa: E501
+            # so, we have to extend head_size by 8, which is sizeof(float)
+            # for scale and sizeof(float) for zeropoint
+            self.head_size += 8
+        self.num_layers = model_config.get_num_layers(parallel_config)
+        self.num_kv_heads = model_config.get_num_kv_heads(parallel_config)
+
+        self.block_size = cache_config.block_size
+        # Note: In CacheConfig, num_gpu_blocks actual is num_cpu_blocks
+        # for OpenVINO backend, because we want to reuse KV cache management
+        # in the scheduler.
+        self.num_cpu_blocks = cache_config.num_gpu_blocks
+
+        # Get attention backend.
+        self.attn_backend = get_attn_backend(
+            self.model_config.get_num_attention_heads(self.parallel_config),
+            self.head_size,
+            self.model_config.get_num_kv_heads(self.parallel_config),
+            self.model_config.get_sliding_window(),
+            self.model_config.dtype,
+            self.cache_config.cache_dtype,
+            self.block_size,
+        )
+
+        # Initialize the cache.
+        self.kv_cache: List[Tuple[ov.Tensor,
+                                  ov.Tensor]] = self._allocate_kv_cache(
+                                      self.num_cpu_blocks)
+
+    def _allocate_kv_cache(
+        self,
+        num_blocks: int,
+    ) -> List[Tuple[ov.Tensor, ov.Tensor]]:
+        """Allocates KV cache."""
+        k_block_shape = v_block_shape = self.attn_backend.get_kv_cache_shape(
+            num_blocks, self.block_size, self.num_kv_heads, self.head_size)[1:]
+        kv_cache: List[Tuple[ov.Tensor, ov.Tensor]] = []
+        for _ in range(self.num_layers):
+            key_blocks = ov.Tensor(self.cache_config.cache_dtype,
+                                   k_block_shape)
+            value_blocks = ov.Tensor(self.cache_config.cache_dtype,
+                                     v_block_shape)
+            kv_cache.append((key_blocks, value_blocks))
+        return kv_cache
+
+    def swap_in(self, src_to_dst: Dict[int, int]) -> None:
+        raise NotImplementedError(
+            "Swap is not supported in OpenVINOCacheEngine.")
+
+    def swap_out(self, src_to_dst: Dict[int, int]) -> None:
+        raise NotImplementedError(
+            "Swap is not supported in OpenVINOCacheEngine.")
+
+    def copy(self, src_to_dsts: Dict[int, List[int]]) -> None:
+        self.attn_backend.copy_blocks(self.kv_cache, src_to_dsts)
+
+    @staticmethod
+    def get_cache_block_size(
+        block_size: int,
+        cache_dtype: ov.Type,
+        model_config: ModelConfig,
+        parallel_config: ParallelConfig,
+    ) -> int:
+        head_size = model_config.get_head_size()
+        num_kv_heads = model_config.get_num_kv_heads(parallel_config)
+        num_layers = model_config.get_num_layers(parallel_config)
+
+        if cache_dtype == ov.Type.u8:
+            # Scale, zero point and quantized data will be stored together.
+            # The layout for per token per head:
+            # |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)| # noqa: E501
+            # so, we have to extend head_size by 8, which is sizeof(float)
+            # for scale and sizeof(float) for zeropoint
+            head_size += 8
+
+        key_cache_block = block_size * num_kv_heads * head_size
+        value_cache_block = key_cache_block
+        total = num_layers * (key_cache_block + value_cache_block)
+        dtype_size = cache_dtype.size
+        return dtype_size * total
+
+
+class OpenVINOWorker(LoraNotSupportedWorkerBase):
+    """A worker class that executes the model on OpenVINO backend.
+
+    Each worker is associated with a single OpenVINO device. The worker is
+    responsible for maintaining the KV cache and executing the model on the
+    OpenVINO backend.
+    """
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        cache_config: CacheConfig,
+        load_config: LoadConfig,
+        local_rank: int,
+        rank: int,
+        distributed_init_method: str,
+        lora_config: Optional[LoRAConfig] = None,
+        vision_language_config: Optional[VisionLanguageConfig] = None,
+        kv_cache_dtype: Optional[ov.Type] = ov.Type.undefined,
+        is_driver_worker: bool = False,
+    ) -> None:
+        self.model_config = model_config
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+        self.device_config = device_config
+        self.cache_config = cache_config
+        self.load_config = load_config
+        self.local_rank = local_rank
+        self.rank = rank
+        self.distributed_init_method = distributed_init_method
+        self.lora_config = lora_config
+        self.vision_language_config = vision_language_config
+        self.is_driver_worker = is_driver_worker
+        if self.is_driver_worker:
+            assert self.rank == 0, "The driver worker must have rank 0."
+
+        if self.model_config.trust_remote_code:
+            # note: lazy import to avoid importing torch before initializing
+            from vllm.utils import init_cached_hf_modules
+
+            init_cached_hf_modules()
+        self.model_runner = OpenVINOModelRunner(
+            model_config,
+            parallel_config,
+            scheduler_config,
+            device_config,
+            cache_config,
+            load_config=self.load_config,
+            lora_config=self.lora_config,
+            vision_language_config=self.vision_language_config,
+            kv_cache_dtype=kv_cache_dtype,
+            is_driver_worker=is_driver_worker,
+        )
+        # Uninitialized cache engine. Will be initialized by
+        # initialize_cache.
+        self.cache_engine: OpenVINOCacheEngine
+        self.kv_cache: List[Tuple[ov.Tensor, ov.Tensor]]
+
+    def init_device(self) -> None:
+        self.init_distributed_environment()
+        # Set random seed.
+        set_random_seed(self.model_config.seed)
+
+    def load_model(self):
+        self.model_runner.load_model()
+
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Determine the number of blocks available for the KV cache.
+
+        This determines how many KV blocks can fit into the configured
+        KV cache space.
+
+        Note that since vLLM assumes a block resides on GPU if it can be
+        modified, we return num_gpu_blocks=num_cpu_blocks and num_cpu_blocks=0.
+        This allows us to reuse the scheduler of vLLM without generalizing it
+        to different devices.
+        """
+        # For OpenVINO backend, the block number will be calculated based on the
+        # openvino_kvcache_space_bytes.
+        cache_block_size = self.get_cache_block_size_bytes()
+        num_cpu_blocks = int(self.cache_config.openvino_kvcache_space_bytes //
+                             cache_block_size)
+        num_cpu_blocks = max(num_cpu_blocks, 0)
+
+        # Note: To reuse the cache management procedure,
+        # use cpu cache as 'gpu cache'.
+        num_gpu_blocks = num_cpu_blocks
+        num_cpu_blocks = 0
+        return num_gpu_blocks, num_cpu_blocks
+
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        """Initialize the KV cache. Currently, swappable CPU memory is not
+        supported.
+
+        Since this worker does not support GPUs, we use the num_gpu_blocks to
+        determine how many non-swappable CPU blocks to allocate.
+        """
+        assert (num_cpu_blocks == 0
+                ), f"{type(self)} does not support swappable cache"
+
+        # Note: To reuse the cache management procedure,
+        # use cpu cache as 'gpu cache'.
+        num_cpu_blocks = num_gpu_blocks
+
+        self._validate_num_cpu_blocks(num_cpu_blocks)
+        self.cache_config.num_gpu_blocks = num_cpu_blocks
+        self.cache_config.num_cpu_blocks = 0
+
+        # Initialize the cache.
+        self._init_cache_engine()
+
+    def _validate_num_cpu_blocks(self, num_cpu_blocks: int) -> None:
+        """Raise errors if the num_cpu_blocks is invalid."""
+        if num_cpu_blocks <= 0:
+            raise ValueError(
+                "No available memory for the cache blocks. "
+                "Try increasing `VLLM_OPENVINO_KVCACHE_SPACE` when "
+                "initializing the engine.")
+
+        max_seq_len = self.cache_config.block_size * num_cpu_blocks
+        if self.model_config.max_model_len > max_seq_len:
+            raise ValueError(
+                f"The model's max seq len ({self.model_config.max_model_len}) "
+                "is larger than the maximum number of tokens that can be "
+                f"stored in KV cache ({max_seq_len}). Try increasing "
+                "`VLLM_OPENVINO_KVCACHE_SPACE` or decreasing `max_model_len` "
+                "when initializing the engine.")
+
+    def _init_cache_engine(self) -> None:
+        self.cache_engine = OpenVINOCacheEngine(
+            self.cache_config,
+            self.model_config,
+            self.parallel_config,
+            self.device_config,
+        )
+        self.kv_cache = self.cache_engine.kv_cache
+        self.model_runner.block_size = self.cache_engine.block_size
+
+        assert self.kv_cache is not None
+
+        # Populate the cache to warmup the memory
+        for key_cache, value_cache in self.kv_cache:
+            key_cache.data[:] = 0
+            value_cache.data[:] = 0
+
+    def cache_copy(
+        self,
+        blocks_to_copy: List[Tuple[int, int]],
+    ) -> None:
+        self.cache_engine.copy(blocks_to_copy)  # type: ignore
+
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None,
+    ) -> List[SamplerOutput]:
+        if execute_model_req is None:
+            seq_group_metadata_list = None
+        else:
+            seq_group_metadata_list = execute_model_req.seq_group_metadata_list
+
+        if self.is_driver_worker:
+            assert seq_group_metadata_list is not None
+            num_seq_groups: int = len(seq_group_metadata_list)
+            assert execute_model_req is not None
+            blocks_to_copy = execute_model_req.blocks_to_copy
+            assert len(execute_model_req.blocks_to_swap_in) == 0
+            assert len(execute_model_req.blocks_to_swap_out) == 0
+            data: Dict[str, Any] = {
+                "num_seq_groups": num_seq_groups,
+                "blocks_to_copy": execute_model_req.blocks_to_copy,
+            }
+            broadcast_tensor_dict(data, src=0)
+        else:
+            data = broadcast_tensor_dict(src=0)
+            num_seq_groups = data["num_seq_groups"]
+            blocks_to_copy = data["blocks_to_copy"]
+
+        self.cache_copy(blocks_to_copy)
+
+        # If there is no input, we don't need to execute the model.
+        if num_seq_groups == 0:
+            return []
+
+        output = self.model_runner.execute_model(seq_group_metadata_list,
+                                                 self.kv_cache)
+
+        # OpenVINO worker only supports single-step execution.
+        return [output]
+
+    def init_distributed_environment(self) -> None:
+        """Initialize the distributed environment."""
+
+        parallel_config = self.parallel_config
+        rank = self.rank
+        distributed_init_method = self.distributed_init_method
+        init_distributed_environment(
+            world_size=parallel_config.world_size,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            backend="gloo",
+        )
+
+        # A small all_reduce for warmup.
+        torch.distributed.all_reduce(torch.zeros(1).cpu())
+
+        ensure_model_parallel_initialized(
+            parallel_config.tensor_parallel_size,
+            parallel_config.pipeline_parallel_size,
+        )
+
+    def get_cache_block_size_bytes(self) -> int:
+        """Return the size in bytes of a single KV cache block."""
+        return OpenVINOCacheEngine.get_cache_block_size(
+            self.cache_config.block_size,
+            self.cache_config.cache_dtype,
+            self.model_config,
+            self.parallel_config,
+        )
-- 
GitLab


From ec1ad0046c6cf356443ff05a63f88a8d650844df Mon Sep 17 00:00:00 2001
From: Thomas Parnell <tpa@zurich.ibm.com>
Date: Fri, 28 Jun 2024 16:42:17 +0200
Subject: [PATCH 185/376] [Bugfix] Better error message for MLPSpeculator when
 `num_speculative_tokens` is set too high (#5894)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
---
 vllm/config.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 31d30cfa7..05bc57062 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -956,9 +956,9 @@ class SpeculativeConfig:
                     # Verify provided value doesn't exceed the maximum
                     # supported by the draft model.
                     raise ValueError(
-                        "Expected both speculative_model and "
-                        "num_speculative_tokens to be provided, but found "
-                        f"{speculative_model=} and {num_speculative_tokens=}.")
+                        "This speculative model supports a maximum of "
+                        f"num_speculative_tokens={n_predict}, but "
+                        f"{num_speculative_tokens=} was provided.")
 
             draft_model_config.max_model_len = (
                 SpeculativeConfig._maybe_override_draft_max_model_len(
-- 
GitLab


From 3b752a6555a67bb4863ea821fc30fdda38b27355 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 28 Jun 2024 22:59:18 +0800
Subject: [PATCH 186/376] [CI/Build] [2/3] Reorganize entrypoints tests (#5904)

---
 .../test_llm_generate_multiple_loras.py       |   2 +-
 tests/entrypoints/test_openai_chat.py         |   2 +-
 tests/entrypoints/test_openai_completion.py   | 650 ++++++++++++++++++
 tests/entrypoints/test_openai_server.py       | 597 +---------------
 4 files changed, 653 insertions(+), 598 deletions(-)
 create mode 100644 tests/entrypoints/test_openai_completion.py

diff --git a/tests/entrypoints/test_llm_generate_multiple_loras.py b/tests/entrypoints/test_llm_generate_multiple_loras.py
index b429b904c..176daa472 100644
--- a/tests/entrypoints/test_llm_generate_multiple_loras.py
+++ b/tests/entrypoints/test_llm_generate_multiple_loras.py
@@ -44,7 +44,7 @@ def llm():
     cleanup()
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="module")
 def zephyr_lora_files():
     return snapshot_download(repo_id=LORA_NAME)
 
diff --git a/tests/entrypoints/test_openai_chat.py b/tests/entrypoints/test_openai_chat.py
index 30455e720..1c46a5110 100644
--- a/tests/entrypoints/test_openai_chat.py
+++ b/tests/entrypoints/test_openai_chat.py
@@ -72,7 +72,7 @@ TEST_CHOICE = [
 pytestmark = pytest.mark.openai
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="module")
 def zephyr_lora_files():
     return snapshot_download(repo_id=LORA_NAME)
 
diff --git a/tests/entrypoints/test_openai_completion.py b/tests/entrypoints/test_openai_completion.py
new file mode 100644
index 000000000..da5de3666
--- /dev/null
+++ b/tests/entrypoints/test_openai_completion.py
@@ -0,0 +1,650 @@
+# imports for guided decoding tests
+import json
+import re
+from typing import List
+
+import jsonschema
+import openai  # use the official client for correctness check
+import pytest
+# using Ray for overall ease of process management, parallel requests,
+# and debugging.
+import ray
+import requests
+# downloading lora to test lora requests
+from huggingface_hub import snapshot_download
+from openai import BadRequestError
+
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+from ..utils import RemoteOpenAIServer
+
+# any model with a chat template should work here
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+# technically this needs Mistral-7B-v0.1 as base, but we're not testing
+# generation quality here
+LORA_NAME = "typeof/zephyr-7b-beta-lora"
+
+TEST_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "name": {
+            "type": "string"
+        },
+        "age": {
+            "type": "integer"
+        },
+        "skills": {
+            "type": "array",
+            "items": {
+                "type": "string",
+                "maxLength": 10
+            },
+            "minItems": 3
+        },
+        "work history": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "company": {
+                        "type": "string"
+                    },
+                    "duration": {
+                        "type": "string"
+                    },
+                    "position": {
+                        "type": "string"
+                    }
+                },
+                "required": ["company", "position"]
+            }
+        }
+    },
+    "required": ["name", "age", "skills", "work history"]
+}
+
+TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
+              r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
+
+TEST_CHOICE = [
+    "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby",
+    "Swift", "Kotlin"
+]
+
+pytestmark = pytest.mark.openai
+
+
+@pytest.fixture(scope="module")
+def zephyr_lora_files():
+    return snapshot_download(repo_id=LORA_NAME)
+
+
+@pytest.fixture(scope="module")
+def ray_ctx():
+    ray.init()
+    yield
+    ray.shutdown()
+
+
+@pytest.fixture(scope="module")
+def server(zephyr_lora_files, ray_ctx):
+    return RemoteOpenAIServer([
+        "--model",
+        MODEL_NAME,
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        # lora config below
+        "--enable-lora",
+        "--lora-modules",
+        f"zephyr-lora={zephyr_lora_files}",
+        f"zephyr-lora2={zephyr_lora_files}",
+        "--max-lora-rank",
+        "64",
+        "--max-cpu-loras",
+        "2",
+        "--max-num-seqs",
+        "128",
+    ])
+
+
+@pytest.fixture(scope="module")
+def client(server):
+    return server.get_async_client()
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    # first test base model, then test loras
+    "model_name",
+    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+)
+async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
+    completion = await client.completions.create(model=model_name,
+                                                 prompt="Hello, my name is",
+                                                 max_tokens=5,
+                                                 temperature=0.0)
+
+    assert completion.id is not None
+    assert completion.choices is not None and len(completion.choices) == 1
+
+    choice = completion.choices[0]
+    assert len(choice.text) >= 5
+    assert choice.finish_reason == "length"
+    assert completion.usage == openai.types.CompletionUsage(
+        completion_tokens=5, prompt_tokens=6, total_tokens=11)
+
+    # test using token IDs
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+    )
+    assert len(completion.choices[0].text) >= 5
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    # first test base model, then test loras
+    "model_name",
+    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+)
+async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
+    # test using token IDs
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+        logprobs=None,
+    )
+    choice = completion.choices[0]
+    assert choice.logprobs is None
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    # just test 1 lora hereafter
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
+    # test using token IDs
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+        logprobs=0,
+    )
+    choice = completion.choices[0]
+    assert choice.logprobs is not None
+    assert choice.logprobs.token_logprobs is not None
+    assert choice.logprobs.top_logprobs is not None
+    assert len(choice.logprobs.top_logprobs[0]) == 1
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
+    # test using token IDs
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+        logprobs=5,
+    )
+    choice = completion.choices[0]
+    assert choice.logprobs is not None
+    assert choice.logprobs.token_logprobs is not None
+    assert choice.logprobs.top_logprobs is not None
+    assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
+                                            model_name: str):
+
+    with pytest.raises(
+        (openai.BadRequestError, openai.APIError)):  # test using token IDs
+        await client.completions.create(
+            model=MODEL_NAME,
+            prompt=[0, 0, 0, 0, 0],
+            max_tokens=5,
+            temperature=0.0,
+            # vLLM has higher default max_logprobs (20 instead of 5) to support
+            # both Completion API and Chat Completion API
+            logprobs=21,
+        )
+        ...
+    with pytest.raises(
+        (openai.BadRequestError, openai.APIError)):  # test using token IDs
+        stream = await client.completions.create(
+            model=MODEL_NAME,
+            prompt=[0, 0, 0, 0, 0],
+            max_tokens=5,
+            temperature=0.0,
+            # vLLM has higher default max_logprobs (20 instead of 5) to support
+            # both Completion API and Chat Completion API
+            logprobs=30,
+            stream=True,
+        )
+        async for chunk in stream:
+            ...
+
+    # the server should still work afterwards
+    completion = await client.completions.create(
+        model=model_name,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+    )
+    assert len(completion.choices[0].text) >= 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_completion_streaming(client: openai.AsyncOpenAI,
+                                    model_name: str):
+    prompt = "What is an LLM?"
+
+    single_completion = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+    )
+    single_output = single_completion.choices[0].text
+    stream = await client.completions.create(model=model_name,
+                                             prompt=prompt,
+                                             max_tokens=5,
+                                             temperature=0.0,
+                                             stream=True)
+    chunks: List[str] = []
+    finish_reason_count = 0
+    async for chunk in stream:
+        chunks.append(chunk.choices[0].text)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
+    assert chunk.choices[0].finish_reason == "length"
+    assert chunk.choices[0].text
+    assert "".join(chunks) == single_output
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
+)
+async def test_completion_stream_options(client: openai.AsyncOpenAI,
+                                         model_name: str):
+    prompt = "What is the capital of France?"
+
+    # Test stream=True, stream_options={"include_usage": False}
+    stream = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={"include_usage": False})
+    async for chunk in stream:
+        assert chunk.usage is None
+
+    # Test stream=True, stream_options={"include_usage": True}
+    stream = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={"include_usage": True})
+    async for chunk in stream:
+        if chunk.choices[0].finish_reason is None:
+            assert chunk.usage is None
+        else:
+            assert chunk.usage is None
+            final_chunk = await stream.__anext__()
+            assert final_chunk.usage is not None
+            assert final_chunk.usage.prompt_tokens > 0
+            assert final_chunk.usage.completion_tokens > 0
+            assert final_chunk.usage.total_tokens == (
+                final_chunk.usage.prompt_tokens +
+                final_chunk.usage.completion_tokens)
+            assert final_chunk.choices == []
+
+    # Test stream=False, stream_options={"include_usage": None}
+    with pytest.raises(BadRequestError):
+        await client.completions.create(model=model_name,
+                                        prompt=prompt,
+                                        max_tokens=5,
+                                        temperature=0.0,
+                                        stream=False,
+                                        stream_options={"include_usage": None})
+
+    # Test stream=False, stream_options={"include_usage": True}
+    with pytest.raises(BadRequestError):
+        await client.completions.create(model=model_name,
+                                        prompt=prompt,
+                                        max_tokens=5,
+                                        temperature=0.0,
+                                        stream=False,
+                                        stream_options={"include_usage": True})
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    # just test 1 lora hereafter
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
+    # test both text and token IDs
+    for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2):
+        # test simple list
+        batch = await client.completions.create(
+            model=model_name,
+            prompt=prompts,
+            max_tokens=5,
+            temperature=0.0,
+        )
+        assert len(batch.choices) == 2
+        assert batch.choices[0].text == batch.choices[1].text
+
+        # test n = 2
+        batch = await client.completions.create(
+            model=model_name,
+            prompt=prompts,
+            n=2,
+            max_tokens=5,
+            temperature=0.0,
+            extra_body=dict(
+                # NOTE: this has to be true for n > 1 in vLLM, but not necessary
+                # for official client.
+                use_beam_search=True),
+        )
+        assert len(batch.choices) == 4
+        assert batch.choices[0].text != batch.choices[
+            1].text, "beam search should be different"
+        assert batch.choices[0].text == batch.choices[
+            2].text, "two copies of the same prompt should be the same"
+        assert batch.choices[1].text == batch.choices[
+            3].text, "two copies of the same prompt should be the same"
+
+        # test streaming
+        batch = await client.completions.create(
+            model=model_name,
+            prompt=prompts,
+            max_tokens=5,
+            temperature=0.0,
+            stream=True,
+        )
+        texts = [""] * 2
+        async for chunk in batch:
+            assert len(chunk.choices) == 1
+            choice = chunk.choices[0]
+            texts[choice.index] += choice.text
+        assert texts[0] == texts[1]
+
+
+@pytest.mark.asyncio
+async def test_logits_bias(client: openai.AsyncOpenAI):
+    prompt = "Hello, my name is"
+    max_tokens = 5
+    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+
+    # Test exclusive selection
+    token_id = 1000
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=prompt,
+        max_tokens=max_tokens,
+        temperature=0.0,
+        logit_bias={str(token_id): 100},
+        seed=42,
+    )
+    assert len(completion.choices[0].text) >= 5
+    response_tokens = tokenizer(completion.choices[0].text,
+                                add_special_tokens=False)["input_ids"]
+    expected_tokens = tokenizer(tokenizer.decode([token_id] * 5),
+                                add_special_tokens=False)["input_ids"]
+    assert all([
+        response == expected
+        for response, expected in zip(response_tokens, expected_tokens)
+    ])
+
+    # Test ban
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=prompt,
+        max_tokens=max_tokens,
+        temperature=0.0,
+    )
+    response_tokens = tokenizer(completion.choices[0].text,
+                                add_special_tokens=False)["input_ids"]
+    first_response = completion.choices[0].text
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=prompt,
+        max_tokens=max_tokens,
+        temperature=0.0,
+        logit_bias={str(token): -100
+                    for token in response_tokens},
+    )
+    assert first_response != completion.choices[0].text
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_json_completion(client: openai.AsyncOpenAI,
+                                      guided_decoding_backend: str):
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=f"Give an example JSON for an employee profile "
+        f"that fits this schema: {TEST_SCHEMA}",
+        n=3,
+        temperature=1.0,
+        max_tokens=500,
+        extra_body=dict(guided_json=TEST_SCHEMA,
+                        guided_decoding_backend=guided_decoding_backend))
+
+    assert completion.id is not None
+    assert len(completion.choices) == 3
+    for i in range(3):
+        output_json = json.loads(completion.choices[i].text)
+        jsonschema.validate(instance=output_json, schema=TEST_SCHEMA)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_regex_completion(client: openai.AsyncOpenAI,
+                                       guided_decoding_backend: str):
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=f"Give an example IPv4 address with this regex: {TEST_REGEX}",
+        n=3,
+        temperature=1.0,
+        max_tokens=20,
+        extra_body=dict(guided_regex=TEST_REGEX,
+                        guided_decoding_backend=guided_decoding_backend))
+
+    assert completion.id is not None
+    assert len(completion.choices) == 3
+    for i in range(3):
+        assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_choice_completion(client: openai.AsyncOpenAI,
+                                        guided_decoding_backend: str):
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt="The best language for type-safe systems programming is ",
+        n=2,
+        temperature=1.0,
+        max_tokens=10,
+        extra_body=dict(guided_choice=TEST_CHOICE,
+                        guided_decoding_backend=guided_decoding_backend))
+
+    assert completion.id is not None
+    assert len(completion.choices) == 2
+    for i in range(2):
+        assert completion.choices[i].text in TEST_CHOICE
+
+
+@pytest.mark.asyncio
+async def test_guided_grammar(client: openai.AsyncOpenAI):
+    simple_sql_grammar = """
+start: select_statement
+
+select_statement: "SELECT" column "from" table "where" condition
+
+column: "col_1" | "col_2"
+table: "table_1" | "table_2"
+condition: column "=" number
+
+number: "1" | "2"
+"""
+
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=("Generate a sql state that select col_1 from "
+                "table_1 where it is equals to 1"),
+        temperature=1.0,
+        max_tokens=500,
+        extra_body=dict(guided_grammar=simple_sql_grammar))
+
+    content = completion.choices[0].text
+
+    # use Lark to parse the output, and make sure it's a valid parse tree
+    from lark import Lark
+    parser = Lark(simple_sql_grammar)
+    parser.parse(content)
+
+    # remove spaces for comparison b/c we removed them in the grammar
+    ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")
+
+    assert content.strip() == ground_truth
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    # first test base model, then test loras
+    "model_name",
+    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+)
+@pytest.mark.parametrize("logprobs_arg", [1, 0])
+async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
+                                       model_name: str, logprobs_arg: int):
+    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+    # test using text and token IDs
+    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
+        completion = await client.completions.create(model=model_name,
+                                                     prompt=prompt,
+                                                     max_tokens=5,
+                                                     temperature=0.0,
+                                                     echo=True,
+                                                     logprobs=logprobs_arg)
+
+        prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
+                                                             list) else prompt
+        assert re.search(r"^" + prompt_text, completion.choices[0].text)
+        logprobs = completion.choices[0].logprobs
+        assert logprobs is not None
+        assert len(logprobs.text_offset) > 5
+        assert (len(logprobs.token_logprobs) > 5
+                and logprobs.token_logprobs[0] is None)
+        assert (len(logprobs.top_logprobs) > 5
+                and logprobs.top_logprobs[0] is None)
+        for top_logprobs in logprobs.top_logprobs[1:]:
+            assert max(logprobs_arg,
+                       1) <= len(top_logprobs) <= logprobs_arg + 1
+        assert len(logprobs.tokens) > 5
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
+                                          guided_decoding_backend: str):
+    with pytest.raises(openai.BadRequestError):
+        _ = await client.completions.create(
+            model=MODEL_NAME,
+            prompt="Give an example JSON that fits this schema: 42",
+            extra_body=dict(guided_json=42,
+                            guided_decoding_backend=guided_decoding_backend))
+
+    with pytest.raises(openai.BadRequestError):
+        _ = await client.completions.create(
+            model=MODEL_NAME,
+            prompt="Give an example string that fits this regex",
+            extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA))
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_tokenize(client: openai.AsyncOpenAI, model_name: str):
+    base_url = str(client.base_url)[:-3]
+    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")
+
+    for add_special in [False, True]:
+        prompt = "This is a test prompt."
+        tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
+
+        response = requests.post(base_url + "/tokenize",
+                                 json={
+                                     "add_special_tokens": add_special,
+                                     "model": model_name,
+                                     "prompt": prompt
+                                 })
+        response.raise_for_status()
+        assert response.json() == {
+            "tokens": tokens,
+            "count": len(tokens),
+            "max_model_len": 8192
+        }
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_detokenize(client: openai.AsyncOpenAI, model_name: str):
+    base_url = str(client.base_url)[:-3]
+    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")
+
+    prompt = "This is a test prompt."
+    tokens = tokenizer.encode(prompt, add_special_tokens=False)
+
+    response = requests.post(base_url + "detokenize",
+                             json={
+                                 "model": model_name,
+                                 "tokens": tokens
+                             })
+    response.raise_for_status()
+    assert response.json() == {"prompt": prompt}
diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index 14f59ea66..ef0d30131 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -1,20 +1,10 @@
-# imports for guided decoding tests
-import json
-import re
-from typing import List
-
-import jsonschema
 import openai  # use the official client for correctness check
 import pytest
 # using Ray for overall ease of process management, parallel requests,
 # and debugging.
 import ray
-import requests
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
-from openai import BadRequestError
-
-from vllm.transformers_utils.tokenizer import get_tokenizer
 
 from ..utils import RemoteOpenAIServer
 
@@ -24,57 +14,10 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 # generation quality here
 LORA_NAME = "typeof/zephyr-7b-beta-lora"
 
-TEST_SCHEMA = {
-    "type": "object",
-    "properties": {
-        "name": {
-            "type": "string"
-        },
-        "age": {
-            "type": "integer"
-        },
-        "skills": {
-            "type": "array",
-            "items": {
-                "type": "string",
-                "maxLength": 10
-            },
-            "minItems": 3
-        },
-        "work history": {
-            "type": "array",
-            "items": {
-                "type": "object",
-                "properties": {
-                    "company": {
-                        "type": "string"
-                    },
-                    "duration": {
-                        "type": "string"
-                    },
-                    "position": {
-                        "type": "string"
-                    }
-                },
-                "required": ["company", "position"]
-            }
-        }
-    },
-    "required": ["name", "age", "skills", "work history"]
-}
-
-TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
-              r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
-
-TEST_CHOICE = [
-    "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby",
-    "Swift", "Kotlin"
-]
-
 pytestmark = pytest.mark.openai
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="module")
 def zephyr_lora_files():
     return snapshot_download(repo_id=LORA_NAME)
 
@@ -126,541 +69,3 @@ async def test_check_models(client: openai.AsyncOpenAI):
     assert all(model.root == MODEL_NAME for model in models)
     assert lora_models[0].id == "zephyr-lora"
     assert lora_models[1].id == "zephyr-lora2"
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # first test base model, then test loras
-    "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
-)
-async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
-    completion = await client.completions.create(model=model_name,
-                                                 prompt="Hello, my name is",
-                                                 max_tokens=5,
-                                                 temperature=0.0)
-
-    assert completion.id is not None
-    assert completion.choices is not None and len(completion.choices) == 1
-
-    choice = completion.choices[0]
-    assert len(choice.text) >= 5
-    assert choice.finish_reason == "length"
-    assert completion.usage == openai.types.CompletionUsage(
-        completion_tokens=5, prompt_tokens=6, total_tokens=11)
-
-    # test using token IDs
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-    )
-    assert len(completion.choices[0].text) >= 5
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # first test base model, then test loras
-    "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
-)
-async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
-    # test using token IDs
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-        logprobs=None,
-    )
-    choice = completion.choices[0]
-    assert choice.logprobs is None
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # just test 1 lora hereafter
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
-    # test using token IDs
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-        logprobs=0,
-    )
-    choice = completion.choices[0]
-    assert choice.logprobs is not None
-    assert choice.logprobs.token_logprobs is not None
-    assert choice.logprobs.top_logprobs is not None
-    assert len(choice.logprobs.top_logprobs[0]) == 1
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
-    # test using token IDs
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-        logprobs=5,
-    )
-    choice = completion.choices[0]
-    assert choice.logprobs is not None
-    assert choice.logprobs.token_logprobs is not None
-    assert choice.logprobs.top_logprobs is not None
-    assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
-                                            model_name: str):
-
-    with pytest.raises(
-        (openai.BadRequestError, openai.APIError)):  # test using token IDs
-        await client.completions.create(
-            model=MODEL_NAME,
-            prompt=[0, 0, 0, 0, 0],
-            max_tokens=5,
-            temperature=0.0,
-            # vLLM has higher default max_logprobs (20 instead of 5) to support
-            # both Completion API and Chat Completion API
-            logprobs=21,
-        )
-        ...
-    with pytest.raises(
-        (openai.BadRequestError, openai.APIError)):  # test using token IDs
-        stream = await client.completions.create(
-            model=MODEL_NAME,
-            prompt=[0, 0, 0, 0, 0],
-            max_tokens=5,
-            temperature=0.0,
-            # vLLM has higher default max_logprobs (20 instead of 5) to support
-            # both Completion API and Chat Completion API
-            logprobs=30,
-            stream=True,
-        )
-        async for chunk in stream:
-            ...
-
-    # the server should still work afterwards
-    completion = await client.completions.create(
-        model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-    )
-    assert len(completion.choices[0].text) >= 0
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_completion_streaming(client: openai.AsyncOpenAI,
-                                    model_name: str):
-    prompt = "What is an LLM?"
-
-    single_completion = await client.completions.create(
-        model=model_name,
-        prompt=prompt,
-        max_tokens=5,
-        temperature=0.0,
-    )
-    single_output = single_completion.choices[0].text
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True)
-    chunks: List[str] = []
-    finish_reason_count = 0
-    async for chunk in stream:
-        chunks.append(chunk.choices[0].text)
-        if chunk.choices[0].finish_reason is not None:
-            finish_reason_count += 1
-    # finish reason should only return in last block
-    assert finish_reason_count == 1
-    assert chunk.choices[0].finish_reason == "length"
-    assert chunk.choices[0].text
-    assert "".join(chunks) == single_output
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
-)
-async def test_completion_stream_options(client: openai.AsyncOpenAI,
-                                         model_name: str):
-    prompt = "What is the capital of France?"
-
-    # Test stream=True, stream_options={"include_usage": False}
-    stream = await client.completions.create(
-        model=model_name,
-        prompt=prompt,
-        max_tokens=5,
-        temperature=0.0,
-        stream=True,
-        stream_options={"include_usage": False})
-    async for chunk in stream:
-        assert chunk.usage is None
-
-    # Test stream=True, stream_options={"include_usage": True}
-    stream = await client.completions.create(
-        model=model_name,
-        prompt=prompt,
-        max_tokens=5,
-        temperature=0.0,
-        stream=True,
-        stream_options={"include_usage": True})
-    async for chunk in stream:
-        if chunk.choices[0].finish_reason is None:
-            assert chunk.usage is None
-        else:
-            assert chunk.usage is None
-            final_chunk = await stream.__anext__()
-            assert final_chunk.usage is not None
-            assert final_chunk.usage.prompt_tokens > 0
-            assert final_chunk.usage.completion_tokens > 0
-            assert final_chunk.usage.total_tokens == (
-                final_chunk.usage.prompt_tokens +
-                final_chunk.usage.completion_tokens)
-            assert final_chunk.choices == []
-
-    # Test stream=False, stream_options={"include_usage": None}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(model=model_name,
-                                        prompt=prompt,
-                                        max_tokens=5,
-                                        temperature=0.0,
-                                        stream=False,
-                                        stream_options={"include_usage": None})
-
-    # Test stream=False, stream_options={"include_usage": True}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(model=model_name,
-                                        prompt=prompt,
-                                        max_tokens=5,
-                                        temperature=0.0,
-                                        stream=False,
-                                        stream_options={"include_usage": True})
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # just test 1 lora hereafter
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
-    # test both text and token IDs
-    for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2):
-        # test simple list
-        batch = await client.completions.create(
-            model=model_name,
-            prompt=prompts,
-            max_tokens=5,
-            temperature=0.0,
-        )
-        assert len(batch.choices) == 2
-        assert batch.choices[0].text == batch.choices[1].text
-
-        # test n = 2
-        batch = await client.completions.create(
-            model=model_name,
-            prompt=prompts,
-            n=2,
-            max_tokens=5,
-            temperature=0.0,
-            extra_body=dict(
-                # NOTE: this has to be true for n > 1 in vLLM, but not necessary
-                # for official client.
-                use_beam_search=True),
-        )
-        assert len(batch.choices) == 4
-        assert batch.choices[0].text != batch.choices[
-            1].text, "beam search should be different"
-        assert batch.choices[0].text == batch.choices[
-            2].text, "two copies of the same prompt should be the same"
-        assert batch.choices[1].text == batch.choices[
-            3].text, "two copies of the same prompt should be the same"
-
-        # test streaming
-        batch = await client.completions.create(
-            model=model_name,
-            prompt=prompts,
-            max_tokens=5,
-            temperature=0.0,
-            stream=True,
-        )
-        texts = [""] * 2
-        async for chunk in batch:
-            assert len(chunk.choices) == 1
-            choice = chunk.choices[0]
-            texts[choice.index] += choice.text
-        assert texts[0] == texts[1]
-
-
-@pytest.mark.asyncio
-async def test_logits_bias(client: openai.AsyncOpenAI):
-    prompt = "Hello, my name is"
-    max_tokens = 5
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
-
-    # Test exclusive selection
-    token_id = 1000
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=prompt,
-        max_tokens=max_tokens,
-        temperature=0.0,
-        logit_bias={str(token_id): 100},
-        seed=42,
-    )
-    assert len(completion.choices[0].text) >= 5
-    response_tokens = tokenizer(completion.choices[0].text,
-                                add_special_tokens=False)["input_ids"]
-    expected_tokens = tokenizer(tokenizer.decode([token_id] * 5),
-                                add_special_tokens=False)["input_ids"]
-    assert all([
-        response == expected
-        for response, expected in zip(response_tokens, expected_tokens)
-    ])
-
-    # Test ban
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=prompt,
-        max_tokens=max_tokens,
-        temperature=0.0,
-    )
-    response_tokens = tokenizer(completion.choices[0].text,
-                                add_special_tokens=False)["input_ids"]
-    first_response = completion.choices[0].text
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=prompt,
-        max_tokens=max_tokens,
-        temperature=0.0,
-        logit_bias={str(token): -100
-                    for token in response_tokens},
-    )
-    assert first_response != completion.choices[0].text
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
-async def test_guided_json_completion(client: openai.AsyncOpenAI,
-                                      guided_decoding_backend: str):
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=f"Give an example JSON for an employee profile "
-        f"that fits this schema: {TEST_SCHEMA}",
-        n=3,
-        temperature=1.0,
-        max_tokens=500,
-        extra_body=dict(guided_json=TEST_SCHEMA,
-                        guided_decoding_backend=guided_decoding_backend))
-
-    assert completion.id is not None
-    assert len(completion.choices) == 3
-    for i in range(3):
-        output_json = json.loads(completion.choices[i].text)
-        jsonschema.validate(instance=output_json, schema=TEST_SCHEMA)
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
-async def test_guided_regex_completion(client: openai.AsyncOpenAI,
-                                       guided_decoding_backend: str):
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=f"Give an example IPv4 address with this regex: {TEST_REGEX}",
-        n=3,
-        temperature=1.0,
-        max_tokens=20,
-        extra_body=dict(guided_regex=TEST_REGEX,
-                        guided_decoding_backend=guided_decoding_backend))
-
-    assert completion.id is not None
-    assert len(completion.choices) == 3
-    for i in range(3):
-        assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
-async def test_guided_choice_completion(client: openai.AsyncOpenAI,
-                                        guided_decoding_backend: str):
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt="The best language for type-safe systems programming is ",
-        n=2,
-        temperature=1.0,
-        max_tokens=10,
-        extra_body=dict(guided_choice=TEST_CHOICE,
-                        guided_decoding_backend=guided_decoding_backend))
-
-    assert completion.id is not None
-    assert len(completion.choices) == 2
-    for i in range(2):
-        assert completion.choices[i].text in TEST_CHOICE
-
-
-@pytest.mark.asyncio
-async def test_guided_grammar(client: openai.AsyncOpenAI):
-    simple_sql_grammar = """
-start: select_statement
-
-select_statement: "SELECT" column "from" table "where" condition
-
-column: "col_1" | "col_2"
-table: "table_1" | "table_2"
-condition: column "=" number
-
-number: "1" | "2"
-"""
-
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=("Generate a sql state that select col_1 from "
-                "table_1 where it is equals to 1"),
-        temperature=1.0,
-        max_tokens=500,
-        extra_body=dict(guided_grammar=simple_sql_grammar))
-
-    content = completion.choices[0].text
-
-    # use Lark to parse the output, and make sure it's a valid parse tree
-    from lark import Lark
-    parser = Lark(simple_sql_grammar)
-    parser.parse(content)
-
-    # remove spaces for comparison b/c we removed them in the grammar
-    ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")
-
-    assert content.strip() == ground_truth
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # first test base model, then test loras
-    "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
-)
-@pytest.mark.parametrize("logprobs_arg", [1, 0])
-async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
-                                       model_name: str, logprobs_arg: int):
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
-    # test using text and token IDs
-    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
-        completion = await client.completions.create(model=model_name,
-                                                     prompt=prompt,
-                                                     max_tokens=5,
-                                                     temperature=0.0,
-                                                     echo=True,
-                                                     logprobs=logprobs_arg)
-
-        prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
-                                                             list) else prompt
-        assert re.search(r"^" + prompt_text, completion.choices[0].text)
-        logprobs = completion.choices[0].logprobs
-        assert logprobs is not None
-        assert len(logprobs.text_offset) > 5
-        assert (len(logprobs.token_logprobs) > 5
-                and logprobs.token_logprobs[0] is None)
-        assert (len(logprobs.top_logprobs) > 5
-                and logprobs.top_logprobs[0] is None)
-        for top_logprobs in logprobs.top_logprobs[1:]:
-            assert max(logprobs_arg,
-                       1) <= len(top_logprobs) <= logprobs_arg + 1
-        assert len(logprobs.tokens) > 5
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
-async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
-                                          guided_decoding_backend: str):
-    with pytest.raises(openai.BadRequestError):
-        _ = await client.completions.create(
-            model=MODEL_NAME,
-            prompt="Give an example JSON that fits this schema: 42",
-            extra_body=dict(guided_json=42,
-                            guided_decoding_backend=guided_decoding_backend))
-
-    with pytest.raises(openai.BadRequestError):
-        _ = await client.completions.create(
-            model=MODEL_NAME,
-            prompt="Give an example string that fits this regex",
-            extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA))
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME],
-)
-async def test_tokenize(client: openai.AsyncOpenAI, model_name: str):
-    base_url = str(client.base_url)[:-3]
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")
-
-    for add_special in [False, True]:
-        prompt = "This is a test prompt."
-        tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
-
-        response = requests.post(base_url + "/tokenize",
-                                 json={
-                                     "add_special_tokens": add_special,
-                                     "model": model_name,
-                                     "prompt": prompt
-                                 })
-        response.raise_for_status()
-        assert response.json() == {
-            "tokens": tokens,
-            "count": len(tokens),
-            "max_model_len": 8192
-        }
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME],
-)
-async def test_detokenize(client: openai.AsyncOpenAI, model_name: str):
-    base_url = str(client.base_url)[:-3]
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")
-
-    prompt = "This is a test prompt."
-    tokens = tokenizer.encode(prompt, add_special_tokens=False)
-
-    response = requests.post(base_url + "detokenize",
-                             json={
-                                 "model": model_name,
-                                 "tokens": tokens
-                             })
-    response.raise_for_status()
-    assert response.json() == {"prompt": prompt}
-
-
-if __name__ == "__main__":
-    pytest.main([__file__])
-- 
GitLab


From b90d8cd832669b9ad7c48cd9a431e80836778b56 Mon Sep 17 00:00:00 2001
From: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com>
Date: Fri, 28 Jun 2024 08:20:22 -0700
Subject: [PATCH 187/376] [Distributed] Make it clear that % should not be in
 tensor dict keys. (#5927)

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
---
 tests/distributed/test_parallel_state.py | 10 +++++++++-
 vllm/distributed/parallel_state.py       |  3 +++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/tests/distributed/test_parallel_state.py b/tests/distributed/test_parallel_state.py
index 5d293b2c1..3adcf6b61 100644
--- a/tests/distributed/test_parallel_state.py
+++ b/tests/distributed/test_parallel_state.py
@@ -1,5 +1,6 @@
 from typing import Any, Dict
 
+import pytest
 import torch
 
 from vllm.distributed.parallel_state import (_split_tensor_dict,
@@ -24,6 +25,14 @@ def test_split_tensor_dict():
     assert torch.allclose(tensor_list[2], test_dict["key_c"]["key_2"])
 
 
+def test_split_tensor_dict_invalid_key():
+    test_dict = {
+        "a%b": "a",
+    }
+    with pytest.raises(AssertionError):
+        _split_tensor_dict(test_dict)
+
+
 def test_update_nested_dict():
     flattened_keys_values = [("key1%key2%key3", "value1"),
                              ("key1%key2%key4", "value2"),
@@ -31,7 +40,6 @@ def test_update_nested_dict():
                              ("key8", "value5")]
     res: Dict[str, Any] = {}
 
-    # Update the nested dictionary with each flattened key-value pair
     for flat_key, value in flattened_keys_values:
         _update_nested_dict(res, flat_key, value)
     assert res == {
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 51616cb0f..0c4ee0eb2 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -58,6 +58,9 @@ def _split_tensor_dict(
     metadata_list: List[Tuple[str, Any]] = []
     tensor_list = []
     for key, value in tensor_dict.items():
+        assert "%" not in key, (
+            "Avoid having '%' in key "
+            "as it is used as a separator for nested entries.")
         if isinstance(value, torch.Tensor):
             # Note: we cannot use `value.device` here,
             # because it contains not only the device type but also the device
-- 
GitLab


From b2c620230a6efdc590b06b10f8e89f42362a150a Mon Sep 17 00:00:00 2001
From: Cody Yu <hao.yu.cody@gmail.com>
Date: Fri, 28 Jun 2024 09:17:51 -0700
Subject: [PATCH 188/376] [Spec Decode] Introduce DraftModelRunner (#5799)

---
 tests/spec_decode/test_multi_step_worker.py |   3 +
 tests/spec_decode/utils.py                  |   5 +-
 vllm/sequence.py                            |   3 +
 vllm/spec_decode/draft_model_runner.py      | 170 ++++++++++++++++++++
 vllm/spec_decode/multi_step_worker.py       |  29 ++--
 vllm/spec_decode/spec_decode_worker.py      |   3 +
 vllm/worker/cpu_model_runner.py             |  11 +-
 vllm/worker/embedding_model_runner.py       |  15 +-
 vllm/worker/model_runner.py                 |  10 +-
 vllm/worker/model_runner_base.py            |   3 +-
 vllm/worker/neuron_model_runner.py          |   9 +-
 vllm/worker/tpu_model_runner.py             |   9 +-
 vllm/worker/worker.py                       |   5 +-
 vllm/worker/worker_base.py                  |   9 +-
 vllm/worker/xpu_model_runner.py             |  11 +-
 15 files changed, 258 insertions(+), 37 deletions(-)
 create mode 100644 vllm/spec_decode/draft_model_runner.py

diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py
index a6eb628f9..7744b2640 100644
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -7,6 +7,7 @@ import torch
 
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import ExecuteModelRequest, Logprob, SamplerOutput
+from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.spec_decode.top1_proposer import Top1Proposer
 from vllm.worker.worker import Worker
@@ -85,6 +86,7 @@ def test_same_output_for_single_step():
         block_size,
         num_gpu_blocks,
         seed,
+        model_runner_cls=TP1DraftModelRunner,
     )
     worker = create_worker(
         Worker,
@@ -168,6 +170,7 @@ def test_same_output_for_multi_step():
         block_size,
         num_gpu_blocks,
         seed,
+        model_runner_cls=TP1DraftModelRunner,
     )
 
     worker = create_worker(
diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py
index ce5b34783..68802f0b8 100644
--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
@@ -14,6 +14,7 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
                            SequenceOutput)
 from vllm.utils import get_distributed_init_method, get_ip, get_open_port
 from vllm.worker.cache_engine import CacheEngine
+from vllm.worker.model_runner import ModelRunner
 from vllm.worker.worker import Worker
 
 T = TypeVar("T", bound=Worker)
@@ -66,7 +67,8 @@ def create_worker(cls: Callable[..., T],
                   num_gpu_blocks: int,
                   seed: int,
                   is_driver_worker: bool = True,
-                  enforce_eager: bool = True) -> T:
+                  enforce_eager: bool = True,
+                  model_runner_cls: Optional[ModelRunner] = None) -> T:
     engine_args = EngineArgs(
         model=model_name,
         seed=seed,
@@ -89,6 +91,7 @@ def create_worker(cls: Callable[..., T],
         rank=0,
         distributed_init_method=distributed_init_method,
         is_driver_worker=is_driver_worker,
+        model_runner_cls=model_runner_cls,
     )
 
     worker.init_device()
diff --git a/vllm/sequence.py b/vllm/sequence.py
index a50aaf420..13746cef2 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -880,6 +880,8 @@ class ExecuteModelRequest:
     running_queue_size: int = 0
     # Optional hidden states from prior step.
     previous_hidden_states: Optional[HiddenStates] = None
+    # The number of forward steps to run.
+    num_steps: int = 1
 
     def clone(
         self, seq_group_metadata_list: List[SequenceGroupMetadata]
@@ -893,4 +895,5 @@ class ExecuteModelRequest:
             num_lookahead_slots=self.num_lookahead_slots,
             running_queue_size=self.running_queue_size,
             previous_hidden_states=self.previous_hidden_states,
+            num_steps=self.num_steps,
         )
diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py
new file mode 100644
index 000000000..f30d29376
--- /dev/null
+++ b/vllm/spec_decode/draft_model_runner.py
@@ -0,0 +1,170 @@
+from typing import List, Optional
+
+import torch
+
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+                         ModelConfig, ParallelConfig, SchedulerConfig,
+                         VisionLanguageConfig)
+from vllm.logger import init_logger
+from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata,
+                                      ModelRunner)
+
+logger = init_logger(__name__)
+
+
+class TP1DraftModelRunner(ModelRunner):
+    """Specialized model runner for speculative decoding draft model.
+    Since the draft model always execute k forward passes consecutively to
+    generate k speculative tokens in a single speculative decoding step,
+    we could get rid of most CPU-GPU synchronization and data transfer
+    overheads by keeping model input and output tensors on GPU all the time.
+
+    This runner is still under development so there's no performance gain
+    at this moment. Currently we adopt a temporary solution that caches the
+    seq_group_metadata_list for multi-step execution, so that we can
+    leverage existing prepare_model_input to be compatible with the current
+    execution flow, but we plan to remove this cache and avoid calling
+    prepare_model_input in execute_model at all.
+    
+    The detail development plan includes:
+    1. Use "update_model_input" to update existing model_input without
+       creating a new one.
+    2. Improve the performance of "update_model_input" with a GPU kernel.
+    3. Support TP > 1 (this requires some designs because we do not expect
+       any broadcasting inside execute_model).
+    """
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        cache_config: CacheConfig,
+        load_config: LoadConfig,
+        lora_config: Optional[LoRAConfig],
+        kv_cache_dtype: Optional[str] = "auto",
+        is_driver_worker: bool = False,
+        vision_language_config: Optional[VisionLanguageConfig] = None,
+        return_hidden_states: bool = False,
+    ):
+        if return_hidden_states:
+            raise ValueError(
+                "return_hidden_states is not supported for TP1DraftModelRunner."
+            )
+
+        super().__init__(
+            model_config=model_config,
+            parallel_config=parallel_config,
+            scheduler_config=scheduler_config,
+            device_config=device_config,
+            cache_config=cache_config,
+            load_config=load_config,
+            lora_config=lora_config,
+            kv_cache_dtype=kv_cache_dtype,
+            is_driver_worker=is_driver_worker,
+            vision_language_config=vision_language_config,
+            return_hidden_states=return_hidden_states,
+        )
+
+        # TODO: Remove this cache when we are able to update model_input
+        # directly in advance_step.
+        self.cached_seq_group_metadata_list: Optional[
+            List[SequenceGroupMetadata]] = None
+
+    def prepare_model_input(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+    ) -> ModelInputForGPUWithSamplingMetadata:
+        """A temporary solution that caches the seq_group_metadata_list
+        for multi-step execution.
+        TODO: In-place update model_input and remove this function.
+        """
+        self.cached_seq_group_metadata_list = seq_group_metadata_list
+        return super().prepare_model_input(seq_group_metadata_list)
+
+    def update_model_input(
+            self, model_input: ModelInputForGPUWithSamplingMetadata,
+            last_output: SamplerOutput
+    ) -> ModelInputForGPUWithSamplingMetadata:
+        """Prepare the model inputs for the next step.
+        TODO: In-place update model_input instead of calling
+        prepare_model_input.
+        """
+
+        # Append the output token to the sequence data.
+        assert self.cached_seq_group_metadata_list is not None
+        for seq_group_metadata, sequence_group_outputs in zip(
+                self.cached_seq_group_metadata_list, last_output.outputs):
+            seq_group_metadata.is_prompt = False
+
+            for seq_output in sequence_group_outputs.samples:
+                seq = seq_group_metadata.seq_data[seq_output.parent_seq_id]
+
+                token_id = seq_output.output_token
+                token_logprob = seq_output.logprobs[token_id]
+
+                seq.append_token_id(token_id, token_logprob.logprob)
+                seq.update_num_computed_tokens(1)
+
+        return self.prepare_model_input(self.cached_seq_group_metadata_list)
+
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        model_input: ModelInputForGPUWithSamplingMetadata,
+        kv_caches: List[torch.Tensor],
+        num_steps: int = 1,
+    ) -> Optional[List[SamplerOutput]]:
+        # Since we do not broadcast data inside execute_model anymore,
+        # we need to figure out the best way to support TP > 1 in this
+        # case, because we will at least need to broadcast the sampled
+        # tokens to all workers.
+        if not self.is_driver_worker:
+            raise ValueError("TP1DraftModelRunner only supports TP=1.")
+
+        if self.lora_config:
+            assert model_input.lora_requests is not None
+            assert model_input.lora_mapping is not None
+            self.set_active_loras(model_input.lora_requests,
+                                  model_input.lora_mapping)
+
+        outputs: List[SamplerOutput] = []
+        for step in range(num_steps):
+            # Currently cuda graph is only supported by the decode phase.
+            assert model_input.attn_metadata is not None
+            prefill_meta = model_input.attn_metadata.prefill_metadata
+            decode_meta = model_input.attn_metadata.decode_metadata
+            if prefill_meta is None and decode_meta.use_cuda_graph:
+                assert model_input.input_tokens is not None
+                graph_batch_size = model_input.input_tokens.shape[0]
+                model_executable = self.graph_runners[graph_batch_size]
+            else:
+                model_executable = self.model
+
+            multi_modal_kwargs = model_input.multi_modal_kwargs or {}
+            hidden_states = model_executable(
+                input_ids=model_input.input_tokens,
+                positions=model_input.input_positions,
+                kv_caches=kv_caches,
+                attn_metadata=model_input.attn_metadata,
+                **multi_modal_kwargs,
+            )
+
+            # Compute the logits.
+            logits = self.model.compute_logits(hidden_states,
+                                               model_input.sampling_metadata)
+
+            # Sample the next token.
+            outputs.append(
+                self.model.sample(
+                    logits=logits,
+                    sampling_metadata=model_input.sampling_metadata,
+                ))
+
+            # Prepare the inputs for the next step.
+            if step != num_steps - 1:
+                model_input = self.update_model_input(model_input, outputs[-1])
+
+        return outputs
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index e469fd7c3..c1a02e1d3 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -6,6 +6,7 @@ import torch
 
 from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData,
                            SequenceGroupMetadata)
+from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                          SpeculativeProposer)
 from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
@@ -67,22 +68,24 @@ class MultiStepWorker(Worker, ProposerWorkerBase):
         copied_execute_model_req = execute_model_req.clone(
             copied_seq_group_metadata_list)
 
-        # Assert enough KV space for sample_len tokens per sequence.
-        self._assert_enough_kv_space(execute_model_req.seq_group_metadata_list,
-                                     sample_len)
-
         # Run model sample_len times.
         model_outputs: List[SamplerOutput] = []
-        for _ in range(sample_len):
-            model_output: List[SamplerOutput] = super().execute_model(
+        if isinstance(self.model_runner, TP1DraftModelRunner):
+            copied_execute_model_req.num_steps = sample_len
+            model_outputs = self.execute_model(
                 execute_model_req=copied_execute_model_req)
-            assert (len(model_output) == 1
-                    ), "composing multistep workers not supported"
-            model_output = model_output[0]
-
-            self._append_new_tokens(model_output,
-                                    copied_seq_group_metadata_list)
-            model_outputs.append(model_output)
+        else:
+            # TODO: Remove this branch once DraftModelRunner supports TP>1.
+            for _ in range(sample_len):
+                model_output: List[SamplerOutput] = super().execute_model(
+                    execute_model_req=copied_execute_model_req)
+                assert (len(model_output) == 1
+                        ), "composing multistep workers not supported"
+                model_output = model_output[0]
+
+                self._append_new_tokens(model_output,
+                                        copied_seq_group_metadata_list)
+                model_outputs.append(model_output)
 
         return model_outputs, True
 
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 5089e3dd5..f1e64cae8 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -11,6 +11,7 @@ from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest,
                            HiddenStates, SamplerOutput, SequenceGroupMetadata,
                            get_all_seq_ids)
 from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
+from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                          SpeculativeScorer, SpeculativeScores)
 from vllm.spec_decode.metrics import AsyncMetricsCollector
@@ -117,6 +118,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
             draft_tp = draft_parallel_config.tensor_parallel_size
             target_tp = scorer_worker.parallel_config.tensor_parallel_size
 
+            if draft_tp == 1:
+                draft_worker_kwargs["model_runner_cls"] = TP1DraftModelRunner
             proposer_worker = MultiStepWorker(**draft_worker_kwargs)
             proposer_worker = SmallerTpProposerWorker.maybe_wrap_worker(
                 proposer_worker, draft_tp, target_tp)
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index e689f485e..b83cc6f09 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -351,7 +351,12 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
         self,
         model_input: CPUModelInput,
         kv_caches: List[torch.Tensor],
-    ) -> Optional[SamplerOutput]:
+        num_steps: int = 1,
+    ) -> Optional[List[SamplerOutput]]:
+        if num_steps > 1:
+            raise ValueError(
+                "CPU worker does not support multi-step execution.")
+
         model_executable = self.model
         execute_model_kwargs = {
             "input_ids": model_input.input_tokens,
@@ -371,11 +376,11 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
 
         # Only perform sampling in the driver worker.
         if not self.is_driver_worker:
-            return None
+            return []
 
         # Sample the next token.
         output = self.model.sample(
             logits=logits,
             sampling_metadata=model_input.sampling_metadata,
         )
-        return output
+        return [output]
diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py
index 3c8dfa2c6..272917c72 100644
--- a/vllm/worker/embedding_model_runner.py
+++ b/vllm/worker/embedding_model_runner.py
@@ -57,7 +57,12 @@ class EmbeddingModelRunner(
         self,
         model_input: ModelInputForGPUWithPoolingMetadata,
         kv_caches: List[torch.Tensor],
-    ) -> Optional[PoolerOutput]:
+        num_steps: int = 1,
+    ) -> Optional[List[PoolerOutput]]:
+        if num_steps > 1:
+            raise ValueError(
+                "EmbeddingModelRunner does not support multi-step execution.")
+
         if self.lora_config:
             assert model_input.lora_requests is not None
             assert model_input.lora_mapping is not None
@@ -91,10 +96,12 @@ class EmbeddingModelRunner(
 
         # Only perform pooling in the driver worker.
         if not self.is_driver_worker:
-            return None
+            return []
 
-        return self.model.pooler(hidden_states=hidden_states,
-                                 pooling_metadata=model_input.pooling_metadata)
+        return [
+            self.model.pooler(hidden_states=hidden_states,
+                              pooling_metadata=model_input.pooling_metadata)
+        ]
 
     def make_model_input_from_broadcasted_tensor_dict(
             self,
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 93a10070d..082166030 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -959,7 +959,11 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
         self,
         model_input: ModelInputForGPUWithSamplingMetadata,
         kv_caches: List[torch.Tensor],
-    ) -> SamplerOutput:
+        num_steps: int = 1,
+    ) -> Optional[List[SamplerOutput]]:
+        if num_steps > 1:
+            raise ValueError("num_steps > 1 is not supported in ModelRunner")
+
         if self.lora_config:
             assert model_input.lora_requests is not None
             assert model_input.lora_mapping is not None
@@ -992,7 +996,7 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
 
         # Only perform sampling in the driver worker.
         if not self.is_driver_worker:
-            return None
+            return []
 
         # Sample the next token.
         output: SamplerOutput = self.model.sample(
@@ -1011,7 +1015,7 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
 
             output.hidden_states = hidden_states
 
-        return output
+        return [output]
 
 
 class CUDAGraphRunner:
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py
index 9b1706035..959cfc0b9 100644
--- a/vllm/worker/model_runner_base.py
+++ b/vllm/worker/model_runner_base.py
@@ -150,7 +150,8 @@ class ModelRunnerBase(ABC, Generic[T]):
         self,
         model_input: T,
         kv_caches: Optional[List[torch.Tensor]],
-    ) -> Optional[SamplerOutput]:
+        num_steps: int = 1,
+    ) -> Optional[List[SamplerOutput]]:
         """
         Execute the model on the given input.
         """
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index fec2c97e7..2ccf4a50a 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -207,7 +207,12 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
         self,
         model_input: ModelInputForNeuron,
         kv_caches: Optional[List[torch.Tensor]] = None,
-    ) -> Optional[SamplerOutput]:
+        num_steps: int = 1,
+    ) -> Optional[List[SamplerOutput]]:
+        if num_steps > 1:
+            raise ValueError(
+                "NeuronModelRunner does not support multi-step execution.")
+
         hidden_states = self.model(
             input_ids=model_input.input_tokens,
             positions=model_input.input_positions,
@@ -223,7 +228,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
             logits=logits,
             sampling_metadata=model_input.sampling_metadata,
         )
-        return output
+        return [output]
 
     @property
     def vocab_size(self) -> int:
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index c3ccbd025..7827f7c74 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -444,7 +444,12 @@ class TPUModelRunner:
         self,
         seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
         kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
-    ) -> SamplerOutput:
+        num_steps: int = 1,
+    ) -> List[SamplerOutput]:
+        if num_steps > 1:
+            raise ValueError(
+                "TPUModelRunner does not support multi-step execution.")
+
         assert seq_group_metadata_list is not None
         assert len(seq_group_metadata_list) > 0
         if seq_group_metadata_list[0].is_prompt:
@@ -462,7 +467,7 @@ class TPUModelRunner:
         else:
             sampler_outputs = self._execute_model(seq_group_metadata_list,
                                                   kv_caches)
-        return SamplerOutput(sampler_outputs)
+        return [SamplerOutput(sampler_outputs)]
 
 
 class ModelWrapper(nn.Module):
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index e1944a4f1..156d5278a 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -45,6 +45,7 @@ class Worker(LocalOrDistributedWorkerBase):
         vision_language_config: Optional[VisionLanguageConfig] = None,
         speculative_config: Optional[SpeculativeConfig] = None,
         is_driver_worker: bool = False,
+        model_runner_cls: Optional[Type[GPUModelRunnerBase]] = None,
     ) -> None:
         self.model_config = model_config
         self.parallel_config = parallel_config
@@ -78,7 +79,9 @@ class Worker(LocalOrDistributedWorkerBase):
                   "mlp_speculator") else {"return_hidden_states": True}
 
         ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner
-        if self.model_config.embedding_mode:
+        if model_runner_cls is not None:
+            ModelRunnerClass = model_runner_cls
+        elif self.model_config.embedding_mode:
             ModelRunnerClass = EmbeddingModelRunner
         self.model_runner: GPUModelRunnerBase = ModelRunnerClass(
             model_config,
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 1df60eb1f..d867e15bd 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -228,11 +228,13 @@ class LocalOrDistributedWorkerBase(WorkerBase):
             model_input: ModelRunnerInputBase = (
                 self.model_runner.prepare_model_input(
                     execute_model_req.seq_group_metadata_list))
+            num_steps = execute_model_req.num_steps
 
             if self.do_metadata_broadcast:
                 broadcast_data = worker_input.as_broadcastable_tensor_dict()
                 broadcast_data.update(
                     model_input.as_broadcastable_tensor_dict())
+                broadcast_data["num_steps"] = num_steps
                 broadcast_tensor_dict(broadcast_data, src=0)
         else:
             assert self.do_metadata_broadcast
@@ -240,6 +242,7 @@ class LocalOrDistributedWorkerBase(WorkerBase):
             if not broadcast_data:
                 return None
 
+            num_steps = broadcast_data.pop("num_steps")
             worker_input = WorkerInput.from_broadcasted_tensor_dict(
                 broadcast_data)
             model_input = (
@@ -252,10 +255,8 @@ class LocalOrDistributedWorkerBase(WorkerBase):
         if worker_input.num_seq_groups == 0:
             return []
 
-        output = self.model_runner.execute_model(model_input, self.kv_cache)
-        # Worker only supports single-step execution. Wrap the output in a
-        # list to conform to interface.
-        return [output]
+        return self.model_runner.execute_model(model_input, self.kv_cache,
+                                               num_steps)
 
 
 class WorkerWrapperBase:
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index d9124a788..99fd7da5e 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -334,7 +334,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
         self,
         model_input: ModelInputForXPU,
         kv_caches: List[torch.Tensor],
-    ) -> Optional[SamplerOutput]:
+        num_steps: int = 1,
+    ) -> Optional[List[SamplerOutput]]:
+        if num_steps > 1:
+            raise ValueError(
+                "XPUModelRunner does not support multi-step execution.")
+
         model_executable = self.model
         execute_model_kwargs = {
             "input_ids": model_input.input_tokens,
@@ -354,14 +359,14 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
 
         # Only perform sampling in the driver worker.
         if not self.is_driver_worker:
-            return None
+            return []
 
         # Sample the next token.
         output = self.model.sample(
             logits=logits,
             sampling_metadata=model_input.sampling_metadata,
         )
-        return output
+        return [output]
 
     def _prepare_prompt(
         self,
-- 
GitLab


From 6a2d659d28a9f9f3edbfbae138915d147a9fe79c Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Fri, 28 Jun 2024 13:10:34 -0400
Subject: [PATCH 189/376] [Bugfix] Fix compute datatype for cutlass 3.x
 epilogues (#5931)

---
 .../cutlass_w8a8/scaled_mm_c3x.cu             |   4 +-
 tests/kernels/test_cutlass.py                 | 125 ++++++++++--------
 2 files changed, 70 insertions(+), 59 deletions(-)

diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
index 326ec02ca..b3f5b6208 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
@@ -144,14 +144,14 @@ struct ScaledEpilogueBias
   using ScaleB = typename SUPER::ScaleB;
 
   using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::multiplies, ElementD, ElementD,
+      cutlass::multiplies, float, float,
       cutlass::FloatRoundStyle::round_to_nearest>;
 
   using EVTCompute0 =
       cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;
 
   using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::multiply_add, ElementD, ElementD,
+      cutlass::multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;
 
   using BiasDescriptor =
diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py
index 39de444be..d8e6d27b8 100644
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -2,7 +2,7 @@
 
 Run `pytest tests/kernels/test_cutlass.py`.
 """
-from typing import Type
+from typing import Optional, Type
 
 import pytest
 import torch
@@ -27,12 +27,27 @@ def to_int8(tensor: torch.Tensor):
     return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
 
 
+def baseline_scaled_mm(a: torch.Tensor,
+                       b: torch.Tensor,
+                       scale_a: torch.Tensor,
+                       scale_b: torch.Tensor,
+                       out_dtype: Type[torch.dtype],
+                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+
+    output = (scale_a * (scale_b * (torch.mm(
+        a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype)
+    if bias is not None:
+        output = output + bias
+
+    return output
+
+
 def cutlass_fp8_gemm_helper(m: int,
                             n: int,
                             k: int,
                             per_token_act_quant: bool,
                             per_out_channel_weight_quant: bool,
-                            bias: bool,
+                            use_bias: bool,
                             out_dtype: Type[torch.dtype] = torch.bfloat16,
                             device: str = "cuda"):
     # Test for a cutlass kernel with per-token activation quantization
@@ -43,23 +58,19 @@ def cutlass_fp8_gemm_helper(m: int,
     m_a_scales = m if per_token_act_quant else 1
     n_b_scales = n if per_out_channel_weight_quant else 1
 
-    scale_a = (torch.randn(
-        (m_a_scales, 1), device=device, dtype=torch.float32) / 10)
-    scale_b = (torch.randn(
-        (1, n_b_scales), device=device, dtype=torch.float32) / 10)
-    if bias:
-        # bias term should be > 1 so that the absolute tolerance can catch it
-        bias_t = torch.rand((n, ), device=device, dtype=out_dtype) + 1.0
-        out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias_t)
+    scale_a = (torch.randn((m_a_scales, 1), device=device,
+                           dtype=torch.float32))
+    scale_b = (torch.randn((1, n_b_scales), device=device,
+                           dtype=torch.float32))
+    if use_bias:
+        bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10
     else:
-        out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype)
-        bias_t = 0
+        bias = None
 
-    baseline = (torch.mm(scale_a * a.to(dtype=torch.float32),
-                         scale_b * b.to(dtype=torch.float32)) +
-                bias_t).to(out_dtype)
+    out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+    baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
 
-    assert torch.allclose(out, baseline, rtol=1e-2, atol=1e-1)
+    assert torch.allclose(out, baseline, rtol=1e-2, atol=5e-2)
 
 
 def cutlass_int8_gemm_helper(m: int,
@@ -67,7 +78,7 @@ def cutlass_int8_gemm_helper(m: int,
                              k: int,
                              per_token_act_quant: bool,
                              per_out_channel_weight_quant: bool,
-                             bias: bool,
+                             use_bias: bool,
                              out_dtype: Type[torch.dtype] = torch.bfloat16,
                              device: str = "cuda"):
     # Test for a cutlass kernel with per-token activation quantization
@@ -78,22 +89,19 @@ def cutlass_int8_gemm_helper(m: int,
     m_a_scales = m if per_token_act_quant else 1
     n_b_scales = n if per_out_channel_weight_quant else 1
 
-    scale_a = (torch.randn(
-        (m_a_scales, 1), device=device, dtype=torch.float32) / 10)
-    scale_b = (torch.randn(
-        (1, n_b_scales), device=device, dtype=torch.float32) / 10)
+    scale_a = (torch.randn((m_a_scales, 1), device=device,
+                           dtype=torch.float32))
+    scale_b = (torch.randn((1, n_b_scales), device=device,
+                           dtype=torch.float32))
 
-    if bias:
-        # bias term should be > 1 so that the absolute tolerance can catch it
-        bias_t = torch.rand((n, ), device=device, dtype=out_dtype) + 1.0
-        out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias_t)
+    if use_bias:
+        bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10
     else:
-        out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype)
-        bias_t = 0
+        bias = None
+
+    out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+    baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
 
-    baseline = (torch.mm(scale_a * a.to(dtype=torch.float32),
-                         scale_b * b.to(dtype=torch.float32)) +
-                bias_t).to(dtype=out_dtype)
     assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0)
 
 
@@ -102,12 +110,12 @@ def cutlass_int8_gemm_helper(m: int,
 @pytest.mark.parametrize("k", [128, 496, 1024])
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
-@pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("use_bias", [True, False])
 @pytest.mark.skipif(capability < 89,
                     reason="FP8 is not supported on this GPU type.")
 def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool,
-                          per_out_ch: bool, bias: bool):
-    cutlass_fp8_gemm_helper(m, n, k, per_act_token, per_out_ch, bias)
+                          per_out_ch: bool, use_bias: bool):
+    cutlass_fp8_gemm_helper(m, n, k, per_act_token, per_out_ch, use_bias)
 
 
 @pytest.mark.parametrize("m", [512, 222, 33, 1])
@@ -115,70 +123,70 @@ def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool,
 @pytest.mark.parametrize("k", [128, 496, 1024])
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
-@pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("use_bias", [True, False])
 def test_cutlass_int8_gemm(m: int, n: int, k: int, per_act_token: bool,
-                           per_out_ch: bool, bias: bool):
-    cutlass_int8_gemm_helper(m, n, k, per_act_token, per_out_ch, bias)
+                           per_out_ch: bool, use_bias: bool):
+    cutlass_int8_gemm_helper(m, n, k, per_act_token, per_out_ch, use_bias)
 
 
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
 @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
-@pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("use_bias", [True, False])
 def test_cutlass_int8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool,
                                         out_dtype: Type[torch.dtype],
-                                        bias: bool):
+                                        use_bias: bool):
     cutlass_int8_gemm_helper(512,
                              512,
                              512,
                              per_act_token,
                              per_out_ch,
-                             bias,
+                             use_bias,
                              out_dtype=out_dtype)
 
 
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
 @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
-@pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("use_bias", [True, False])
 @pytest.mark.skipif(capability < 89,
                     reason="FP8 is not supported on this GPU type.")
 def test_cutlass_fp8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool,
                                        out_dtype: Type[torch.dtype],
-                                       bias: bool):
+                                       use_bias: bool):
     cutlass_fp8_gemm_helper(512,
                             512,
                             512,
                             per_act_token,
                             per_out_ch,
-                            bias,
+                            use_bias,
                             out_dtype=out_dtype)
 
 
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
-@pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("use_bias", [True, False])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @pytest.mark.skipif(capability < 89,
                     reason="FP8 is not supported on this GPU type.")
 def test_cutlass_fp8_gemm_devices(per_act_token: bool, per_out_ch: bool,
-                                  bias: bool, device: str):
-    cutlass_fp8_gemm_helper(512, 512, 512, per_act_token, per_out_ch, bias,
+                                  use_bias: bool, device: str):
+    cutlass_fp8_gemm_helper(512, 512, 512, per_act_token, per_out_ch, use_bias,
                             torch.bfloat16, device)
 
 
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
-@pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("use_bias", [True, False])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_cutlass_int8_gemm_devices(per_act_token: bool, per_out_ch: bool,
-                                   bias: bool, device: str):
+                                   use_bias: bool, device: str):
     cutlass_int8_gemm_helper(512,
                              512,
                              512,
                              per_act_token,
                              per_out_ch,
-                             bias,
+                             use_bias,
                              out_dtype=torch.bfloat16,
                              device=device)
 
@@ -190,25 +198,26 @@ def test_cutlass_int8_gemm_devices(per_act_token: bool, per_out_ch: bool,
 # kernel must handle any M thrown at it.
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
-@pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("use_bias", [True, False])
 @pytest.mark.skipif(capability < 89,
                     reason="FP8 is not supported on this GPU type.")
 def test_cutlass_fp8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool,
-                                  bias: bool):
+                                  use_bias: bool):
     for nk in range(32, 128, 32):
         for m in range(1, 128):
-            cutlass_fp8_gemm_helper(m, nk, nk, per_act_token, per_out_ch, bias)
+            cutlass_fp8_gemm_helper(m, nk, nk, per_act_token, per_out_ch,
+                                    use_bias)
 
 
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
-@pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("use_bias", [True, False])
 def test_cutlass_int8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool,
-                                   bias: bool):
+                                   use_bias: bool):
     for nk in range(32, 128, 32):
         for m in range(1, 128):
             cutlass_int8_gemm_helper(m, nk, nk, per_act_token, per_out_ch,
-                                     bias)
+                                     use_bias)
 
 
 # Test working with a subset of A and B
@@ -229,9 +238,11 @@ def test_cutlass_subset():
                                 scale_a,
                                 scale_b,
                                 out_dtype=torch.bfloat16)
-    baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
-                        scale_b *
-                        b.to(dtype=torch.float32)).to(dtype=torch.bfloat16)
+    baseline = baseline_scaled_mm(a,
+                                  b,
+                                  scale_a,
+                                  scale_b,
+                                  out_dtype=torch.bfloat16)
 
     assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0)
 
-- 
GitLab


From b185230744ae36a612527a9864c27f685acc6ef3 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Fri, 28 Jun 2024 13:49:57 -0400
Subject: [PATCH 190/376] [ Misc ] Remove `fp8_shard_indexer` from Col/Row
 Parallel Linear (Simplify Weight Loading) (#5928)

Co-authored-by: Robert Shaw <rshaw@neuralmagic>
---
 vllm/model_executor/layers/linear.py | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 45f805547..fe7c2a295 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -269,10 +269,6 @@ class ColumnParallelLinear(LinearBase):
             self.register_parameter("bias", None)
 
     def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
-        # Special case for Fp8 scales.
-        fp8_scales_shard_indexer = getattr(param, "fp8_scales_shard_indexer",
-                                           None)
-
         tp_rank = get_tensor_model_parallel_rank()
         output_dim = getattr(param, "output_dim", None)
         param_data = param.data
@@ -281,11 +277,11 @@ class ColumnParallelLinear(LinearBase):
             start_idx = tp_rank * shard_size
             loaded_weight = loaded_weight.narrow(output_dim, start_idx,
                                                  shard_size)
-        # Special case for Fp8 scales.
-        elif fp8_scales_shard_indexer is not None:
-            param_data, loaded_weight = fp8_scales_shard_indexer(param_data,
-                                                                 loaded_weight,
-                                                                 shard_id=0)
+
+        # Special case for loading scales off disk, which often do not
+        # have a shape (such as in the case of AutoFP8).
+        if len(loaded_weight.shape) == 0:
+            loaded_weight = loaded_weight.reshape(1)
 
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
@@ -751,10 +747,6 @@ class RowParallelLinear(LinearBase):
             self.register_parameter("bias", None)
 
     def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
-        # Special case for Fp8 scales.
-        fp8_scales_shard_indexer = getattr(param, "fp8_scales_shard_indexer",
-                                           None)
-
         tp_rank = get_tensor_model_parallel_rank()
         input_dim = getattr(param, "input_dim", None)
         param_data = param.data
@@ -764,13 +756,9 @@ class RowParallelLinear(LinearBase):
             loaded_weight = loaded_weight.narrow(input_dim, start_idx,
                                                  shard_size)
 
-        # Special case for Fp8 scales.
-        elif fp8_scales_shard_indexer is not None:
-            param_data, loaded_weight = fp8_scales_shard_indexer(param_data,
-                                                                 loaded_weight,
-                                                                 shard_id=0)
-
-        if fp8_scales_shard_indexer is None and len(loaded_weight.shape) == 0:
+        # Special case for loading scales off disk, which often do not
+        # have a shape (such as in the case of AutoFP8).
+        if len(loaded_weight.shape) == 0:
             loaded_weight = loaded_weight.reshape(1)
 
         assert param_data.shape == loaded_weight.shape
-- 
GitLab


From 2cd402e1692417b7645e4ece11bc2ab91072f47c Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Fri, 28 Jun 2024 14:43:49 -0400
Subject: [PATCH 191/376] [ Bugfix ] Enabling Loading Models With Fused QKV/MLP
 on Disk with FP8 (#5921)

Co-authored-by: Robert Shaw <rshaw@neuralmagic>
---
 vllm/model_executor/layers/linear.py          | 14 ++++++-
 .../model_executor/layers/quantization/fp8.py | 41 +++++++++----------
 2 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index fe7c2a295..d221fecd6 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -383,8 +383,13 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                                            None)
 
         if loaded_shard_id is None:
-            # Loaded weight is already packed.
+            # Loaded weight is already fused on disk (qkv/mlp).
             if output_dim is None:
+                # If fp8 + scale, need to send to each shard.
+                if fp8_scales_shard_indexer is not None:
+                    param_data, loaded_weight = fp8_scales_shard_indexer(
+                        param_data, loaded_weight, loaded_shard_id)
+
                 assert param_data.shape == loaded_weight.shape
                 param_data.copy_(loaded_weight)
                 return
@@ -567,8 +572,13 @@ class QKVParallelLinear(ColumnParallelLinear):
                                            None)
 
         if loaded_shard_id is None:
-            # Loaded weight is already packed.
+            # Loaded weight is already fused on disk (qkv/mlp).
             if output_dim is None:
+                # If fp8 + scale, need to send to each shard.
+                if fp8_scales_shard_indexer is not None:
+                    param_data, loaded_weight = fp8_scales_shard_indexer(
+                        param_data, loaded_weight, loaded_shard_id)
+
                 assert param_data.shape == loaded_weight.shape
                 param_data.copy_(loaded_weight)
                 return
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index bbf3cde54..1c760566c 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -98,6 +98,7 @@ class Fp8LinearMethod(LinearMethodBase):
     """
 
     def __init__(self, quant_config: Fp8Config):
+        self.fused_module_in_checkpoint = False
         self.quant_config = quant_config
         self.cutlass_fp8_supported = cutlass_fp8_supported()
 
@@ -111,6 +112,7 @@ class Fp8LinearMethod(LinearMethodBase):
         scale = Parameter(torch.empty(len(output_partition_sizes),
                                       dtype=torch.float32),
                           requires_grad=False)
+        scale[:] = torch.finfo(torch.float8_e4m3fn).min
         layer.register_parameter(scale_name, scale)
         set_weight_attrs(
             scale, {
@@ -169,11 +171,15 @@ class Fp8LinearMethod(LinearMethodBase):
                     **extra_weight_attrs)
 
     def scales_shard_indexer(
-            self, param: torch.Tensor, loaded_weight: torch.Tensor,
-            shard_id: Union[str, int]) -> Tuple[torch.Tensor, torch.Tensor]:
+        self, param: torch.Tensor, loaded_weight: torch.Tensor,
+        shard_id: Optional[Union[str,
+                                 int]]) -> Tuple[torch.Tensor, torch.Tensor]:
         qkv_idxs = {"q": 0, "k": 1, "v": 2}
 
-        if isinstance(shard_id, int):
+        if shard_id is None:
+            shard_id = 0
+            self.fused_module_in_checkpoint = True
+        elif isinstance(shard_id, int):
             pass
         elif isinstance(shard_id, str):
             if shard_id not in qkv_idxs:
@@ -205,15 +211,17 @@ class Fp8LinearMethod(LinearMethodBase):
             # WEIGHT_SCALE / WEIGHT
             #   Loop over logical weights, requantizing with single scale.
             max_w_scale = layer.weight_scale.max()
-            start = 0
-            for idx, logical_width in enumerate(layer.logical_widths):
-                end = start + logical_width
-                weight_dq = per_tensor_dequantize(layer.weight[start:end, :],
-                                                  layer.weight_scale[idx])
-
-                layer.weight[start:end, :] = per_tensor_quantize(
-                    weight_dq, layer.weight_scale.max())
-                start = end
+
+            if not self.fused_module_in_checkpoint:
+                start = 0
+                for idx, logical_width in enumerate(layer.logical_widths):
+                    end = start + logical_width
+                    weight_dq = per_tensor_dequantize(
+                        layer.weight[start:end, :], layer.weight_scale[idx])
+
+                    layer.weight[start:end, :] = per_tensor_quantize(
+                        weight_dq, layer.weight_scale.max())
+                    start = end
             layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
 
             # WEIGHT
@@ -227,10 +235,6 @@ class Fp8LinearMethod(LinearMethodBase):
             if self.quant_config.activation_scheme == "dynamic":
                 layer.input_scale = None
             elif self.quant_config.activation_scheme == "static":
-                if not all_close_1d(layer.input_scale):
-                    raise ValueError(
-                        "All the input_scales for the logical weights of a "
-                        f"layer must be equal. But got {layer.input_scale}")
                 layer.input_scale = Parameter(layer.input_scale.max(),
                                               requires_grad=False)
             else:
@@ -317,11 +321,6 @@ class Fp8KVCacheMethod(QuantizeMethodBase):
         del layer.kv_scale
 
 
-def all_close_1d(x: torch.Tensor) -> bool:
-    assert len(x.shape) == 1
-    return all(torch.allclose(x[0], x[i]) for i in range(x.shape[0]))
-
-
 def per_tensor_quantize(tensor: torch.Tensor,
                         inv_scale: Union[float, torch.Tensor]) -> torch.Tensor:
     finfo = torch.finfo(torch.float8_e4m3fn)
-- 
GitLab


From be0b3af9e068418726fa2b1dccef966e39024fd5 Mon Sep 17 00:00:00 2001
From: wangding zeng <155410488+zwd003@users.noreply.github.com>
Date: Sat, 29 Jun 2024 04:24:57 +0800
Subject: [PATCH 192/376] Support Deepseek-V2 (#4650)

Co-authored-by: Philipp Moritz <pcmoritz@gmail.com>
---
 vllm/config.py                                |   6 +
 .../layers/fused_moe/__init__.py              |   3 +-
 .../layers/fused_moe/fused_moe.py             |  31 +
 .../model_executor/layers/rotary_embedding.py | 126 +++++
 vllm/model_executor/models/__init__.py        |   1 +
 vllm/model_executor/models/deepseek_v2.py     | 534 ++++++++++++++++++
 6 files changed, 700 insertions(+), 1 deletion(-)
 create mode 100644 vllm/model_executor/models/deepseek_v2.py

diff --git a/vllm/config.py b/vllm/config.py
index 05bc57062..3551e8f6f 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -297,6 +297,12 @@ class ModelConfig:
         return self.hf_text_config.hidden_size
 
     def get_head_size(self) -> int:
+        # TODO remove hard code
+        if hasattr(self.hf_text_config, "model_type"
+                   ) and self.hf_text_config.model_type == 'deepseek_v2':
+            # FlashAttention supports only head_size 32, 64, 128, 256,
+            # we need to pad head_size 192 to 256
+            return 256
         if hasattr(self.hf_text_config, "head_dim"):
             return self.hf_text_config.head_dim
         # FIXME(woosuk): This may not be true for all models.
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index 2926c7d1c..1dafae503 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,9 +1,10 @@
 from vllm.model_executor.layers.fused_moe.fused_moe import (
-    fused_experts, fused_moe, fused_topk, get_config_file_name)
+    fused_experts, fused_moe, fused_topk, get_config_file_name, grouped_topk)
 
 __all__ = [
     "fused_moe",
     "fused_topk",
     "fused_experts",
     "get_config_file_name",
+    "grouped_topk",
 ]
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 4d0160ff2..b750fc713 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -367,6 +367,37 @@ def fused_topk(
     return topk_weights, topk_ids
 
 
+# This is used by the Deepseek-V2 model
+def grouped_topk(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: int = 0,
+    topk_group: int = 0,
+):
+    scores = torch.softmax(gating_output, dim=-1)
+    num_token = scores.shape[0]
+    group_scores = scores.view(num_token, num_expert_group,
+                               -1).max(dim=-1).values  # [n, n_group]
+    group_idx = torch.topk(group_scores, k=topk_group, dim=-1,
+                           sorted=False)[1]  # [n, top_k_group]
+    group_mask = torch.zeros_like(group_scores)  # [n, n_group]
+    group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
+    score_mask = group_mask.unsqueeze(-1).expand(
+        num_token, num_expert_group,
+        scores.shape[-1] // num_expert_group).reshape(num_token, -1)  # [n, e]
+    tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
+    topk_weights, topk_ids = torch.topk(tmp_scores,
+                                        k=topk,
+                                        dim=-1,
+                                        sorted=False)
+
+    if renormalize:
+        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+    return topk_weights, topk_ids
+
+
 def fused_experts(hidden_states: torch.Tensor,
                   w1: torch.Tensor,
                   w2: torch.Tensor,
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 9e53deef0..1285627ec 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -610,6 +610,119 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
         return query.flatten(-2), key.flatten(-2)
 
 
+def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
+    if scale <= 1:
+        return 1.0
+    return 0.1 * mscale * math.log(scale) + 1.0
+
+
+class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
+    """RotaryEmbedding extended with YaRN method.
+
+    Credits to Peng et al. github.com/jquesnelle/yarn
+    """
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        scaling_factor: float,
+        dtype: torch.dtype,
+        *,
+        extrapolation_factor: float = 1,
+        attn_factor: float = 1,
+        beta_fast: int = 32,
+        beta_slow: int = 1,
+        mscale: float = 1,
+        mscale_all_dim: float = 0,
+    ) -> None:
+        self.scaling_factor = scaling_factor
+        self.extrapolation_factor = extrapolation_factor
+        self.attn_factor = attn_factor
+        self.beta_fast = beta_fast
+        self.beta_slow = beta_slow
+        # Get n-d magnitude scaling corrected for interpolation.
+        self.mscale = float(
+            yarn_get_mscale(self.scaling_factor, float(mscale)) /
+            yarn_get_mscale(self.scaling_factor, float(mscale_all_dim)) *
+            attn_factor)
+        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
+                         is_neox_style, dtype)
+
+    def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
+        pos_freqs = self.base**(torch.arange(
+            0, self.rotary_dim, 2, dtype=torch.float, device="cuda") /
+                                self.rotary_dim)
+        inv_freq_extrapolation = 1.0 / pos_freqs
+        inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs)
+
+        low, high = _yarn_find_correction_range(self.beta_fast, self.beta_slow,
+                                                self.rotary_dim, self.base,
+                                                self.max_position_embeddings)
+        # Get n-d rotational scaling corrected for extrapolation
+        inv_freq_mask = (1 - _yarn_linear_ramp_mask(
+            low, high, self.rotary_dim // 2,
+            dtype=torch.float)) * self.extrapolation_factor
+        inv_freq = inv_freq_interpolation * (
+            1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask
+        return inv_freq
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        inv_freq = self._compute_inv_freq(self.scaling_factor)
+        t = torch.arange(self.max_position_embeddings * self.scaling_factor,
+                         device="cuda",
+                         dtype=torch.float32)
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        cos = (freqs.cos() * self.mscale)
+        sin = (freqs.sin() * self.mscale)
+        cache = torch.cat((cos, sin), dim=-1)
+        print("Cache shape", cache.shape)
+        return cache
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """PyTorch-native implementation equivalent to forward()."""
+        query_rot = query[..., :self.rotary_dim]
+        key_rot = key[..., :self.rotary_dim]
+        if self.rotary_dim < self.head_size:
+            query_pass = query[..., self.rotary_dim:]
+            key_pass = key[..., self.rotary_dim:]
+
+        self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(
+            positions.device)
+        cos_sin = self.cos_sin_cache[torch.add(positions, offsets)
+                                     if offsets is not None else positions]
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        if self.is_neox_style:
+            # NOTE(woosuk): Here we assume that the positions tensor has the
+            # shape [batch_size, seq_len].
+            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
+            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
+        else:
+            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
+            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
+
+        rotate_fn = _rotate_neox if self.is_neox_style else _rotate_gptj
+        query_rot = query_rot * cos + rotate_fn(query_rot) * sin
+        key_rot = key_rot * cos + rotate_fn(key_rot) * sin
+
+        if self.rotary_dim < self.head_size:
+            query = torch.cat((query_rot, query_pass), dim=-1)
+            key = torch.cat((key_rot, key_pass), dim=-1)
+        else:
+            query = query_rot
+            key = key_rot
+        return query, key
+
+
 class GemmaRotaryEmbedding(RotaryEmbedding):
 
     def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
@@ -679,6 +792,19 @@ def get_rope(
                                                     base, is_neox_style,
                                                     scaling_factor, dtype,
                                                     **extra_kwargs)
+        elif scaling_type == "deepseek_yarn":
+            original_max_position = rope_scaling[
+                "original_max_position_embeddings"]
+            # assert max_position == original_max_position * scaling_factor
+            extra_kwargs = {
+                k: v
+                for k, v in rope_scaling.items()
+                if k in ("extrapolation_factor", "attn_factor", "beta_fast",
+                         "beta_slow", "mscale", "mscale_all_dim")
+            }
+            rotary_emb = DeepseekScalingRotaryEmbedding(
+                head_size, rotary_dim, original_max_position, base,
+                is_neox_style, scaling_factor, dtype, **extra_kwargs)
         # The correct one should be "longrope" but keep "su" here
         # for backward compatible
         elif scaling_type == "su" or scaling_type == "longrope":
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index e7ced618c..69a65ff02 100755
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -21,6 +21,7 @@ _GENERATION_MODELS = {
     "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"),
     "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),
     "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
+    "DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"),
     "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
     "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
     "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"),
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
new file mode 100644
index 000000000..3d4f78c66
--- /dev/null
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -0,0 +1,534 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# Copyright 2023 The vLLM team.
+# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only DeepseekV2 model."""
+from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from vllm.attention import Attention, AttentionMetadata
+from vllm.config import CacheConfig
+from vllm.distributed import (get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size,
+                              tensor_model_parallel_all_reduce)
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe import fused_experts, grouped_topk
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               MergedColumnParallelLinear,
+                                               ReplicatedLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import SamplerOutput
+
+
+class DeepseekV2MLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size, [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config)
+        self.down_proj = RowParallelLinear(intermediate_size,
+                                           hidden_size,
+                                           bias=False,
+                                           quant_config=quant_config,
+                                           reduce_results=reduce_results)
+        if hidden_act != "silu":
+            raise ValueError(f"Unsupported activation: {hidden_act}. "
+                             "Only silu is supported for now.")
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class DeepseekV2MoE(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.rank = get_tensor_model_parallel_rank()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.n_routed_experts = config.n_routed_experts
+        self.top_k = config.num_experts_per_tok
+        self.routed_scaling_factor = config.routed_scaling_factor
+        if self.tp_size > self.n_routed_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {self.n_routed_experts}.")
+
+        self.experts = nn.ModuleList([
+            DeepseekV2MLP(hidden_size=config.hidden_size,
+                          intermediate_size=config.moe_intermediate_size,
+                          hidden_act=config.hidden_act,
+                          quant_config=quant_config,
+                          reduce_results=False)
+            for idx in range(self.n_routed_experts)
+        ])
+        self.pack_params()
+
+        self.gate = ReplicatedLinear(config.hidden_size,
+                                     self.n_routed_experts,
+                                     bias=False,
+                                     quant_config=None)
+
+        if config.n_shared_experts is not None:
+            intermediate_size = (config.moe_intermediate_size *
+                                 config.n_shared_experts)
+            self.shared_experts = DeepseekV2MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+            )
+
+    def pack_params(self):
+        w1 = []
+        w2 = []
+        for expert in self.experts:
+            w1.append(expert.gate_up_proj.weight)
+            w2.append(expert.down_proj.weight)
+        self.w1 = torch._utils._flatten_dense_tensors(w1)
+        w1s = torch._utils._unflatten_dense_tensors(self.w1, w1)
+        for data, param in zip(w1s, w1):
+            param.data = data
+        self.w1 = self.w1.view(len(w1), *w1s[0].shape)
+
+        self.w2 = torch._utils._flatten_dense_tensors(w2)
+        w2s = torch._utils._unflatten_dense_tensors(self.w2, w2)
+        for data, param in zip(w2s, w2):
+            param.data = data
+
+        self.w2 = self.w2.view(len(w2), *w2s[0].shape)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        if self.config.n_shared_experts is not None:
+            shared_output = self.shared_experts(hidden_states)
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        topk_weights, topk_ids = grouped_topk(
+            hidden_states,
+            router_logits,
+            self.top_k,
+            renormalize=self.config.norm_topk_prob,
+            num_expert_group=self.config.n_group,
+            topk_group=self.config.topk_group)
+        final_hidden_states = fused_experts(
+            hidden_states,
+            self.w1,
+            self.w2,
+            topk_weights,
+            topk_ids,
+            inplace=True) * self.routed_scaling_factor
+        if self.config.n_shared_experts is not None:
+            final_hidden_states = final_hidden_states + shared_output
+        final_hidden_states = tensor_model_parallel_all_reduce(
+            final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
+    import math
+    if scale <= 1:
+        return 1.0
+    return 0.1 * mscale * math.log(scale) + 1.0
+
+
+class DeepseekV2Attention(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: int,
+        kv_lora_rank: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        layer_idx=None,
+    ) -> None:
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.hidden_size = hidden_size
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.num_heads = num_heads
+        tp_size = get_tensor_model_parallel_world_size()
+        assert num_heads % tp_size == 0
+        self.num_local_heads = num_heads // tp_size
+        self.scaling = self.qk_head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        if self.q_lora_rank is not None:
+            self.q_a_proj = ReplicatedLinear(self.hidden_size,
+                                             self.q_lora_rank,
+                                             bias=False,
+                                             quant_config=quant_config)
+            self.q_a_layernorm = RMSNorm(self.q_lora_rank,
+                                         eps=config.rms_norm_eps)
+            self.q_b_proj = ColumnParallelLinear(q_lora_rank,
+                                                 self.num_heads *
+                                                 self.qk_head_dim,
+                                                 bias=False,
+                                                 quant_config=quant_config)
+        else:
+            self.q_proj = ColumnParallelLinear(self.hidden_size,
+                                               self.num_heads *
+                                               self.qk_head_dim,
+                                               bias=False,
+                                               quant_config=quant_config)
+
+        self.kv_a_proj_with_mqa = ReplicatedLinear(self.hidden_size,
+                                                   self.kv_lora_rank +
+                                                   self.qk_rope_head_dim,
+                                                   bias=False,
+                                                   quant_config=quant_config)
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank,
+                                      eps=config.rms_norm_eps)
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config)
+        # O projection.
+        self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim,
+                                        self.hidden_size,
+                                        bias=False,
+                                        quant_config=quant_config)
+        rope_scaling['type'] = 'deepseek_yarn'
+        self.rotary_emb = get_rope(qk_rope_head_dim,
+                                   rotary_dim=qk_rope_head_dim,
+                                   max_position=max_position_embeddings,
+                                   base=rope_theta,
+                                   rope_scaling=rope_scaling,
+                                   is_neox_style=False)
+
+        if rope_scaling:
+            mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
+            scaling_factor = rope_scaling["factor"]
+            mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+            self.scaling = self.scaling * mscale * mscale
+
+        # self.attn = Attention(self.num_heads,
+        #                       self.qk_head_dim,
+        #                       self.scaling,
+        #                       num_kv_heads=self.num_heads)
+
+        # TODO, support head_size 192
+        self.attn = Attention(self.num_local_heads,
+                              256,
+                              self.scaling,
+                              num_kv_heads=self.num_local_heads,
+                              cache_config=cache_config,
+                              quant_config=quant_config)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        if self.q_lora_rank is not None:
+            q = self.q_a_proj(hidden_states)[0]
+            q = self.q_a_layernorm(q)
+            q = self.q_b_proj(q)[0].view(-1, self.num_local_heads,
+                                         self.qk_head_dim)
+        else:
+            q = self.q_proj(hidden_states)[0].view(-1, self.num_local_heads,
+                                                   self.qk_head_dim)
+        q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim],
+                               dim=-1)
+        latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
+        kv_a, _ = latent_cache.split(
+            [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+        latent_cache = latent_cache.unsqueeze(1)
+        kv_a = self.kv_a_layernorm(kv_a.contiguous())
+        kv = self.kv_b_proj(kv_a)[0]
+        kv = kv.view(-1, self.num_local_heads,
+                     self.qk_nope_head_dim + self.v_head_dim)
+        k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+        k_pe = latent_cache[:, :, self.kv_lora_rank:]
+        q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+        q[..., self.qk_nope_head_dim:] = q_pe
+        k = torch.empty_like(q)
+        k[..., :self.qk_nope_head_dim] = k_nope
+        k[..., self.qk_nope_head_dim:] = k_pe
+        q = torch.nn.functional.pad(q, [0, 256 - self.qk_head_dim],
+                                    value=0).view(-1,
+                                                  self.num_local_heads * 256)
+        k = torch.nn.functional.pad(k, [0, 256 - self.qk_head_dim],
+                                    value=0).view(-1,
+                                                  self.num_local_heads * 256)
+        v = torch.nn.functional.pad(v, [0, 256 - self.v_head_dim],
+                                    value=0).view(-1,
+                                                  self.num_local_heads * 256)
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
+        attn_output = attn_output.view(
+            -1, self.num_local_heads, 256)[..., :self.v_head_dim].reshape(
+                -1, self.num_local_heads * self.v_head_dim)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class DeepseekV2DecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_idx: int,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings",
+                                          8192)
+        self.self_attn = DeepseekV2Attention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            qk_nope_head_dim=config.qk_nope_head_dim,
+            qk_rope_head_dim=config.qk_rope_head_dim,
+            v_head_dim=config.v_head_dim,
+            q_lora_rank=config.q_lora_rank
+            if hasattr(config, "q_lora_rank") else None,
+            kv_lora_rank=config.kv_lora_rank,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            layer_idx=layer_idx,
+        )
+        if (config.n_routed_experts is not None
+                and layer_idx >= config.first_k_dense_replace
+                and layer_idx % config.moe_layer_freq == 0):
+            self.mlp = DeepseekV2MoE(config=config, quant_config=quant_config)
+        else:
+            self.mlp = DeepseekV2MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size,
+                                                eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+        residual: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(
+                hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            attn_metadata=attn_metadata,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(
+            hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class DeepseekV2Model(nn.Module):
+
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = nn.ModuleList([
+            DeepseekV2DecoderLayer(config,
+                                   layer_idx,
+                                   cache_config=cache_config,
+                                   quant_config=quant_config)
+            for layer_idx in range(config.num_hidden_layers)
+        ])
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(positions, hidden_states,
+                                            kv_caches[i], attn_metadata,
+                                            residual)
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class DeepseekV2ForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = DeepseekV2Model(config, cache_config, quant_config)
+        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.sampler = Sampler()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, kv_caches,
+                                   attn_metadata)
+        return hidden_states
+
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def sample(
+        self,
+        logits: Optional[torch.Tensor],
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip experts that are not assigned to this worker.
+                if (("mlp.experts." in name or "mlp.shared_experts." in name)
+                        and name not in params_dict):
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip experts that are not assigned to this worker.
+                if (("mlp.experts." in name or "mlp.shared_experts." in name)
+                        and name not in params_dict):
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
-- 
GitLab


From 4bf35ed9ae3cb2ee2efd8f5b9ced620ca9836240 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Fri, 28 Jun 2024 17:12:40 -0400
Subject: [PATCH 193/376] [Bugfix] Only add `Attention.kv_scale` if kv cache
 quantization is enabled (#5936)

---
 vllm/attention/layer.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index db55a3147..dfe93be46 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -9,6 +9,7 @@ from vllm.attention.selector import get_attn_backend
 from vllm.config import CacheConfig
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
+from vllm.model_executor.layers.quantization.fp8 import Fp8KVCacheMethod
 
 
 class Attention(nn.Module):
@@ -56,15 +57,19 @@ class Attention(nn.Module):
         quant_method = quant_config.get_quant_method(
             self) if quant_config else None
         if quant_method is not None:
-            if self.kv_cache_dtype == "fp8_e5m2":
-                raise ValueError("fp8_e5m2 kv-cache is not supported with "
-                                 "fp8 checkpoints.")
-            # When FP8 quantization is enabled, we make a parameter
-            # "kv_scale" so that it can be loaded from FP8 checkpoint.
-            # The kv_scale will then be converted back
-            # to self._kv_scale in a native float32 value after weight loading.
-            self.quant_method = quant_method
-            self.quant_method.create_weights(self)
+            assert isinstance(quant_method, Fp8KVCacheMethod)
+            # TODO (mgoin): kv cache dtype should be specified in the FP8
+            # checkpoint config and become the "auto" behavior
+            if "fp8" in self.kv_cache_dtype:
+                if self.kv_cache_dtype == "fp8_e5m2":
+                    raise ValueError("fp8_e5m2 kv-cache is not supported with "
+                                     "fp8 checkpoints.")
+                # When FP8 quantization is enabled, we make a parameter
+                # "kv_scale" so that it can be loaded from FP8 checkpoint.
+                # The kv_scale will then be converted back to self._kv_scale
+                # in a native float32 value after weight loading.
+                self.quant_method = quant_method
+                self.quant_method.create_weights(self)
 
         # During model initialization, the default dtype is set as the model
         # weight and activation dtype.
-- 
GitLab


From 5d2a1a9cf0f47ac4f1676c17a835a05d4b4e4175 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Fri, 28 Jun 2024 17:34:56 -0400
Subject: [PATCH 194/376] Unmark more files as executable (#5962)

---
 csrc/punica/bgmv/bgmv_config.h         | 0
 examples/offline_inference_neuron.py   | 0
 vllm/model_executor/models/__init__.py | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 csrc/punica/bgmv/bgmv_config.h
 mode change 100755 => 100644 examples/offline_inference_neuron.py
 mode change 100755 => 100644 vllm/model_executor/models/__init__.py

diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h
old mode 100755
new mode 100644
diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference_neuron.py
old mode 100755
new mode 100644
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
old mode 100755
new mode 100644
-- 
GitLab


From 6a62cb82ccace1209f7b8bbec95025e047a95ded Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Fri, 28 Jun 2024 17:46:30 -0400
Subject: [PATCH 195/376] [Bugfix] Fix Engine Failing After Invalid Request -
 AsyncEngineDeadError (#5963)

Co-authored-by: Robert Shaw <rshaw@neuralmagic>
---
 vllm/entrypoints/openai/protocol.py | 34 ++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 7fb1af158..0ad46cbea 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -234,15 +234,22 @@ class ChatCompletionRequest(OpenAIBaseModel):
 
         logits_processors = None
         if self.logit_bias:
+            logit_bias: Dict[int, float] = {}
+            try:
+                for token_id, bias in self.logit_bias.items():
+                    # Convert token_id to integer before we add to LLMEngine
+                    # Clamp the bias between -100 and 100 per OpenAI API spec
+                    logit_bias[int(token_id)] = min(100, max(-100, bias))
+            except ValueError as exc:
+                raise ValueError(f"Found token_id `{token_id}` in logit_bias "
+                                 f"but token_id must be an integer or string "
+                                 f"representing an integer") from exc
 
             def logit_bias_logits_processor(
                     token_ids: List[int],
                     logits: torch.Tensor) -> torch.Tensor:
-                assert self.logit_bias is not None
-                for token_id, bias in self.logit_bias.items():
-                    # Clamp the bias between -100 and 100 per OpenAI API spec
-                    bias = min(100, max(-100, bias))
-                    logits[int(token_id)] += bias
+                for token_id, bias in logit_bias.items():
+                    logits[token_id] += bias
                 return logits
 
             logits_processors = [logit_bias_logits_processor]
@@ -419,15 +426,22 @@ class CompletionRequest(OpenAIBaseModel):
 
         logits_processors = None
         if self.logit_bias:
+            logit_bias: Dict[int, float] = {}
+            try:
+                for token_id, bias in self.logit_bias.items():
+                    # Convert token_id to integer
+                    # Clamp the bias between -100 and 100 per OpenAI API spec
+                    logit_bias[int(token_id)] = min(100, max(-100, bias))
+            except ValueError as exc:
+                raise ValueError(f"Found token_id `{token_id}` in logit_bias "
+                                 f"but token_id must be an integer or string "
+                                 f"representing an integer") from exc
 
             def logit_bias_logits_processor(
                     token_ids: List[int],
                     logits: torch.Tensor) -> torch.Tensor:
-                assert self.logit_bias is not None
-                for token_id, bias in self.logit_bias.items():
-                    # Clamp the bias between -100 and 100 per OpenAI API spec
-                    bias = min(100, max(-100, bias))
-                    logits[int(token_id)] += bias
+                for token_id, bias in logit_bias.items():
+                    logits[token_id] += bias
                 return logits
 
             logits_processors = [logit_bias_logits_processor]
-- 
GitLab


From 7041de43849fda7c8e931f0726f3db2a0d8015a4 Mon Sep 17 00:00:00 2001
From: Lily Liu <lilyliupku@gmail.com>
Date: Fri, 28 Jun 2024 15:28:49 -0700
Subject: [PATCH 196/376] [Kernel] Flashinfer for prefill & decode, with
 Cudagraph support for decode (#4628)

Co-authored-by: LiuXiaoxuanPKU <llilyliupku@gmail.com>, bong-furiosa <bongwon.jang@furiosa.ai>
---
 .buildkite/test-pipeline.yaml                 |   3 +
 requirements-test.txt                         |   2 +-
 .../test_basic_correctness.py                 |   6 -
 .../test_basic_distributed_correctness.py     |   5 -
 vllm/attention/backends/flashinfer.py         |  83 +++--
 vllm/attention/selector.py                    |   5 +-
 vllm/worker/model_runner.py                   | 326 +++++++++++++-----
 7 files changed, 313 insertions(+), 117 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index fa37d0c75..023696f3c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -211,3 +211,6 @@ steps:
   - pytest -v -s distributed/test_custom_all_reduce.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.5/flashinfer-0.0.5+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
\ No newline at end of file
diff --git a/requirements-test.txt b/requirements-test.txt
index 8b68e0e93..3ebfc1654 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -19,4 +19,4 @@ sentence-transformers # required for embedding
 aiohttp
 
 # quantization
-bitsandbytes==0.42.0
+bitsandbytes==0.42.0
\ No newline at end of file
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 805b8883b..6f44030fe 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -2,7 +2,6 @@
 
 Run `pytest tests/basic_correctness/test_basic_correctness.py`.
 """
-import os
 import weakref
 
 import pytest
@@ -13,7 +12,6 @@ MODELS = [
     "facebook/opt-125m",
     "meta-llama/Llama-2-7b-hf",
 ]
-VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"
 
 
 def test_vllm_gc_ed():
@@ -39,10 +37,6 @@ def test_models(
     max_tokens: int,
     enforce_eager: bool,
 ) -> None:
-    backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
-    if backend_by_env_var == "FLASHINFER" and enforce_eager is False:
-        pytest.skip("Skipping non-eager test for FlashInferBackend.")
-
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py
index eb423aef2..b8ae5b4c4 100644
--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
@@ -21,7 +21,6 @@ MODELS = [
     os.environ["TEST_DIST_MODEL"],
 ]
 DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND"
-VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
@@ -39,16 +38,12 @@ def test_models(
 ) -> None:
     distributed_executor_backend = os.getenv(DISTRIBUTED_EXECUTOR_BACKEND)
 
-    backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
-    enforce_eager = backend_by_env_var == "FLASHINFER"
-
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     with vllm_runner(model,
                      dtype=dtype,
                      tensor_parallel_size=2,
-                     enforce_eager=enforce_eager,
                      distributed_executor_backend=distributed_executor_backend
                      ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 535d30b55..4ecac7379 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -1,10 +1,16 @@
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Set, Tuple, Type
 
-import flashinfer
+try:
+    from flashinfer import BatchDecodeWithPagedKVCacheWrapper
+    from flashinfer.prefill import BatchPrefillWithPagedKVCacheWrapper
+    from vllm_flash_attn import flash_attn_varlen_func
+except ImportError:
+    flash_attn_varlen_func = None
+    BatchDecodeWithPagedKVCacheWrapper = None
+    BatchPrefillWithPagedKVCacheWrapper = None
+
 import torch
-from flashinfer import BatchDecodeWithPagedKVCacheWrapper
-from vllm_flash_attn import flash_attn_varlen_func
 
 from vllm import _custom_ops as ops
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
@@ -60,19 +66,16 @@ class FlashInferMetadata(AttentionMetadata):
     # requests only.
     max_prefill_seq_len: int
 
-    use_cuda_graph: bool = False
+    use_cuda_graph: bool = True
 
+    prefill_wrapper: Optional[BatchPrefillWithPagedKVCacheWrapper] = None
     decode_wrapper: Optional[BatchDecodeWithPagedKVCacheWrapper] = None
 
-    # Metadata for the prefill stage since we still
-    # use flash attention for prefill.
+    # Metadata for the prefill stage
     seq_start_loc: Optional[torch.Tensor] = None
+    query_start_loc: Optional[torch.Tensor] = None
     block_tables: Optional[torch.Tensor] = None
 
-    # Metadata for the decode stage
-    # Workspace buffer required by the kernel, the buffer should not
-    # be allocated/deacollated by the FalshInfermetadata object.
-    workspace_buffer: Optional[torch.Tensor] = None
     # An example for paged_kv_indices, paged_kv_indptr:
     # request 1, page indices [0, 5, 8]
     # request 2, page indices [1, 6, 7]
@@ -98,6 +101,7 @@ class FlashInferMetadata(AttentionMetadata):
     page_size: Optional[int] = None
     # The data type of the paged kv cache
     data_type: torch.dtype = None
+    device: torch.device = torch.device("cuda")
 
     def __post_init__(self):
         # Refer to
@@ -109,13 +113,35 @@ class FlashInferMetadata(AttentionMetadata):
                 f"Only {supported_head_sizes} are supported for head_dim,",
                 f"received {self.head_dim}.")
 
-        # When using flashinfer, we are also creating the FlashInferMetadata,
-        # which will also call post_init by default, here we want to skip the
-        # post_init if it's the prefill phase.
-        if self.num_prefills == 0:
-            assert self.num_decode_tokens > 0
-            self.decode_wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
-                self.workspace_buffer, "NHD")
+    def begin_forward(self):
+        if self.num_prefill_tokens > 0:
+            if self.paged_kv_indices is None:
+                return
+
+            assert self.prefill_wrapper is not None
+            assert self.paged_kv_indices is not None
+            assert self.paged_kv_indptr is not None
+            assert self.paged_kv_last_page_len is not None
+            self.paged_kv_indices = self.paged_kv_indices.to(self.device)
+            self.paged_kv_indptr = self.paged_kv_indptr.to(self.device)
+            self.paged_kv_last_page_len = self.paged_kv_last_page_len.to(
+                self.device)
+            self.prefill_wrapper.begin_forward(
+                self.query_start_loc, self.paged_kv_indptr,
+                self.paged_kv_indices, self.paged_kv_last_page_len,
+                self.num_qo_heads, self.num_kv_heads, self.head_dim,
+                self.page_size)
+        else:
+            if not self.use_cuda_graph:
+                assert self.paged_kv_indices is not None
+                assert self.paged_kv_indptr is not None
+                assert self.paged_kv_last_page_len is not None
+                self.paged_kv_indices = self.paged_kv_indices.to(self.device)
+                self.paged_kv_indptr = self.paged_kv_indptr.to(self.device)
+                self.paged_kv_last_page_len = self.paged_kv_last_page_len.to(
+                    self.device)
+
+            assert self.decode_wrapper is not None
             self.decode_wrapper.begin_forward(
                 self.paged_kv_indptr,
                 self.paged_kv_indices,
@@ -133,8 +159,9 @@ class FlashInferMetadata(AttentionMetadata):
                         ) -> Dict[str, Any]:
         if skip_fields is None:
             skip_fields = set()
-        # We need to skip the decode_wrapper field since it cannot be
+        # We need to skip the prefill/decode_wrapper field since it cannot be
         # broadcasted with nccl when TP is enabled.
+        skip_fields.add('prefill_wrapper')
         skip_fields.add('decode_wrapper')
         return super().asdict_zerocopy(skip_fields)
 
@@ -168,6 +195,7 @@ class FlashInferImpl(AttentionImpl):
         alibi_slopes: Optional[List[float]],
         sliding_window: Optional[int],
         kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -217,10 +245,14 @@ class FlashInferImpl(AttentionImpl):
                 self.kv_cache_dtype,
             )
 
+        query = query.contiguous(
+        )  # Flashinfer requires query to be contiguous
         if prefill_meta := attn_metadata.prefill_metadata:
-            # Prompt run.
-            assert prefill_meta.block_tables is not None
-            if kv_cache is None or prefill_meta.block_tables.numel() == 0:
+            # We will use flash attention for prefill
+            # when kv_cache is not provided.
+            # This happens when vllm runs the profiling to
+            # determine the number of blocks.
+            if kv_cache is None:
                 output = flash_attn_varlen_func(
                     q=query,
                     k=key,
@@ -235,13 +267,14 @@ class FlashInferImpl(AttentionImpl):
                     alibi_slopes=self.alibi_slopes,
                 )
             else:
-                raise NotImplementedError(
-                    "Prefix caching is not supported with flashinfer yet.")
+                assert prefill_meta is not None
+                assert prefill_meta.prefill_wrapper is not None
+                output = prefill_meta.prefill_wrapper.forward(query,
+                                                              kv_cache,
+                                                              causal=True)
         else:
             assert attn_metadata.decode_metadata is not None
             assert attn_metadata.decode_metadata.decode_wrapper is not None
-            query = query.contiguous(
-            )  # Flashinfer requires query to be contiguous
             output = attn_metadata.decode_metadata.decode_wrapper.forward(
                 query,
                 kv_cache,
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 96f88bbf4..851bf52a5 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -77,8 +77,9 @@ def get_attn_backend(
         return IpexAttnBackend
     elif backend == _Backend.FLASHINFER:
         logger.info("Using Flashinfer backend.")
-        logger.warning("Eager mode is required for the Flashinfer backend. "
-                       "Please make sure --enforce-eager is set.")
+        logger.warning(("Flashinfer will be stuck on llma-2-7b,"
+                        " please avoid using Flashinfer as the"
+                        "backend when running on llma-2-7b."))
         from vllm.attention.backends.flashinfer import FlashInferBackend
         return FlashInferBackend
     elif backend == _Backend.PALLAS:
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 082166030..942063677 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -10,6 +10,17 @@ import numpy as np
 import torch
 import torch.nn as nn
 
+try:
+    from flashinfer import BatchDecodeWithPagedKVCacheWrapper
+    from flashinfer.decode import CUDAGraphBatchDecodeWithPagedKVCacheWrapper
+    from flashinfer.prefill import BatchPrefillWithPagedKVCacheWrapper
+    FLASHINFER_WORKSPACE_BUFFER_SIZE = 128 * 1024 * 1024
+except ImportError:
+    BatchDecodeWithPagedKVCacheWrapper = None
+    CUDAGraphBatchDecodeWithPagedKVCacheWrapper = None
+    BatchPrefillWithPagedKVCacheWrapper = None
+    FLASHINFER_WORKSPACE_BUFFER_SIZE = 0
+
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, ParallelConfig, SchedulerConfig,
@@ -198,11 +209,14 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
 
         # Lazy initialization
         self.model: nn.Module  # Set after load_model
-        # Set if the backend is flashinfer.
-        self.flashinfer_workspace_buffer: torch.Tensor
         # Set after load_model.
         self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None
 
+        self.flashinfer_decode_workspace_buffer = None
+        self.flashinfer_decode_wrapper = None
+        self.flashinfer_prefill_workspace_buffer = None
+        self.flashinfer_prefill_wrapper = None
+
     def load_model(self) -> None:
         with CudaMemoryProfiler() as m:
             self.model = get_model(
@@ -450,15 +464,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                         if curr_sliding_window_blocks is not None:
                             block_table = block_table[
                                 -curr_sliding_window_blocks:]
-                        if self.attn_backend.get_name() == "flashinfer":
-                            paged_kv_indices.extend(block_table)
-                            paged_kv_indptr.append(paged_kv_indptr[-1] +
-                                                   len(block_table))
-                            last_page_len = seq_data.get_len(
-                            ) % self.block_size
-                            if last_page_len == 0:
-                                last_page_len = self.block_size
-                            paged_kv_last_page_len.append(last_page_len)
                     else:
                         # Only happens when memory profiling runs.
                         block_table = []
@@ -505,7 +510,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                     for k, v in mm_kwargs.items():
                         multi_modal_kwargs_list[k].append(v)
 
-                if _is_block_tables_empty(seq_group_metadata.block_tables):
+                is_profile_run = _is_block_tables_empty(
+                    seq_group_metadata.block_tables)
+                if is_profile_run:
                     # During memory profiling, the block tables are not
                     # initialized yet. In this case, we just use a dummy
                     # slot mapping.
@@ -544,6 +551,27 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                     slot = block_number * self.block_size + block_offset
                     slot_mapping.append(slot)
 
+                # Prepare input tensors for flashinfer
+                if self.attn_backend.get_name() == "flashinfer":
+                    seq_len = seq_data.get_len()
+                    # Get the number of valid blocks based on sequence length.
+                    # If seq_len = 16, block_size = 16,
+                    # block_table_bound is 1 with 1 valid block.
+                    # If seq_len = 15, block_size = 16,
+                    # block_table_bound is 0 + 1 with 1 valid block.
+                    block_table_bound = seq_len // self.block_size + 1 \
+                                        if seq_len % self.block_size != 0 \
+                                        else seq_len // self.block_size
+
+                    paged_kv_indices.extend(block_table[:block_table_bound])
+                    paged_kv_indptr.append(paged_kv_indptr[-1] +
+                                           block_table_bound)
+
+                    last_page_len = seq_len % self.block_size
+                    if last_page_len == 0:
+                        last_page_len = self.block_size
+                    paged_kv_last_page_len.append(last_page_len)
+
         batch_size = len(input_tokens)
         max_query_len = max(query_lens)
         max_prefill_seq_len = max(prefill_seq_lens, default=0)
@@ -566,6 +594,12 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                 seq_lens.append(1)
                 block_tables.append([])
                 lora_index_mapping.append(0)
+
+                if self.attn_backend.get_name() == "flashinfer":
+                    last_paged_kv_indptr = paged_kv_indptr[-1]
+                    paged_kv_indptr.append(last_paged_kv_indptr)
+                    paged_kv_last_page_len.append(0)
+
             batch_size = graph_batch_size
             num_decode_tokens = batch_size
 
@@ -589,9 +623,19 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
             )
         assert max_query_len > 0, ("query_lens: {}".format(query_lens))
 
+        context_lens_tensor = torch.tensor(context_lens,
+                                           dtype=torch.int,
+                                           device=self.device)
+
         seq_lens_tensor = torch.tensor(seq_lens,
                                        dtype=torch.int,
                                        device=self.device)
+        query_lens_tensor = torch.tensor(query_lens,
+                                         dtype=torch.long,
+                                         device=self.device)
+        query_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1,
+                                      dtype=torch.int32,
+                                      device=self.device)
         seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1,
                                     dtype=torch.int32,
                                     device=self.device)
@@ -600,6 +644,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                      dim=0,
                      dtype=seq_start_loc.dtype,
                      out=seq_start_loc[1:])
+        torch.cumsum(query_lens_tensor,
+                     dim=0,
+                     dtype=query_start_loc.dtype,
+                     out=query_start_loc[1:])
 
         input_tokens_tensor = torch.tensor(input_tokens,
                                            dtype=torch.long,
@@ -612,30 +660,30 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                                            device=self.device)
 
         if self.attn_backend.get_name() == "flashinfer":
-            if not hasattr(self, "flashinfer_workspace_buffer"):
-                # Allocate 16MB workspace buffer
-                # Follow the example of flashinfer: https://docs.flashinfer.ai/api/python/decode.html
-                self.flashinfer_workspace_buffer = torch.empty(
-                    16 * 1024 * 1024, dtype=torch.uint8, device=self.device)
-            paged_kv_indptr_tensor = torch.tensor(paged_kv_indptr,
-                                                  dtype=torch.int,
-                                                  device=self.device)
-            paged_kv_indices_tensor = torch.tensor(paged_kv_indices,
-                                                   dtype=torch.int,
-                                                   device=self.device)
-            paged_kv_last_page_len_tensor = torch.tensor(
-                paged_kv_last_page_len, dtype=torch.int, device=self.device)
+            if len(paged_kv_indptr) > 0:
+                paged_kv_indices_tensor = torch.tensor(paged_kv_indices,
+                                                       device='cpu',
+                                                       dtype=torch.int)
+                paged_kv_indptr_tensor = torch.tensor(paged_kv_indptr,
+                                                      device='cpu',
+                                                      dtype=torch.int)
+                paged_kv_last_page_len_tensor = torch.tensor(
+                    paged_kv_last_page_len, device='cpu', dtype=torch.int)
+            else:
+                paged_kv_indices_tensor = None
+                paged_kv_indptr_tensor = None
+                paged_kv_last_page_len_tensor = None
+
             kv_cache_dtype = get_kv_cache_torch_dtype(self.kv_cache_dtype,
                                                       self.model_config.dtype)
+
             attn_metadata = self.attn_backend.make_metadata(
                 num_prefills=num_prefills,
                 slot_mapping=slot_mapping_tensor,
                 num_prefill_tokens=num_prefill_tokens,
                 num_decode_tokens=num_decode_tokens,
-                use_cuda_graph=False,
                 max_prefill_seq_len=max_prefill_seq_len,
                 block_tables=block_tables,
-                workspace_buffer=self.flashinfer_workspace_buffer,
                 paged_kv_indptr=paged_kv_indptr_tensor,
                 paged_kv_indices=paged_kv_indices_tensor,
                 paged_kv_last_page_len=paged_kv_last_page_len_tensor,
@@ -644,25 +692,14 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                 num_kv_heads=self.model_config.get_num_kv_heads(
                     self.parallel_config),
                 head_dim=self.model_config.get_head_size(),
-                page_size=16,
+                page_size=self.block_size,
                 seq_start_loc=seq_start_loc,
-                data_type=kv_cache_dtype)
-        else:
-            context_lens_tensor = torch.tensor(context_lens,
-                                               dtype=torch.int,
-                                               device=self.device)
-            query_lens_tensor = torch.tensor(query_lens,
-                                             dtype=torch.long,
-                                             device=self.device)
-            query_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1,
-                                          dtype=torch.int32,
-                                          device=self.device)
-
-            torch.cumsum(query_lens_tensor,
-                         dim=0,
-                         dtype=query_start_loc.dtype,
-                         out=query_start_loc[1:])
+                query_start_loc=query_start_loc,
+                device=self.device,
+                data_type=kv_cache_dtype,
+                use_cuda_graph=use_captured_graph)
 
+        else:
             attn_metadata = self.attn_backend.make_metadata(
                 num_prefills=num_prefills,
                 slot_mapping=slot_mapping_tensor,
@@ -854,27 +891,97 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
             bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size
         ]
 
+        if self.attn_backend.get_name() == "flashinfer":
+            # For flashinfer, different batch sizes will share the
+            # same workspace buffer.
+            decode_workspace_buffer = \
+            torch.empty(FLASHINFER_WORKSPACE_BUFFER_SIZE,
+                                                dtype=torch.uint8,
+                                              device=self.device)
+            indices_buffer = torch.empty(max_batch_size *
+                                         self.cache_config.num_gpu_blocks,
+                                         dtype=torch.int32,
+                                         device=self.device)
+            indptr_buffer = torch.empty(max_batch_size + 1,
+                                        dtype=torch.int32,
+                                        device=self.device)
+            last_page_len_buffer = torch.empty(max_batch_size,
+                                               dtype=torch.int32,
+                                               device=self.device)
+
         with graph_capture() as graph_capture_context:
             # NOTE: Capturing the largest batch size first may help reduce the
             # memory usage of CUDA graph.
             for batch_size in reversed(batch_size_capture_list):
-                # Create dummy attn_metadata.
-                attn_metadata = self.attn_backend.make_metadata(
-                    num_prefills=0,
-                    num_prefill_tokens=0,
-                    num_decode_tokens=batch_size,
-                    slot_mapping=slot_mapping[:batch_size],
-                    seq_lens=None,
-                    seq_lens_tensor=seq_lens[:batch_size],
-                    max_query_len=None,
-                    max_prefill_seq_len=0,
-                    max_decode_seq_len=self.max_seq_len_to_capture,
-                    query_start_loc=None,
-                    seq_start_loc=None,
-                    context_lens_tensor=None,
-                    block_tables=block_tables[:batch_size],
-                    use_cuda_graph=True,
-                )
+                if self.attn_backend.get_name() == "flashinfer":
+                    indptr_buffer = indptr_buffer[:batch_size + 1]
+                    last_page_len_buffer = last_page_len_buffer[:batch_size]
+
+                    num_qo_heads = self.model_config.get_num_attention_heads(
+                        self.parallel_config)
+                    num_kv_heads = self.model_config.get_num_kv_heads(
+                        self.parallel_config)
+                    if num_qo_heads // num_kv_heads >= 4:
+                        use_tensor_cores = True
+                    else:
+                        use_tensor_cores = False
+                    decode_wrapper = \
+                        CUDAGraphBatchDecodeWithPagedKVCacheWrapper(
+                        decode_workspace_buffer, indptr_buffer, indices_buffer,
+                        last_page_len_buffer, "NHD", use_tensor_cores)
+                    kv_cache_dtype = get_kv_cache_torch_dtype(
+                        self.kv_cache_dtype, self.model_config.dtype)
+
+                    paged_kv_indptr_tensor_host = torch.arange(
+                        0, batch_size + 1, dtype=torch.int32)
+                    paged_kv_indices_tensor_host = torch.arange(
+                        0, batch_size, dtype=torch.int32)
+                    paged_kv_last_page_len_tensor_host = torch.full(
+                        (batch_size, ), self.block_size, dtype=torch.int32)
+                    query_start_loc_host = torch.arange(0,
+                                                        batch_size + 1,
+                                                        dtype=torch.int32)
+
+                    attn_metadata = self.attn_backend.make_metadata(
+                        num_prefills=0,
+                        slot_mapping=slot_mapping[:batch_size],
+                        num_prefill_tokens=0,
+                        num_decode_tokens=batch_size,
+                        max_prefill_seq_len=0,
+                        block_tables=block_tables,
+                        paged_kv_indptr=paged_kv_indptr_tensor_host,
+                        paged_kv_indices=paged_kv_indices_tensor_host,
+                        paged_kv_last_page_len=
+                        paged_kv_last_page_len_tensor_host,
+                        num_qo_heads=num_qo_heads,
+                        num_kv_heads=num_kv_heads,
+                        head_dim=self.model_config.get_head_size(),
+                        page_size=self.block_size,
+                        seq_start_loc=None,
+                        query_start_loc=query_start_loc_host,
+                        device=self.device,
+                        data_type=kv_cache_dtype,
+                        use_cuda_graph=True,
+                        decode_wrapper=decode_wrapper,
+                        prefill_wrapper=None)
+                    attn_metadata.begin_forward()
+                else:
+                    attn_metadata = self.attn_backend.make_metadata(
+                        num_prefills=0,
+                        num_prefill_tokens=0,
+                        num_decode_tokens=batch_size,
+                        slot_mapping=slot_mapping[:batch_size],
+                        seq_lens=None,
+                        seq_lens_tensor=seq_lens[:batch_size],
+                        max_query_len=None,
+                        max_prefill_seq_len=0,
+                        max_decode_seq_len=self.max_seq_len_to_capture,
+                        query_start_loc=None,
+                        seq_start_loc=None,
+                        context_lens_tensor=None,
+                        block_tables=block_tables[:batch_size],
+                        use_cuda_graph=True,
+                    )
 
                 if self.lora_config:
                     lora_mapping = LoRAMapping(
@@ -883,8 +990,20 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                     )
                     self.set_active_loras(set(), lora_mapping)
 
-                graph_runner = CUDAGraphRunner(self.model)
-                hidden_states = graph_runner.capture(
+                graph_runner = CUDAGraphRunner(self.model,
+                                               self.attn_backend.get_name())
+
+                if self.attn_backend.get_name() == "flashinfer":
+                    graph_runner.flashinfer_indptr_buffer = indptr_buffer
+                    graph_runner.flashinfer_indices_buffer = indices_buffer
+                    graph_runner.flashinfer_last_page_len_buffer = \
+                        last_page_len_buffer
+                    graph_runner.flashinfer_decode_workspace_buffer = \
+                            decode_workspace_buffer
+                    graph_runner.flashinfer_decode_wrapper = \
+                        decode_wrapper
+
+                graph_runner.capture(
                     input_tokens[:batch_size],
                     input_positions[:batch_size],
                     hidden_states[:batch_size]
@@ -918,11 +1037,12 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
         self,
         tensor_dict: Dict[str, Any],
     ) -> ModelInputForGPUWithSamplingMetadata:
-        return (
+        model_input = \
             ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict(
                 tensor_dict,
                 attn_backend=self.attn_backend,
-            ))
+            )
+        return model_input
 
     def prepare_model_input(
         self,
@@ -970,6 +1090,36 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
             self.set_active_loras(model_input.lora_requests,
                                   model_input.lora_mapping)
 
+        if self.attn_backend.get_name() == "flashinfer":
+            assert model_input.attn_metadata is not None
+            assert model_input.input_tokens is not None
+            if self.flashinfer_decode_workspace_buffer is None:
+                self.flashinfer_decode_workspace_buffer = torch.empty(
+                    FLASHINFER_WORKSPACE_BUFFER_SIZE,
+                    dtype=torch.uint8,
+                    device=self.device)
+                self.flashinfer_decode_wrapper = \
+                    BatchDecodeWithPagedKVCacheWrapper(
+                    self.flashinfer_decode_workspace_buffer, "NHD")
+                self.flashinfer_prefill_workspace_buffer = torch.empty(
+                    FLASHINFER_WORKSPACE_BUFFER_SIZE,
+                    dtype=torch.uint8,
+                    device=self.device)
+                self.flashinfer_prefill_wrapper = \
+                    BatchPrefillWithPagedKVCacheWrapper(
+                    self.flashinfer_prefill_workspace_buffer, "NHD")
+
+            model_input.attn_metadata.prefill_wrapper = \
+                self.flashinfer_prefill_wrapper
+            if model_input.attn_metadata.use_cuda_graph:
+                batch_size = model_input.input_tokens.shape[0]
+                model_input.attn_metadata.decode_wrapper = self.graph_runners[
+                    batch_size].flashinfer_decode_wrapper
+            else:
+                model_input.attn_metadata.decode_wrapper = \
+                    self.flashinfer_decode_wrapper
+            model_input.attn_metadata.begin_forward()
+
         # Currently cuda graph is only supported by the decode phase.
         assert model_input.attn_metadata is not None
         prefill_meta = model_input.attn_metadata.prefill_metadata
@@ -1020,13 +1170,22 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
 
 class CUDAGraphRunner:
 
-    def __init__(self, model: nn.Module):
+    def __init__(self, model: nn.Module, backend_name: str):
         self.model = model
+        self.backend_name = backend_name
+
         self.input_buffers: Dict[str, torch.Tensor] = {}
         self.output_buffers: Dict[str, torch.Tensor] = {}
 
         self._graph: Optional[torch.cuda.CUDAGraph] = None
 
+        self.flashinfer_decode_workspace_buffer: Optional[torch.Tensor] = None
+        self.flashinfer_indptr_buffer: Optional[torch.Tensor] = None
+        self.flashinfer_indices_buffer: Optional[torch.Tensor] = None
+        self.flashinfer_last_page_len_buffer: Optional[torch.Tensor] = None
+        self.flashinfer_decode_wrapper: Optional[
+            CUDAGraphBatchDecodeWithPagedKVCacheWrapper] = None
+
     @property
     def graph(self):
         assert self._graph is not None
@@ -1079,14 +1238,23 @@ class CUDAGraphRunner:
         torch.cuda.synchronize()
 
         # Save the input and output buffers.
-        self.input_buffers = {
-            "input_ids": input_ids,
-            "positions": positions,
-            "kv_caches": kv_caches,
-            "slot_mapping": attn_metadata.slot_mapping,
-            "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor,
-            "block_tables": attn_metadata.decode_metadata.block_tables,
-        }
+        if self.backend_name == "flashinfer":
+            self.input_buffers = {
+                "input_ids": input_ids,
+                "positions": positions,
+                "kv_caches": kv_caches,
+                "slot_mapping": attn_metadata.slot_mapping,
+            }
+        else:
+            self.input_buffers = {
+                "input_ids": input_ids,
+                "positions": positions,
+                "kv_caches": kv_caches,
+                "slot_mapping": attn_metadata.slot_mapping,
+                "seq_lens_tensor":
+                attn_metadata.decode_metadata.seq_lens_tensor,
+                "block_tables": attn_metadata.decode_metadata.block_tables,
+            }
         self.output_buffers = {"hidden_states": hidden_states}
         return hidden_states
 
@@ -1106,10 +1274,12 @@ class CUDAGraphRunner:
         self.input_buffers["positions"].copy_(positions, non_blocking=True)
         self.input_buffers["slot_mapping"].copy_(attn_metadata.slot_mapping,
                                                  non_blocking=True)
-        self.input_buffers["seq_lens_tensor"].copy_(
-            attn_metadata.decode_metadata.seq_lens_tensor, non_blocking=True)
-        self.input_buffers["block_tables"].copy_(
-            attn_metadata.decode_metadata.block_tables, non_blocking=True)
+        if self.backend_name != "flashinfer":
+            self.input_buffers["seq_lens_tensor"].copy_(
+                attn_metadata.decode_metadata.seq_lens_tensor,
+                non_blocking=True)
+            self.input_buffers["block_tables"].copy_(
+                attn_metadata.decode_metadata.block_tables, non_blocking=True)
         # Run the graph.
         self.graph.replay()
 
-- 
GitLab


From 54814fd85b5182fc140febfebbb2560420d2ed2a Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Fri, 28 Jun 2024 18:14:16 -0700
Subject: [PATCH 197/376] [Bugfix][TPU] Fix TPU sampler output (#5978)

---
 vllm/worker/tpu_worker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index 37d810e83..d58c7dc99 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -215,7 +215,7 @@ class TPUWorker(LoraNotSupportedWorkerBase):
         assert len(seq_group_metadata_list) > 0
         output = self.model_runner.execute_model(seq_group_metadata_list,
                                                  self.tpu_cache)
-        return [output]
+        return output
 
     def cache_swap(
         self,
-- 
GitLab


From 7f83f40dee2e92ff005d44a262a3cf42c87c2082 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Fri, 28 Jun 2024 18:55:17 -0700
Subject: [PATCH 198/376] [Bugfix][TPU] Fix pad slot id (#5977)

---
 vllm/worker/tpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index 7827f7c74..dd08536ef 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -19,7 +19,7 @@ from vllm.utils import make_tensor_with_pad
 
 logger = init_logger(__name__)
 
-_PAD_SLOT_ID = 0  # FIXME(woosuk)
+_PAD_SLOT_ID = -1  # NOTE(woosuk): In PyTorch XLA, index -1 is ignored.
 # FIXME(woosuk): Temporarily disabled top-p sampling since it's too slow.
 _ENABLE_TOP_P = False
 # FIXME(woosuk): A temporary hack to support `n > 1`.
-- 
GitLab


From c4bca740e8498987184466d2f85ed43f1e1feb80 Mon Sep 17 00:00:00 2001
From: mcalman <68564154+mcalman@users.noreply.github.com>
Date: Fri, 28 Jun 2024 22:34:42 -0400
Subject: [PATCH 199/376] [Bugfix] fix missing last itl in openai completions
 benchmark (#5926)

---
 benchmarks/backend_request_func.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 4350b96b0..5b5067090 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -265,6 +265,9 @@ async def async_request_openai_completions(
                         else:
                             data = json.loads(chunk)
 
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
                             if data["choices"][0]["text"]:
                                 timestamp = time.perf_counter()
                                 # First token
@@ -273,12 +276,8 @@ async def async_request_openai_completions(
                                     output.ttft = ttft
 
                                 # Decoding phase
-                                # NOTE: Some completion API might have a last
-                                # usage summary response without a token so we
-                                # do not want to include as inter-token-latency
-                                elif data.get("usage", None) is None:
-                                    output.itl.append(timestamp -
-                                                      most_recent_timestamp)
+                                output.itl.append(timestamp -
+                                                  most_recent_timestamp)
 
                                 most_recent_timestamp = timestamp
                                 generated_text += data["choices"][0]["text"]
-- 
GitLab


From 906a19cdb06b390f3dde287b06a3fe26c03a45e5 Mon Sep 17 00:00:00 2001
From: William Lin <SolitaryThinker@users.noreply.github.com>
Date: Fri, 28 Jun 2024 19:36:06 -0700
Subject: [PATCH 200/376] [Misc] Extend vLLM Metrics logging API (#5925)

Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
---
 tests/metrics/test_metrics.py |  12 +-
 vllm/engine/llm_engine.py     |  38 ++++-
 vllm/engine/metrics.py        | 293 ++++++++++++++++++++++------------
 3 files changed, 225 insertions(+), 118 deletions(-)

diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
index c1164739e..0191d8519 100644
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -39,7 +39,7 @@ def test_metric_counter_prompt_tokens(
         vllm_prompt_token_count = sum(prompt_token_counts)
 
         _ = vllm_model.generate_greedy(example_prompts, max_tokens)
-        stat_logger = vllm_model.model.llm_engine.stat_logger
+        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
         metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
             **stat_logger.labels)._value.get()
 
@@ -64,7 +64,7 @@ def test_metric_counter_generation_tokens(
                      gpu_memory_utilization=0.4) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
         tokenizer = vllm_model.model.get_tokenizer()
-        stat_logger = vllm_model.model.llm_engine.stat_logger
+        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
         metric_count = stat_logger.metrics.counter_generation_tokens.labels(
             **stat_logger.labels)._value.get()
         vllm_generation_count = 0
@@ -92,7 +92,7 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
                      disable_log_stats=False,
                      gpu_memory_utilization=0.3,
                      served_model_name=served_model_name) as vllm_model:
-        stat_logger = vllm_model.model.llm_engine.stat_logger
+        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
         metrics_tag_content = stat_logger.labels["model_name"]
 
     if served_model_name is None or served_model_name == []:
@@ -172,10 +172,10 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
                    num_requests: int) -> None:
     if disable_log_stats:
         with pytest.raises(AttributeError):
-            _ = engine.stat_logger
+            _ = engine.stat_loggers
     else:
-        assert (engine.stat_logger
-                is not None), "engine.stat_logger should be set"
+        assert (engine.stat_loggers
+                is not None), "engine.stat_loggers should be set"
         # Ensure the count bucket of request-level histogram metrics matches
         # the number of requests as a simple sanity check to ensure metrics are
         # generated
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index fde18f60e..808a639f5 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -13,7 +13,8 @@ from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig,
 from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler,
                                  SchedulerOutputs)
 from vllm.engine.arg_utils import EngineArgs
-from vllm.engine.metrics import StatLogger, Stats
+from vllm.engine.metrics import (LoggingStatLogger, PrometheusStatLogger,
+                                 StatLoggerBase, Stats)
 from vllm.engine.output_processor.interfaces import (
     SequenceGroupOutputProcessor)
 from vllm.engine.output_processor.stop_checker import StopChecker
@@ -160,6 +161,7 @@ class LLMEngine:
         executor_class: Type[ExecutorBase],
         log_stats: bool,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
     ) -> None:
         logger.info(
             "Initializing an LLM engine (v%s) with config: "
@@ -292,11 +294,21 @@ class LLMEngine:
 
         # Metric Logging.
         if self.log_stats:
-            self.stat_logger = StatLogger(
-                local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
-                labels=dict(model_name=model_config.served_model_name),
-                max_model_len=self.model_config.max_model_len)
-            self.stat_logger.info("cache_config", self.cache_config)
+            if stat_loggers is not None:
+                self.stat_loggers = stat_loggers
+            else:
+                self.stat_loggers = {
+                    "logging":
+                    LoggingStatLogger(
+                        local_interval=_LOCAL_LOGGING_INTERVAL_SEC),
+                    "prometheus":
+                    PrometheusStatLogger(
+                        local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
+                        labels=dict(model_name=model_config.served_model_name),
+                        max_model_len=self.model_config.max_model_len),
+                }
+                self.stat_loggers["prometheus"].info("cache_config",
+                                                     self.cache_config)
 
         self.tracer = None
         if self.observability_config.otlp_traces_endpoint:
@@ -833,14 +845,24 @@ class LLMEngine:
 
         return request_outputs
 
+    def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None:
+        if logger_name in self.stat_loggers:
+            raise KeyError(f"Logger with name {logger_name} already exists.")
+        self.stat_loggers[logger_name] = logger
+
+    def remove_logger(self, logger_name: str) -> None:
+        if logger_name not in self.stat_loggers:
+            raise KeyError(f"Logger with name {logger_name} does not exist.")
+        del self.stat_loggers[logger_name]
+
     def do_log_stats(
             self,
             scheduler_outputs: Optional[SchedulerOutputs] = None,
             model_output: Optional[List[SamplerOutput]] = None) -> None:
         """Forced log when no requests active."""
         if self.log_stats:
-            self.stat_logger.log(
-                self._get_stats(scheduler_outputs, model_output))
+            for logger in self.stat_loggers.values():
+                logger.log(self._get_stats(scheduler_outputs, model_output))
 
     def _get_stats(
             self,
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 027f5c7e7..2c1210c90 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -1,21 +1,27 @@
 import time
+from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 from typing import Counter as CollectionsCounter
 from typing import Dict, List, Optional, Protocol, Union
 
 import numpy as np
-from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info,
-                               disable_created_metrics)
+import prometheus_client
 
+from vllm.executor.ray_utils import ray
 from vllm.logger import init_logger
 
+if ray is not None:
+    from ray.util import metrics as ray_metrics
+else:
+    ray_metrics = None
+
 if TYPE_CHECKING:
     from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
 
 logger = init_logger(__name__)
 
-disable_created_metrics()
+prometheus_client.disable_created_metrics()
 
 # The begin-* and end* here are used by the documentation generator
 # to extract the metrics definitions.
@@ -24,56 +30,55 @@ disable_created_metrics()
 # begin-metrics-definitions
 class Metrics:
     labelname_finish_reason = "finished_reason"
+    _base_library = prometheus_client
 
     def __init__(self, labelnames: List[str], max_model_len: int):
         # Unregister any existing vLLM collectors
-        for collector in list(REGISTRY._collector_to_names):
-            if hasattr(collector, "_name") and "vllm" in collector._name:
-                REGISTRY.unregister(collector)
+        self._unregister_vllm_metrics()
 
         # Config Information
-        self.info_cache_config = Info(
+        self.info_cache_config = prometheus_client.Info(
             name='vllm:cache_config',
             documentation='information of cache_config')
 
         # System stats
         #   Scheduler State
-        self.gauge_scheduler_running = Gauge(
+        self.gauge_scheduler_running = self._base_library.Gauge(
             name="vllm:num_requests_running",
             documentation="Number of requests currently running on GPU.",
             labelnames=labelnames)
-        self.gauge_scheduler_waiting = Gauge(
+        self.gauge_scheduler_waiting = self._base_library.Gauge(
             name="vllm:num_requests_waiting",
             documentation="Number of requests waiting to be processed.",
             labelnames=labelnames)
-        self.gauge_scheduler_swapped = Gauge(
+        self.gauge_scheduler_swapped = self._base_library.Gauge(
             name="vllm:num_requests_swapped",
             documentation="Number of requests swapped to CPU.",
             labelnames=labelnames)
         #   KV Cache Usage in %
-        self.gauge_gpu_cache_usage = Gauge(
+        self.gauge_gpu_cache_usage = self._base_library.Gauge(
             name="vllm:gpu_cache_usage_perc",
             documentation="GPU KV-cache usage. 1 means 100 percent usage.",
             labelnames=labelnames)
-        self.gauge_cpu_cache_usage = Gauge(
+        self.gauge_cpu_cache_usage = self._base_library.Gauge(
             name="vllm:cpu_cache_usage_perc",
             documentation="CPU KV-cache usage. 1 means 100 percent usage.",
             labelnames=labelnames)
 
         # Iteration stats
-        self.counter_num_preemption = Counter(
+        self.counter_num_preemption = self._base_library.Counter(
             name="vllm:num_preemptions_total",
             documentation="Cumulative number of preemption from the engine.",
             labelnames=labelnames)
-        self.counter_prompt_tokens = Counter(
+        self.counter_prompt_tokens = self._base_library.Counter(
             name="vllm:prompt_tokens_total",
             documentation="Number of prefill tokens processed.",
             labelnames=labelnames)
-        self.counter_generation_tokens = Counter(
+        self.counter_generation_tokens = self._base_library.Counter(
             name="vllm:generation_tokens_total",
             documentation="Number of generation tokens processed.",
             labelnames=labelnames)
-        self.histogram_time_to_first_token = Histogram(
+        self.histogram_time_to_first_token = self._base_library.Histogram(
             name="vllm:time_to_first_token_seconds",
             documentation="Histogram of time to first token in seconds.",
             labelnames=labelnames,
@@ -81,7 +86,7 @@ class Metrics:
                 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
                 0.75, 1.0, 2.5, 5.0, 7.5, 10.0
             ])
-        self.histogram_time_per_output_token = Histogram(
+        self.histogram_time_per_output_token = self._base_library.Histogram(
             name="vllm:time_per_output_token_seconds",
             documentation="Histogram of time per output token in seconds.",
             labelnames=labelnames,
@@ -92,54 +97,77 @@ class Metrics:
 
         # Request stats
         #   Latency
-        self.histogram_e2e_time_request = Histogram(
+        self.histogram_e2e_time_request = self._base_library.Histogram(
             name="vllm:e2e_request_latency_seconds",
             documentation="Histogram of end to end request latency in seconds.",
             labelnames=labelnames,
             buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
         #   Metadata
-        self.histogram_num_prompt_tokens_request = Histogram(
+        self.histogram_num_prompt_tokens_request = self._base_library.Histogram(
             name="vllm:request_prompt_tokens",
             documentation="Number of prefill tokens processed.",
             labelnames=labelnames,
             buckets=build_1_2_5_buckets(max_model_len),
         )
-        self.histogram_num_generation_tokens_request = Histogram(
-            name="vllm:request_generation_tokens",
-            documentation="Number of generation tokens processed.",
-            labelnames=labelnames,
-            buckets=build_1_2_5_buckets(max_model_len),
-        )
-        self.histogram_best_of_request = Histogram(
+        self.histogram_num_generation_tokens_request = \
+            self._base_library.Histogram(
+                name="vllm:request_generation_tokens",
+                documentation="Number of generation tokens processed.",
+                labelnames=labelnames,
+                buckets=build_1_2_5_buckets(max_model_len),
+            )
+        self.histogram_best_of_request = self._base_library.Histogram(
             name="vllm:request_params_best_of",
             documentation="Histogram of the best_of request parameter.",
             labelnames=labelnames,
             buckets=[1, 2, 5, 10, 20],
         )
-        self.histogram_n_request = Histogram(
+        self.histogram_n_request = self._base_library.Histogram(
             name="vllm:request_params_n",
             documentation="Histogram of the n request parameter.",
             labelnames=labelnames,
             buckets=[1, 2, 5, 10, 20],
         )
-        self.counter_request_success = Counter(
+        self.counter_request_success = self._base_library.Counter(
             name="vllm:request_success_total",
             documentation="Count of successfully processed requests.",
             labelnames=labelnames + [Metrics.labelname_finish_reason])
 
         # Deprecated in favor of vllm:prompt_tokens_total
-        self.gauge_avg_prompt_throughput = Gauge(
+        self.gauge_avg_prompt_throughput = self._base_library.Gauge(
             name="vllm:avg_prompt_throughput_toks_per_s",
             documentation="Average prefill throughput in tokens/s.",
             labelnames=labelnames,
         )
         # Deprecated in favor of vllm:generation_tokens_total
-        self.gauge_avg_generation_throughput = Gauge(
+        self.gauge_avg_generation_throughput = self._base_library.Gauge(
             name="vllm:avg_generation_throughput_toks_per_s",
             documentation="Average generation throughput in tokens/s.",
             labelnames=labelnames,
         )
 
+    def _unregister_vllm_metrics(self) -> None:
+        for collector in list(self._base_library.REGISTRY._collector_to_names):
+            if hasattr(collector, "_name") and "vllm" in collector._name:
+                self._base_library.REGISTRY.unregister(collector)
+
+
+class RayMetrics(Metrics):
+    """
+    RayMetrics is used by RayPrometheusStatLogger to log to Ray metrics.
+    Provides the same metrics as Metrics but uses Ray's util.metrics library.
+    """
+    _base_library = ray_metrics
+
+    def __init__(self, labelnames: List[str], max_model_len: int):
+        if ray_metrics is None:
+            raise ImportError("RayMetrics requires Ray to be installed.")
+        super().__init__(labelnames, max_model_len)
+
+    def _unregister_vllm_metrics(self) -> None:
+        # No-op on purpose
+        pass
+
 
 # end-metrics-definitions
 
@@ -206,34 +234,136 @@ class SupportsMetricsInfo(Protocol):
         ...
 
 
-class StatLogger:
-    """StatLogger is used LLMEngine to log to Promethus and Stdout."""
+def local_interval_elapsed(now: float, last_log: float,
+                           local_interval: float) -> bool:
+    elapsed_time = now - last_log
+    return elapsed_time > local_interval
+
+
+def get_throughput(tracked_stats: List[int], now: float,
+                   last_log: float) -> float:
+    return float(np.sum(tracked_stats) / (now - last_log))
 
-    def __init__(self, local_interval: float, labels: Dict[str, str],
-                 max_model_len: int) -> None:
-        # Metadata for logging locally.
-        self.last_local_log = time.time()
-        self.local_interval = local_interval
 
+class StatLoggerBase(ABC):
+    """Base class for StatLogger."""
+
+    def __init__(self, local_interval: float) -> None:
         # Tracked stats over current local logging interval.
         self.num_prompt_tokens: List[int] = []
         self.num_generation_tokens: List[int] = []
+        self.last_local_log = time.time()
+        self.local_interval = local_interval
+
+    @abstractmethod
+    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
+        raise NotImplementedError
+
+    @abstractmethod
+    def log(self, stats: Stats) -> None:
+        raise NotImplementedError
 
+
+class LoggingStatLogger(StatLoggerBase):
+    """LoggingStatLogger is used in LLMEngine to log to Stdout."""
+
+    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
+        raise NotImplementedError
+
+    def log(self, stats: Stats) -> None:
+        """Called by LLMEngine.
+           Logs to Stdout every self.local_interval seconds."""
+
+        # Save tracked stats for token counters.
+        self.num_prompt_tokens.append(stats.num_prompt_tokens_iter)
+        self.num_generation_tokens.append(stats.num_generation_tokens_iter)
+
+        # Log locally every local_interval seconds.
+        if local_interval_elapsed(stats.now, self.last_local_log,
+                                  self.local_interval):
+            # Compute summary metrics for tracked stats (and log them
+            # to promethus if applicable).
+            prompt_throughput = get_throughput(self.num_prompt_tokens,
+                                               now=stats.now,
+                                               last_log=self.last_local_log)
+            generation_throughput = get_throughput(
+                self.num_generation_tokens,
+                now=stats.now,
+                last_log=self.last_local_log)
+
+            # Log to stdout.
+            logger.info(
+                "Avg prompt throughput: %.1f tokens/s, "
+                "Avg generation throughput: %.1f tokens/s, "
+                "Running: %d reqs, Swapped: %d reqs, "
+                "Pending: %d reqs, GPU KV cache usage: %.1f%%, "
+                "CPU KV cache usage: %.1f%%.",
+                prompt_throughput,
+                generation_throughput,
+                stats.num_running_sys,
+                stats.num_swapped_sys,
+                stats.num_waiting_sys,
+                stats.gpu_cache_usage_sys * 100,
+                stats.cpu_cache_usage_sys * 100,
+            )
+
+            # Reset tracked stats for next interval.
+            self.num_prompt_tokens = []
+            self.num_generation_tokens = []
+            self.last_local_log = stats.now
+
+            if stats.spec_decode_metrics is not None:
+                logger.info(
+                    self._format_spec_decode_metrics_str(
+                        stats.spec_decode_metrics))
+
+    def _format_spec_decode_metrics_str(
+            self, metrics: "SpecDecodeWorkerMetrics") -> str:
+
+        return ("Speculative metrics: "
+                f"Draft acceptance rate: {metrics.draft_acceptance_rate:.3f}, "
+                f"System efficiency: {metrics.system_efficiency:.3f}, "
+                f"Number of speculative tokens: {metrics.num_spec_tokens}, "
+                f"Number of accepted tokens: {metrics.accepted_tokens}, "
+                f"Number of draft tokens tokens: {metrics.draft_tokens}, "
+                f"Number of emitted tokens tokens: {metrics.emitted_tokens}.")
+
+
+class PrometheusStatLogger(StatLoggerBase):
+    """PrometheusStatLogger is used LLMEngine to log to Promethus."""
+    _metrics_cls = Metrics
+
+    def __init__(self, local_interval: float, labels: Dict[str, str],
+                 max_model_len: int) -> None:
+        super().__init__(local_interval)
         # Prometheus metrics
         self.labels = labels
-        self.metrics = Metrics(labelnames=list(labels.keys()),
-                               max_model_len=max_model_len)
+        self.metrics = self._metrics_cls(labelnames=list(labels.keys()),
+                                         max_model_len=max_model_len)
 
     def info(self, type: str, obj: SupportsMetricsInfo) -> None:
         if type == "cache_config":
             self.metrics.info_cache_config.info(obj.metrics_info())
 
-    def _get_throughput(self, tracked_stats: List[int], now: float) -> float:
-        return float(np.sum(tracked_stats) / (now - self.last_local_log))
+    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
+        # Convenience function for logging to gauge.
+        gauge.labels(**self.labels).set(data)
 
-    def _local_interval_elapsed(self, now: float) -> bool:
-        elapsed_time = now - self.last_local_log
-        return elapsed_time > self.local_interval
+    def _log_counter(self, counter, data: Union[int, float]) -> None:
+        # Convenience function for logging to counter.
+        counter.labels(**self.labels).inc(data)
+
+    def _log_counter_labels(self, counter, data: CollectionsCounter,
+                            label_key: str) -> None:
+        # Convenience function for collection counter of labels.
+        for label, count in data.items():
+            counter.labels(**{**self.labels, label_key: label}).inc(count)
+
+    def _log_histogram(self, histogram, data: Union[List[int],
+                                                    List[float]]) -> None:
+        # Convenience function for logging list to histogram.
+        for datum in data:
+            histogram.labels(**self.labels).observe(datum)
 
     def _log_prometheus(self, stats: Stats) -> None:
         # System state data
@@ -279,26 +409,6 @@ class StatLogger:
         self._log_histogram(self.metrics.histogram_best_of_request,
                             stats.best_of_requests)
 
-    def _log_gauge(self, gauge: Gauge, data: Union[int, float]) -> None:
-        # Convenience function for logging to gauge.
-        gauge.labels(**self.labels).set(data)
-
-    def _log_counter(self, counter: Counter, data: Union[int, float]) -> None:
-        # Convenience function for logging to counter.
-        counter.labels(**self.labels).inc(data)
-
-    def _log_counter_labels(self, counter: Counter, data: CollectionsCounter,
-                            label_key: str) -> None:
-        # Convenience function for collection counter of labels.
-        for label, count in data.items():
-            counter.labels(**{**self.labels, label_key: label}).inc(count)
-
-    def _log_histogram(self, histogram: Histogram,
-                       data: Union[List[int], List[float]]) -> None:
-        # Convenience function for logging list to histogram.
-        for datum in data:
-            histogram.labels(**self.labels).observe(datum)
-
     def _log_prometheus_interval(self, prompt_throughput: float,
                                  generation_throughput: float) -> None:
         # Logs metrics to prometheus that are computed every logging_interval.
@@ -313,11 +423,8 @@ class StatLogger:
         self.metrics.gauge_avg_generation_throughput.labels(
             **self.labels).set(generation_throughput)
 
-    def log(self, stats: Stats) -> None:
-        """Called by LLMEngine.
-           Logs to prometheus and tracked stats every iteration.
-           Logs to Stdout every self.local_interval seconds."""
-
+    def log(self, stats: Stats):
+        """Logs to prometheus and tracked stats every iteration."""
         # Log to prometheus.
         self._log_prometheus(stats)
 
@@ -326,50 +433,28 @@ class StatLogger:
         self.num_generation_tokens.append(stats.num_generation_tokens_iter)
 
         # Log locally every local_interval seconds.
-        if self._local_interval_elapsed(stats.now):
+        if local_interval_elapsed(stats.now, self.last_local_log,
+                                  self.local_interval):
             # Compute summary metrics for tracked stats (and log them
             # to promethus if applicable).
-            prompt_throughput = self._get_throughput(self.num_prompt_tokens,
-                                                     now=stats.now)
-            generation_throughput = self._get_throughput(
-                self.num_generation_tokens, now=stats.now)
+            prompt_throughput = get_throughput(self.num_prompt_tokens,
+                                               now=stats.now,
+                                               last_log=self.last_local_log)
+            generation_throughput = get_throughput(
+                self.num_generation_tokens,
+                now=stats.now,
+                last_log=self.last_local_log)
+
             self._log_prometheus_interval(
                 prompt_throughput=prompt_throughput,
                 generation_throughput=generation_throughput)
 
-            # Log to stdout.
-            logger.info(
-                "Avg prompt throughput: %.1f tokens/s, "
-                "Avg generation throughput: %.1f tokens/s, "
-                "Running: %d reqs, Swapped: %d reqs, "
-                "Pending: %d reqs, GPU KV cache usage: %.1f%%, "
-                "CPU KV cache usage: %.1f%%.",
-                prompt_throughput,
-                generation_throughput,
-                stats.num_running_sys,
-                stats.num_swapped_sys,
-                stats.num_waiting_sys,
-                stats.gpu_cache_usage_sys * 100,
-                stats.cpu_cache_usage_sys * 100,
-            )
-
             # Reset tracked stats for next interval.
             self.num_prompt_tokens = []
             self.num_generation_tokens = []
             self.last_local_log = stats.now
 
-            if stats.spec_decode_metrics is not None:
-                logger.info(
-                    self._format_spec_decode_metrics_str(
-                        stats.spec_decode_metrics))
-
-    def _format_spec_decode_metrics_str(
-            self, metrics: "SpecDecodeWorkerMetrics") -> str:
 
-        return ("Speculative metrics: "
-                f"Draft acceptance rate: {metrics.draft_acceptance_rate:.3f}, "
-                f"System efficiency: {metrics.system_efficiency:.3f}, "
-                f"Number of speculative tokens: {metrics.num_spec_tokens}, "
-                f"Number of accepted tokens: {metrics.accepted_tokens}, "
-                f"Number of draft tokens tokens: {metrics.draft_tokens}, "
-                f"Number of emitted tokens tokens: {metrics.emitted_tokens}.")
+class RayPrometheusStatLogger(PrometheusStatLogger):
+    """RayPrometheusStatLogger uses Ray metrics instead."""
+    _metrics_cls = RayMetrics
-- 
GitLab


From ba4994443afc6a8249ed726c5ebd09b2c57a3b00 Mon Sep 17 00:00:00 2001
From: Joe Runde <joe@joerun.de>
Date: Fri, 28 Jun 2024 20:48:25 -0600
Subject: [PATCH 201/376] [Kernel] Add punica dimensions for Granite 3b and 8b
 (#5930)

Signed-off-by: Joe Runde <joe@joerun.de>
---
 csrc/punica/bgmv/bgmv_config.h | 2 ++
 tests/lora/test_punica.py      | 1 +
 2 files changed, 3 insertions(+)

diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h
index cb6694b30..2c8d007d8 100644
--- a/csrc/punica/bgmv/bgmv_config.h
+++ b/csrc/punica/bgmv/bgmv_config.h
@@ -86,6 +86,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 36864) \
     f(in_T, out_T, W_T, narrow, 43264) \
     f(in_T, out_T, W_T, narrow, 49152) \
+    f(in_T, out_T, W_T, narrow, 49408) \
     f(in_T, out_T, W_T, narrow, 60544) \
     f(in_T, out_T, W_T, narrow, 60672) \
     f(in_T, out_T, W_T, narrow, 64000) \
@@ -182,6 +183,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, 36864, narrow) \
     f(in_T, out_T, W_T, 43264, narrow) \
     f(in_T, out_T, W_T, 49152, narrow) \
+    f(in_T, out_T, W_T, 49408, narrow) \
     f(in_T, out_T, W_T, 60544, narrow) \
     f(in_T, out_T, W_T, 60672, narrow) \
     f(in_T, out_T, W_T, 64000, narrow) \
diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py
index 110c9b243..dbeb16cb2 100644
--- a/tests/lora/test_punica.py
+++ b/tests/lora/test_punica.py
@@ -111,6 +111,7 @@ H1 = H2 = [
     36864,
     43264,
     49152,
+    49408,
     60544,
     60672,
     64000,
-- 
GitLab


From 580353da93ee0d96a19964241e16f92e6a6d6142 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Fri, 28 Jun 2024 20:10:21 -0700
Subject: [PATCH 202/376] [Bugfix] Fix precisions in Gemma 1 (#5913)

---
 tests/models/test_models.py         |  1 +
 vllm/model_executor/models/gemma.py | 25 +++++++++++--------------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/tests/models/test_models.py b/tests/models/test_models.py
index 71238d690..4453b4b9f 100644
--- a/tests/models/test_models.py
+++ b/tests/models/test_models.py
@@ -17,6 +17,7 @@ MODELS = [
     "stabilityai/stablelm-3b-4e1t",
     # "allenai/OLMo-1B",  # Broken
     "bigcode/starcoder2-3b",
+    "google/gemma-1.1-2b-it",
 ]
 
 
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index ce97fc808..efefb3481 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -26,14 +26,14 @@ from vllm.config import CacheConfig, LoRAConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import GeluAndMul
-from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.layernorm import GemmaRMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
-from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.rotary_embedding import GemmaRotaryEmbedding
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
@@ -148,12 +148,14 @@ class GemmaAttention(nn.Module):
             quant_config=quant_config,
         )
 
-        self.rotary_emb = get_rope(
+        # TODO(woosuk): Use the `get_rope` interface.
+        self.rotary_emb = GemmaRotaryEmbedding(
             self.head_dim,
             rotary_dim=self.head_dim,
-            max_position=max_position_embeddings,
+            max_position_embeddings=max_position_embeddings,
             base=self.rope_theta,
             is_neox_style=True,
+            dtype=torch.get_default_dtype(),
         )
         self.attn = Attention(self.num_heads,
                               self.head_dim,
@@ -204,10 +206,10 @@ class GemmaDecoderLayer(nn.Module):
             hidden_activation=getattr(config, "hidden_activation", None),
             quant_config=quant_config,
         )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = GemmaRMSNorm(config.hidden_size,
+                                            eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GemmaRMSNorm(config.hidden_size,
+                                                     eps=config.rms_norm_eps)
 
     def forward(
         self,
@@ -257,7 +259,7 @@ class GemmaModel(nn.Module):
             GemmaDecoderLayer(config, cache_config, quant_config)
             for _ in range(config.num_hidden_layers)
         ])
-        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         # Normalize the embedding by sqrt(hidden_size)
         # The normalizer's data type should be downcasted to the model's
@@ -331,7 +333,6 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA):
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
-    @torch.no_grad()
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -388,10 +389,6 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA):
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
-                # GemmaRMSNorm is different from Llama's in that it multiplies
-                # (1 + weight) to the output, instead of just weight.
-                if "norm.weight" in name:
-                    loaded_weight += 1.0
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
-- 
GitLab


From 329df38f1a931215062d7b43660ceee1f83c0ab5 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Fri, 28 Jun 2024 23:34:29 -0700
Subject: [PATCH 203/376] [Misc] Update Phi-3-Vision Example (#5981)

Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 examples/phi3v_example.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py
index 46b7be5cd..f0b9b0e4f 100644
--- a/examples/phi3v_example.py
+++ b/examples/phi3v_example.py
@@ -10,8 +10,10 @@ from vllm.multimodal.image import ImagePixelData
 def run_phi3v():
     model_path = "microsoft/Phi-3-vision-128k-instruct"
 
-    # Note: The model has 128k context length by default which may cause OOM
-    # In this example, we override max_model_len to 2048.
+    # Note: The default setting of max_num_seqs (256) and
+    # max_model_len (128k) for this model may cause OOM.
+    # In this example, we override max_num_seqs to 5 while
+    # keeping the original context length of 128k.
     llm = LLM(
         model=model_path,
         trust_remote_code=True,
@@ -19,7 +21,7 @@ def run_phi3v():
         image_token_id=32044,
         image_input_shape="1,3,1008,1344",
         image_feature_size=1921,
-        max_model_len=2048,
+        max_num_seqs=5,
     )
 
     image = Image.open("images/cherry_blossom.jpg")
-- 
GitLab


From 51e971d39e1272f1c5b070a5da6b38ccfa92fc14 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 29 Jun 2024 19:19:02 +0800
Subject: [PATCH 204/376] [Bugfix] Support `eos_token_id` from `config.json`
 (#5954)

---
 tests/tokenization/test_get_eos.py | 31 ++++++++++++++++++++++++++++++
 vllm/engine/llm_engine.py          | 23 ++++++++++++----------
 vllm/transformers_utils/config.py  | 24 ++++++++++++++++++++++-
 3 files changed, 67 insertions(+), 11 deletions(-)
 create mode 100644 tests/tokenization/test_get_eos.py

diff --git a/tests/tokenization/test_get_eos.py b/tests/tokenization/test_get_eos.py
new file mode 100644
index 000000000..875ca19d3
--- /dev/null
+++ b/tests/tokenization/test_get_eos.py
@@ -0,0 +1,31 @@
+"""
+This test file includes some cases where it is inappropriate to
+only get the `eos_token_id` from the tokenizer as defined by
+:meth:`vllm.LLMEngine._get_eos_token_id`.
+"""
+from vllm.transformers_utils.config import try_get_generation_config
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+
+def test_get_llama3_eos_token():
+    model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
+
+    tokenizer = get_tokenizer(model_name)
+    assert tokenizer.eos_token_id == 128009
+
+    generation_config = try_get_generation_config(model_name,
+                                                  trust_remote_code=False)
+    assert generation_config is not None
+    assert generation_config.eos_token_id == [128001, 128009]
+
+
+def test_get_blip2_eos_token():
+    model_name = "Salesforce/blip2-opt-2.7b"
+
+    tokenizer = get_tokenizer(model_name)
+    assert tokenizer.eos_token_id == 2
+
+    generation_config = try_get_generation_config(model_name,
+                                                  trust_remote_code=False)
+    assert generation_config is not None
+    assert generation_config.eos_token_id == 50118
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 808a639f5..f7e38c0e6 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1,10 +1,10 @@
 import time
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, ClassVar, Dict, Iterable, List, Optional
+from typing import TYPE_CHECKING, Any, ClassVar, Dict, Iterable, List, Optional
 from typing import Sequence as GenericSequence
 from typing import Set, Type, TypeVar, Union
 
-from transformers import GenerationConfig, PreTrainedTokenizer
+from transformers import PreTrainedTokenizer
 
 from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig,
                          LoRAConfig, ModelConfig, ObservabilityConfig,
@@ -34,6 +34,7 @@ from vllm.sequence import (EmbeddingSequenceGroupOutput, ExecuteModelRequest,
                            SequenceStatus)
 from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context,
                           init_tracer)
+from vllm.transformers_utils.config import try_get_generation_config
 from vllm.transformers_utils.detokenizer import Detokenizer
 from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup,
                                                      get_tokenizer_group)
@@ -46,16 +47,18 @@ logger = init_logger(__name__)
 _LOCAL_LOGGING_INTERVAL_SEC = 5
 
 
-def _load_generation_config_dict(model_config: ModelConfig):
-    try:
-        return GenerationConfig.from_pretrained(
-            model_config.model,
-            revision=model_config.revision,
-        ).to_diff_dict()
-    except OSError:
-        # Not found.
+def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]:
+    config = try_get_generation_config(
+        model_config.model,
+        trust_remote_code=model_config.trust_remote_code,
+        revision=model_config.revision,
+    )
+
+    if config is None:
         return {}
 
+    return config.to_diff_dict()
+
 
 _O = TypeVar("_O", RequestOutput, EmbeddingRequestOutput)
 
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 60fc756a1..5e2fe116d 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -1,7 +1,7 @@
 import contextlib
 from typing import Dict, Optional, Type
 
-from transformers import PretrainedConfig
+from transformers import GenerationConfig, PretrainedConfig
 
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
@@ -80,3 +80,25 @@ def get_hf_text_config(config: PretrainedConfig):
         return config.text_config
     else:
         return config
+
+
+def try_get_generation_config(
+    model: str,
+    trust_remote_code: bool,
+    revision: Optional[str] = None,
+) -> Optional[GenerationConfig]:
+    try:
+        return GenerationConfig.from_pretrained(
+            model,
+            revision=revision,
+        )
+    except OSError:  # Not found
+        try:
+            config = get_config(
+                model,
+                trust_remote_code=trust_remote_code,
+                revision=revision,
+            )
+            return GenerationConfig.from_model_config(config)
+        except OSError:  # Not found
+            return None
-- 
GitLab


From 7c01f706418d593b3cf23d2ec9110dca7151c539 Mon Sep 17 00:00:00 2001
From: Antoni Baum <antoni.baum@protonmail.com>
Date: Sat, 29 Jun 2024 05:47:53 -0700
Subject: [PATCH 205/376] [Core] Optimize `SequenceStatus.is_finished` by
 switching to IntEnum (#5974)

---
 vllm/sequence.py | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index 13746cef2..22cb26dc0 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -39,24 +39,21 @@ PromptLogprobs = List[Optional[Dict[int, Logprob]]]
 SampleLogprobs = List[Dict[int, Logprob]]
 
 
-class SequenceStatus(enum.Enum):
+class SequenceStatus(enum.IntEnum):
     """Status of a sequence."""
-    WAITING = enum.auto()
-    RUNNING = enum.auto()
-    SWAPPED = enum.auto()
-    FINISHED_STOPPED = enum.auto()
-    FINISHED_LENGTH_CAPPED = enum.auto()
-    FINISHED_ABORTED = enum.auto()
-    FINISHED_IGNORED = enum.auto()
+    WAITING = 0
+    RUNNING = 1
+    SWAPPED = 2
+    # Note: anything after SWAPPED (2) will be considered
+    # as a finished status.
+    FINISHED_STOPPED = 3
+    FINISHED_LENGTH_CAPPED = 4
+    FINISHED_ABORTED = 5
+    FINISHED_IGNORED = 6
 
     @staticmethod
     def is_finished(status: "SequenceStatus") -> bool:
-        return status in [
-            SequenceStatus.FINISHED_STOPPED,
-            SequenceStatus.FINISHED_LENGTH_CAPPED,
-            SequenceStatus.FINISHED_ABORTED,
-            SequenceStatus.FINISHED_IGNORED,
-        ]
+        return status > SequenceStatus.SWAPPED
 
     @staticmethod
     def get_finished_reason(status: "SequenceStatus") -> Union[str, None]:
-- 
GitLab


From f7dac83d95ae38973b425a8bb2d3a3df9fe9a9c2 Mon Sep 17 00:00:00 2001
From: Cody Yu <hao.yu.cody@gmail.com>
Date: Sat, 29 Jun 2024 06:04:20 -0700
Subject: [PATCH 206/376] [Kernel] Raise an exception in MoE kernel if the
 batch size is larger then 65k (#5939)

---
 vllm/model_executor/layers/fused_moe/fused_moe.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index b750fc713..ecab77a8b 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -423,6 +423,11 @@ def fused_experts(hidden_states: torch.Tensor,
     M, _ = hidden_states.shape
     E, N, _ = w1.shape
 
+    if M > 65536:
+        # https://github.com/vllm-project/vllm/issues/5938
+        raise ValueError("MoE kernel does not support more than 65536 tokens, "
+                         f"but got {M}")
+
     if override_config:
         config = override_config
     else:
-- 
GitLab


From 8dbfcd35bf2313dedc9e947a991b6e0044248589 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sat, 29 Jun 2024 09:12:58 -0400
Subject: [PATCH 207/376] [ CI/Build ] Added E2E Test For Compressed Tensors
 (#5839)

Co-authored-by: Michael Goin <michael@neuralmagic.com>
Co-authored-by: Robert Shaw <rshaw@neuralmagic>
---
 requirements-test.txt                         |  2 +
 tests/conftest.py                             |  4 ++
 tests/models/test_compressed_tensors.py       | 49 +++++++++++++++++++
 .../compressed_tensors/compressed_tensors.py  |  3 +-
 4 files changed, 57 insertions(+), 1 deletion(-)
 create mode 100644 tests/models/test_compressed_tensors.py

diff --git a/requirements-test.txt b/requirements-test.txt
index 3ebfc1654..a7604d2e1 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -14,6 +14,8 @@ peft
 requests
 ray
 sentence-transformers # required for embedding
+sparseml==1.8.0 # required for compressed-tensors
+compressed-tensors==0.4.0 # required for compressed-tensors
 
 # Benchmarking
 aiohttp
diff --git a/tests/conftest.py b/tests/conftest.py
index 9d00c7676..b429d8d0b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -176,6 +176,7 @@ class HfRunner:
         model_kwargs: Optional[Dict[str, Any]] = None,
         is_embedding_model: bool = False,
         is_vision_model: bool = False,
+        is_sparseml_model: bool = False,
     ) -> None:
         assert dtype in _STR_DTYPE_TO_TORCH_DTYPE
         torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
@@ -193,6 +194,9 @@ class HfRunner:
         else:
             if is_vision_model:
                 auto_cls = AutoModelForVision2Seq
+            elif is_sparseml_model:
+                from sparseml.transformers import SparseAutoModelForCausalLM
+                auto_cls = SparseAutoModelForCausalLM
             else:
                 auto_cls = AutoModelForCausalLM
 
diff --git a/tests/models/test_compressed_tensors.py b/tests/models/test_compressed_tensors.py
new file mode 100644
index 000000000..9a0054c5a
--- /dev/null
+++ b/tests/models/test_compressed_tensors.py
@@ -0,0 +1,49 @@
+"""Compares vllm vs sparseml for compressed-tensors
+
+Note: vllm and sparseml do not have bitwise correctness, 
+so in this test, we just confirm that the top selected 
+tokens of the are in the top 5 selections of each other.
+"""
+
+import pytest
+
+from tests.quantization.utils import is_quant_method_supported
+
+from .utils import check_logprobs_close
+
+MODELS = [
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test",
+]
+
+MAX_TOKENS = 32
+NUM_LOGPROBS = 5
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("compressed-tensors"),
+    reason="compressed-tensors is not supported on this machine type.")
+@pytest.mark.parametrize("model_name", MODELS)
+def test_models(
+    vllm_runner,
+    hf_runner,
+    example_prompts,
+    model_name,
+) -> None:
+    # Run sparseml.
+    with hf_runner(model_name=model_name,
+                   is_sparseml_model=True) as sparseml_model:
+
+        sparseml_outputs = sparseml_model.generate_greedy_logprobs_limit(
+            example_prompts, MAX_TOKENS, NUM_LOGPROBS)
+
+    # Run vllm.
+    with vllm_runner(model_name=model_name) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, MAX_TOKENS, NUM_LOGPROBS)
+
+    check_logprobs_close(
+        outputs_0_lst=sparseml_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="sparseml",
+        name_1="vllm",
+    )
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index c69e2f3bc..0cf224cc0 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -34,7 +34,8 @@ class CompressedTensorsConfig(QuantizationConfig):
         return [torch.float16, torch.bfloat16]
 
     # Need to figure it out
-    def get_min_capability(self) -> int:
+    @classmethod
+    def get_min_capability(cls) -> int:
         return 60
 
     def get_name(self) -> str:
-- 
GitLab


From 99397da5349226c553debfd37469a6de724d3f24 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 29 Jun 2024 23:45:54 +0800
Subject: [PATCH 208/376] [CI/Build] Add TP test for vision models (#5892)

---
 .buildkite/test-pipeline.yaml                 |  5 ++
 .../distributed/test_multimodal_broadcast.py  | 51 +++++++++++++++++++
 tests/models/test_llava.py                    | 39 +++++++++++---
 tests/models/test_phi3v.py                    | 49 +++++++++++++-----
 .../device_communicators/shm_broadcast.py     |  1 +
 vllm/distributed/parallel_state.py            |  4 +-
 vllm/model_executor/models/llava.py           |  2 +-
 vllm/model_executor/models/llava_next.py      |  2 +-
 vllm/model_executor/models/phi3v.py           |  5 +-
 9 files changed, 131 insertions(+), 27 deletions(-)
 create mode 100644 tests/distributed/test_multimodal_broadcast.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 023696f3c..0a0bb5567 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -44,6 +44,7 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   commands:
+  - bash ../.buildkite/download-images.sh
   # FIXIT: find out which code initialize cuda before running the test
   # before the fix, we need to use spawn to test it
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -52,10 +53,14 @@ steps:
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
+  - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
+  - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
diff --git a/tests/distributed/test_multimodal_broadcast.py b/tests/distributed/test_multimodal_broadcast.py
new file mode 100644
index 000000000..41c3fd9e7
--- /dev/null
+++ b/tests/distributed/test_multimodal_broadcast.py
@@ -0,0 +1,51 @@
+"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
+The second test will hang if more than one test is run per command, so we need
+to run the tests one by one. The solution is to pass arguments (model name) by
+environment variables.
+
+Run:
+```sh
+TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf \
+    test_multimodal_broadcast.py
+TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct \
+    test_multimodal_broadcast.py
+```
+"""
+import os
+
+import pytest
+
+from vllm.utils import cuda_device_count_stateless
+
+model = os.environ["TEST_DIST_MODEL"]
+
+if model.startswith("llava-hf/llava"):
+    from ..models.test_llava import model_and_vl_config, run_test
+elif model.startswith("microsoft/Phi-3-vision"):
+    from ..models.test_phi3v import model_and_vl_config, run_test
+else:
+    raise NotImplementedError(f"Unsupported model: {model}")
+
+
+@pytest.mark.parametrize("tensor_parallel_size", [2])
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+def test_models(hf_runner, vllm_runner, image_assets,
+                tensor_parallel_size: int, dtype: str,
+                max_tokens: int) -> None:
+    if cuda_device_count_stateless() < tensor_parallel_size:
+        pytest.skip(
+            f"Need at least {tensor_parallel_size} GPUs to run the test.")
+
+    distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND")
+
+    run_test(
+        hf_runner,
+        vllm_runner,
+        image_assets,
+        model_and_config=model_and_vl_config[0],
+        dtype=dtype,
+        max_tokens=max_tokens,
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+    )
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index ac1d2ece6..f2dfd4bb8 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -1,11 +1,11 @@
-from typing import List, Tuple
+from typing import List, Optional, Tuple, Type
 
 import pytest
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
 
-from ..conftest import IMAGE_ASSETS
+from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
 
 pytestmark = pytest.mark.vlm
 
@@ -65,12 +65,17 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
     return hf_output_ids, hf_output_str
 
 
-# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
-@pytest.mark.parametrize("model_and_config", model_and_vl_config)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                dtype: str, max_tokens: int) -> None:
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    image_assets: _ImageAssets,
+    model_and_config: Tuple[str, VisionLanguageConfig],
+    *,
+    dtype: str,
+    max_tokens: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
     """Inference result should be the same between hf and vllm.
 
     All the image fixtures for the test is under tests/images.
@@ -96,6 +101,8 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
 
     with vllm_runner(model_id,
                      dtype=dtype,
+                     tensor_parallel_size=tensor_parallel_size,
+                     distributed_executor_backend=distributed_executor_backend,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
@@ -110,3 +117,19 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
             f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
         assert hf_output_ids == vllm_output_ids, (
             f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+
+
+@pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
+                dtype: str, max_tokens: int) -> None:
+    run_test(
+        hf_runner,
+        vllm_runner,
+        image_assets,
+        model_and_config,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        tensor_parallel_size=1,
+    )
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 03c130466..e7d563949 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -1,4 +1,4 @@
-from typing import List, Tuple
+from typing import List, Optional, Tuple, Type
 
 import pytest
 from transformers import AutoTokenizer
@@ -6,7 +6,7 @@ from transformers import AutoTokenizer
 from vllm.config import VisionLanguageConfig
 from vllm.utils import is_cpu
 
-from ..conftest import IMAGE_ASSETS
+from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
 
 pytestmark = pytest.mark.vlm
 
@@ -73,17 +73,17 @@ if is_cpu():
     target_dtype = "bfloat16"
 
 
-# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
-# Since we use _attn_implementation="eager" for hf_runner, here is
-# numeric difference for longer context and test can't pass
-@pytest.mark.xfail(
-    reason="Inconsistent image processor being used due to lack "
-    "of support for dynamic image token replacement")
-@pytest.mark.parametrize("model_and_config", model_and_vl_config)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [128])
-def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                dtype: str, max_tokens: int) -> None:
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    image_assets: _ImageAssets,
+    model_and_config: Tuple[str, VisionLanguageConfig],
+    *,
+    dtype: str,
+    max_tokens: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
     """Inference result should be the same between hf and vllm.
 
     All the image fixtures for the test is under tests/images.
@@ -116,7 +116,9 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     with vllm_runner(model_id,
                      max_model_len=2048,
                      dtype=dtype,
+                     tensor_parallel_size=tensor_parallel_size,
                      enforce_eager=True,
+                     distributed_executor_backend=distributed_executor_backend,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
                                                   max_tokens,
@@ -130,3 +132,24 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
             f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
         assert hf_output_ids == vllm_output_ids, (
             f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+
+
+# Since we use _attn_implementation="eager" for hf_runner, here is
+# numeric difference for longer context and test can't pass
+@pytest.mark.xfail(
+    reason="Inconsistent image processor being used due to lack "
+    "of support for dynamic image token replacement")
+@pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [128])
+def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
+                dtype: str, max_tokens: int) -> None:
+    run_test(
+        hf_runner,
+        vllm_runner,
+        image_assets,
+        model_and_config,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        tensor_parallel_size=1,
+    )
diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index 550271f88..bea205882 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -268,6 +268,7 @@ class ShmRingBufferIO:
         else:
             return self.dequeue()
 
+    @staticmethod
     def create_from_process_group(pg: ProcessGroup,
                                   max_chunk_bytes,
                                   max_chunks,
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 0c4ee0eb2..4ebb8703e 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -194,7 +194,7 @@ class GroupCoordinator:
         self.shm_broadcaster: Optional[ShmRingBufferIO] = None
         if self.world_size > 1 and is_in_the_same_node(self.cpu_group):
             self.shm_broadcaster = ShmRingBufferIO.create_from_process_group(
-                self.cpu_group, 1 << 20, 6)
+                self.cpu_group, 1 << 22, 6)
 
     @property
     def first_rank(self):
@@ -690,6 +690,8 @@ class GroupCoordinator:
             self.pynccl_comm = None
         if self.ca_comm is not None:
             self.ca_comm = None
+        if self.shm_broadcaster is not None:
+            self.shm_broadcaster = None
 
 
 _WORLD: Optional[GroupCoordinator] = None
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index bdcb63317..ba4496f9c 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -219,7 +219,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
 
         # NOTE: we skip the step to select the vision feature layer since
         # this is already done inside the vision tower
-        image_features = vision_tower(pixel_values.to(vision_tower.device),
+        image_features = vision_tower(pixel_values,
                                       self.config.vision_feature_layer)
 
         return self._select_image_features(
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index cebc82816..281431074 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -301,7 +301,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
 
         # NOTE: we skip the step to select the vision feature layer since
         # this is already done inside the vision tower
-        image_features = vision_tower(pixel_values.to(vision_tower.device),
+        image_features = vision_tower(pixel_values,
                                       self.config.vision_feature_layer)
 
         return self._select_image_features(
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 5d8ffd521..bc3d3f0fb 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -157,7 +157,6 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
 
         select = False
 
-        target_device = self.img_projection[0].bias.device
         target_dtype = self.img_projection[0].bias.dtype
 
         if len(positions.tolist()) > 0:
@@ -231,7 +230,7 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
             img_set_tensor = []
             for _output_img in output_imgs:
                 img_feature_proj = self.img_projection(
-                    _output_img.to(target_device, target_dtype))
+                    _output_img.to(target_dtype))
                 img_set_tensor.append(img_feature_proj)
             select = True
 
@@ -245,7 +244,7 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
                 hidden_states[positions[idx, 0],
                               positions[idx, 1]:positions[idx, 1] +
                               cnt] = (img_set_tensor[i].to(
-                                  hidden_states.device, hidden_states.dtype))
+                                  hidden_states.dtype))
                 idx += cnt
 
         return hidden_states.squeeze(0)
-- 
GitLab


From 75aa1442dbc76c43804e8dd528eff1aae3b45d1e Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sat, 29 Jun 2024 13:04:30 -0400
Subject: [PATCH 209/376] [ CI/Build ] LM Eval Harness Based CI Testing (#5838)

Co-authored-by: Robert Shaw <rshaw@neuralmagic>
---
 .../configs/Meta-Llama-3-70B-Instruct.yaml    | 11 ++++
 .../configs/Meta-Llama-3-8B-Instruct-FP8.yaml | 11 ++++
 .../configs/Meta-Llama-3-8B-Instruct.yaml     | 11 ++++
 .../configs/Mixtral-8x7B-Instruct-v0.1.yaml   | 11 ++++
 .../lm-eval-harness/configs/models-large.txt  |  2 +
 .../lm-eval-harness/configs/models-small.txt  |  2 +
 .../run-lm-eval-gsm-hf-baseline.sh            | 46 +++++++++++++++
 .../run-lm-eval-gsm-vllm-baseline.sh          | 51 ++++++++++++++++
 .buildkite/lm-eval-harness/run-tests.sh       | 59 +++++++++++++++++++
 .../test_lm_eval_correctness.py               | 54 +++++++++++++++++
 .buildkite/test-pipeline.yaml                 | 16 +++++
 11 files changed, 274 insertions(+)
 create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
 create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
 create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
 create mode 100644 .buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
 create mode 100644 .buildkite/lm-eval-harness/configs/models-large.txt
 create mode 100644 .buildkite/lm-eval-harness/configs/models-small.txt
 create mode 100644 .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
 create mode 100644 .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
 create mode 100644 .buildkite/lm-eval-harness/run-tests.sh
 create mode 100644 .buildkite/lm-eval-harness/test_lm_eval_correctness.py

diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
new file mode 100644
index 000000000..fa6ea236e
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
+model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.892
+  - name: "exact_match,flexible-extract"
+    value: 0.892
+limit: 250
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
new file mode 100644
index 000000000..02668702b
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
+model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.756
+  - name: "exact_match,flexible-extract"
+    value: 0.752
+limit: 250
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
new file mode 100644
index 000000000..fb4b4915a
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
+model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.756
+  - name: "exact_match,flexible-extract"
+    value: 0.752
+limit: 250
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
new file mode 100644
index 000000000..dec9164d1
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
+model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.616
+  - name: "exact_match,flexible-extract"
+    value: 0.632
+limit: 250
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt
new file mode 100644
index 000000000..127ec5d97
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@@ -0,0 +1,2 @@
+Meta-Llama-3-70B-Instruct.yaml
+Mixtral-8x7B-Instruct-v0.1.yaml
diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt
new file mode 100644
index 000000000..273c5482d
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@@ -0,0 +1,2 @@
+Meta-Llama-3-8B-Instruct.yaml
+Meta-Llama-3-8B-Instruct-FP8.yaml
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
new file mode 100644
index 000000000..fdb8ec539
--- /dev/null
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on GSM for transformers.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on GSM8k using huggingface transformers."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our automated nm-test-accuracy workflow"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -b    - batch size to run the evaluation at"
+    echo "  -l    - limit number of samples to run"
+    echo "  -f    - number of fewshot samples to use"
+    echo
+}
+
+while getopts "m:b:l:f:" OPT; do
+  case ${OPT} in
+    m ) 
+        MODEL="$OPTARG"
+        ;;
+    b ) 
+        BATCH_SIZE="$OPTARG"
+        ;;
+    l ) 
+        LIMIT="$OPTARG"
+        ;;
+    f ) 
+        FEWSHOT="$OPTARG"
+        ;;
+    \? ) 
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model hf \
+  --model_args pretrained=$MODEL,parallelize=True \
+  --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
+  --batch_size $BATCH_SIZE
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
new file mode 100644
index 000000000..a2876bade
--- /dev/null
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on GSM for vllm.
+# We use this for fp8, which HF does not support.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install lm-eval==0.4.2
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on GSM8k using huggingface transformers."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our automated nm-test-accuracy workflow"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -b    - batch size to run the evaluation at"
+    echo "  -l    - limit number of samples to run"
+    echo "  -f    - number of fewshot samples to use"
+    echo "  -t    - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:b:l:f:t:" OPT; do
+  case ${OPT} in
+    m ) 
+        MODEL="$OPTARG"
+        ;;
+    b ) 
+        BATCH_SIZE="$OPTARG"
+        ;;
+    l ) 
+        LIMIT="$OPTARG"
+        ;;
+    f ) 
+        FEWSHOT="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? ) 
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm \
+  --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE \
+  --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
+  --batch_size $BATCH_SIZE
diff --git a/.buildkite/lm-eval-harness/run-tests.sh b/.buildkite/lm-eval-harness/run-tests.sh
new file mode 100644
index 000000000..b4fdde6da
--- /dev/null
+++ b/.buildkite/lm-eval-harness/run-tests.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on GSM8k using vllm and compares to "
+    echo "precomputed baseline (measured by HF transformers.)"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
+    echo "  -t    - tensor parallel size"
+    echo
+}
+
+SUCCESS=0
+
+while getopts "c:t:" OPT; do
+  case ${OPT} in
+    c ) 
+        CONFIG="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+# Parse list of configs.
+IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
+
+for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
+do
+    LOCAL_SUCCESS=0
+    
+    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
+
+    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
+    export LM_EVAL_TP_SIZE=$TP_SIZE
+    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
+
+    if [[ $LOCAL_SUCCESS == 0 ]]; then
+        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
+    else
+        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
+    fi
+
+    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
+
+done
+
+if [ "${SUCCESS}" -eq "0" ]; then
+    exit 0
+else
+    exit 1
+fi
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
new file mode 100644
index 000000000..975841dad
--- /dev/null
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -0,0 +1,54 @@
+"""
+LM eval harness on model to compare vs HF baseline computed offline.
+Configs are found in configs/$MODEL.yaml
+
+* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
+* export LM_EVAL_TP_SIZE=4 
+* pytest -s test_lm_eval_correctness.py
+"""
+
+import os
+from pathlib import Path
+
+import lm_eval
+import numpy
+import yaml
+
+RTOL = 0.02
+TEST_DATA_FILE = os.environ.get(
+    "LM_EVAL_TEST_DATA_FILE",
+    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
+
+TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
+
+
+def launch_lm_eval(eval_config):
+    model_args = f"pretrained={eval_config['model_name']}," \
+                 f"tensor_parallel_size={TP_SIZE}"
+
+    results = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=model_args,
+        tasks=[task["name"] for task in eval_config["tasks"]],
+        num_fewshot=eval_config["num_fewshot"],
+        limit=eval_config["limit"],
+        batch_size="auto")
+
+    return results
+
+
+def test_lm_eval_correctness():
+    eval_config = yaml.safe_load(
+        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
+
+    # Launch eval requests.
+    results = launch_lm_eval(eval_config)
+
+    # Confirm scores match ground truth.
+    for task in eval_config["tasks"]:
+        for metric in task["metrics"]:
+            ground_truth = metric["value"]
+            measured_value = results["results"][task["name"]][metric["name"]]
+            print(f'{task["name"]} | {metric["name"]}: '
+                  f'ground_truth={ground_truth} | measured={measured_value}')
+            assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 0a0bb5567..6ed3b5be5 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -197,6 +197,22 @@ steps:
   - pip install aiohttp
   - bash run-benchmarks.sh
 
+- label: LM Eval Small Models
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  commands:
+  - pip install lm-eval
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - bash ./run-tests.sh -c configs/models-small.txt -t 1
+
+- label: LM Eval Large Models
+  gpu: a100
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  commands:
+  - pip install lm-eval
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - bash ./run-tests.sh -c configs/models-large.txt -t 4
+
 - label: Documentation Build
   working_dir: "/vllm-workspace/test_docs/docs"
   no_gpu: True
-- 
GitLab


From 9def10664e8b54dcc5c6114f2895bc9e712bf182 Mon Sep 17 00:00:00 2001
From: Matt Wong <156021403+mawong-amd@users.noreply.github.com>
Date: Sat, 29 Jun 2024 14:47:58 -0500
Subject: [PATCH 210/376] [Bugfix][CI/Build][Hardware][AMD] Install matching
 torchvision to fix AMD tests (#5949)

---
 Dockerfile.rocm                       | 18 ++++++++++++------
 tests/entrypoints/test_openai_chat.py |  4 ++--
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 652f04adf..1b89b892b 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -55,16 +55,22 @@ RUN apt-get purge -y sccache; pip uninstall -y sccache; rm -f "$(which sccache)"
 # Install torch == 2.4.0 on ROCm
 RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
         *"rocm-5.7"*) \
-            pip uninstall -y torch \
-            && pip install --no-cache-dir --pre torch==2.4.0.dev20240612 \
+            pip uninstall -y torch torchaudio torchvision \
+            && pip install --no-cache-dir --pre \
+                torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
+                torchvision==0.19.0.dev20240612 \
                --index-url https://download.pytorch.org/whl/nightly/rocm5.7;; \
         *"rocm-6.0"*) \
-            pip uninstall -y torch \
-            && pip install --no-cache-dir --pre torch==2.4.0.dev20240612 \
+            pip uninstall -y torch torchaudio torchvision \
+            && pip install --no-cache-dir --pre \
+                torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
+                torchvision==0.19.0.dev20240612 \
                --index-url https://download.pytorch.org/whl/nightly/rocm6.0;; \
         *"rocm-6.1"*) \
-            pip uninstall -y torch \
-            && pip install --no-cache-dir --pre torch==2.4.0.dev20240612 \
+            pip uninstall -y torch torchaudio torchvision \
+            && pip install --no-cache-dir --pre \
+                torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
+                torchvision==0.19.0.dev20240612 \
                --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
         *) ;; esac
 
diff --git a/tests/entrypoints/test_openai_chat.py b/tests/entrypoints/test_openai_chat.py
index 1c46a5110..52e647170 100644
--- a/tests/entrypoints/test_openai_chat.py
+++ b/tests/entrypoints/test_openai_chat.py
@@ -14,7 +14,7 @@ import torch
 from huggingface_hub import snapshot_download
 from openai import BadRequestError
 
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -79,7 +79,7 @@ def zephyr_lora_files():
 
 @pytest.fixture(scope="module")
 def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
+    ray.init()
     yield
     ray.shutdown()
 
-- 
GitLab


From bcc6a09b63aeb3efce964b54a756d431e580aebc Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Sat, 29 Jun 2024 18:18:31 -0700
Subject: [PATCH 211/376] [CI/Build] Temporarily Remove Phi3-Vision from TP
 Test (#5989)

---
 .buildkite/test-pipeline.yaml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 6ed3b5be5..307ada611 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -60,7 +60,8 @@ steps:
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
-  - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
+  # FIXIT: find out why TP is failing with mp backend on phi3-v 
+  # - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
@@ -234,4 +235,4 @@ steps:
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
   - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.5/flashinfer-0.0.5+cu121torch2.3-cp310-cp310-linux_x86_64.whl
   - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
\ No newline at end of file
+  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-- 
GitLab


From cff6a1fec15dba524c162d20a4e8de4df2b0a3d5 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sun, 30 Jun 2024 11:44:25 +0800
Subject: [PATCH 212/376] [CI/Build] Reuse code for checking output consistency
 (#5988)

---
 .../test_basic_correctness.py                 | 15 +++----
 .../basic_correctness/test_chunked_prefill.py | 15 +++----
 tests/basic_correctness/test_preemption.py    | 16 ++++----
 .../test_basic_distributed_correctness.py     | 15 +++----
 .../test_chunked_prefill_distributed.py       | 15 +++----
 tests/models/test_big_models.py               | 15 +++----
 tests/models/test_llava.py                    | 18 +++++----
 tests/models/test_llava_next.py               | 18 +++++----
 tests/models/test_models.py                   | 15 +++----
 tests/models/test_phi3v.py                    | 18 +++++----
 tests/models/utils.py                         | 40 ++++++++++++++++++-
 11 files changed, 125 insertions(+), 75 deletions(-)

diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 6f44030fe..a7b0fef53 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -8,6 +8,8 @@ import pytest
 
 from vllm import LLM
 
+from ..models.utils import check_outputs_equal
+
 MODELS = [
     "facebook/opt-125m",
     "meta-llama/Llama-2-7b-hf",
@@ -46,10 +48,9 @@ def test_models(
                      gpu_memory_utilization=0.7) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
-    for i in range(len(example_prompts)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index 48d609128..767e06287 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -8,6 +8,8 @@ Run `pytest tests/models/test_chunked_prefill.py`.
 """
 import pytest
 
+from ..models.utils import check_outputs_equal
+
 MODELS = [
     "facebook/opt-125m",
     "meta-llama/Llama-2-7b-hf",
@@ -54,10 +56,9 @@ def test_models(
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
-    for i in range(len(example_prompts)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py
index 7f20b2d93..d60cc95d7 100644
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -12,6 +12,8 @@ from vllm import SamplingParams
 from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
                                  ENABLE_ARTIFICIAL_PREEMPT)
 
+from ..models.utils import check_outputs_equal
+
 MODELS = [
     "facebook/opt-125m",
 ]
@@ -94,13 +96,13 @@ def test_preemption(
         total_preemption = (
             vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
 
-    for i in range(len(example_prompts)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
+
     assert ("is preempted by PreemptionMode.RECOMPUTE mode because there "
             "is not enough KV cache space." in caplog_vllm.text)
     # Ensure the count bucket of request-level histogram metrics matches
diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py
index b8ae5b4c4..1f5fff3e1 100644
--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
@@ -17,6 +17,8 @@ import os
 import pytest
 import torch
 
+from ..models.utils import check_outputs_equal
+
 MODELS = [
     os.environ["TEST_DIST_MODEL"],
 ]
@@ -48,10 +50,9 @@ def test_models(
                      ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
-    for i in range(len(example_prompts)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py
index 4e4e468c4..fd89147ac 100644
--- a/tests/distributed/test_chunked_prefill_distributed.py
+++ b/tests/distributed/test_chunked_prefill_distributed.py
@@ -16,6 +16,8 @@ import os
 import pytest
 import torch
 
+from ..models.utils import check_outputs_equal
+
 MODELS = [
     os.environ["TEST_DIST_MODEL"],
 ]
@@ -59,10 +61,9 @@ def test_models(
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
-    for i in range(len(example_prompts)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py
index ef7828373..c3e48b56e 100644
--- a/tests/models/test_big_models.py
+++ b/tests/models/test_big_models.py
@@ -7,6 +7,8 @@ Run `pytest tests/models/test_big_models.py`.
 import pytest
 import torch
 
+from .utils import check_outputs_equal
+
 MODELS = [
     "meta-llama/Llama-2-7b-hf",
     # "mistralai/Mistral-7B-v0.1",  # Tested by test_mistral.py
@@ -40,13 +42,12 @@ def test_models(
     with vllm_runner(model, dtype=dtype) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
-    for i in range(len(example_prompts)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
 
 
 @pytest.mark.parametrize("model", MODELS)
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index f2dfd4bb8..c60b15afc 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -6,6 +6,7 @@ from transformers import AutoTokenizer
 from vllm.config import VisionLanguageConfig
 
 from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from .utils import check_outputs_equal
 
 pytestmark = pytest.mark.vlm
 
@@ -109,14 +110,15 @@ def run_test(
                                                   max_tokens,
                                                   images=vllm_images)
 
-    for i in range(len(HF_IMAGE_PROMPTS)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_to_hf_output(
-            vllm_outputs[i], vlm_config, model_id)
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+    check_outputs_equal(
+        hf_outputs,
+        [
+            vllm_to_hf_output(vllm_output, vlm_config, model_id)
+            for vllm_output in vllm_outputs
+        ],
+        name_0="hf",
+        name_1="vllm",
+    )
 
 
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index d36e50387..940d5035e 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -6,6 +6,7 @@ from transformers import AutoTokenizer
 from vllm.config import VisionLanguageConfig
 
 from ..conftest import IMAGE_ASSETS
+from .utils import check_outputs_equal
 
 pytestmark = pytest.mark.vlm
 
@@ -115,11 +116,12 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
                                                   max_tokens,
                                                   images=vllm_images)
 
-    for i in range(len(HF_IMAGE_PROMPTS)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_to_hf_output(
-            vllm_outputs[i], vlm_config, model_id)
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+    check_outputs_equal(
+        hf_outputs,
+        [
+            vllm_to_hf_output(vllm_output, vlm_config, model_id)
+            for vllm_output in vllm_outputs
+        ],
+        name_0="hf",
+        name_1="vllm",
+    )
diff --git a/tests/models/test_models.py b/tests/models/test_models.py
index 4453b4b9f..4cd2cb665 100644
--- a/tests/models/test_models.py
+++ b/tests/models/test_models.py
@@ -7,6 +7,8 @@ Run `pytest tests/models/test_models.py`.
 """
 import pytest
 
+from .utils import check_outputs_equal
+
 MODELS = [
     "facebook/opt-125m",
     "gpt2",
@@ -41,13 +43,12 @@ def test_models(
     with vllm_runner(model, dtype=dtype) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
-    for i in range(len(example_prompts)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
 
 
 @pytest.mark.parametrize("model", MODELS)
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index e7d563949..2e34fa8c1 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -7,6 +7,7 @@ from vllm.config import VisionLanguageConfig
 from vllm.utils import is_cpu
 
 from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from .utils import check_outputs_equal
 
 pytestmark = pytest.mark.vlm
 
@@ -124,14 +125,15 @@ def run_test(
                                                   max_tokens,
                                                   images=vllm_images)
 
-    for i in range(len(HF_IMAGE_PROMPTS)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_to_hf_output(
-            vllm_outputs[i], vlm_config, model_id)
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+    check_outputs_equal(
+        hf_outputs,
+        [
+            vllm_to_hf_output(vllm_output, vlm_config, model_id)
+            for vllm_output in vllm_outputs
+        ],
+        name_0="hf",
+        name_1="vllm",
+    )
 
 
 # Since we use _attn_implementation="eager" for hf_runner, here is
diff --git a/tests/models/utils.py b/tests/models/utils.py
index 3e49dfb33..0d5e304d8 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -1,7 +1,43 @@
-def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1):
-    """Compare the logprobs of two sequences generated by different models, 
+from typing import Dict, List, Tuple
+
+TokensText = Tuple[List[int], str]
+
+
+def check_outputs_equal(outputs_0_lst: List[TokensText],
+                        outputs_1_lst: List[TokensText], name_0: str,
+                        name_1: str):
+    """
+    Compare the two sequences generated by different models, 
+    which should be equal.
+    """
+    assert len(outputs_0_lst) == len(outputs_1_lst)
+
+    for prompt_idx, (outputs_0,
+                     outputs_1) in enumerate(zip(outputs_0_lst,
+                                                 outputs_1_lst)):
+        output_ids_0, output_str_0 = outputs_0
+        output_ids_1, output_str_1 = outputs_1
+
+        assert output_str_0 == output_str_1, (f"Test{prompt_idx}:"
+                                              f"\n{name_0}:\t{output_str_0!r}"
+                                              f"\n{name_1}:\t{output_str_1!r}")
+        assert output_ids_0 == output_ids_1, (f"Test{prompt_idx}:"
+                                              f"\n{name_0}:\t{output_str_0!r}"
+                                              f"\n{name_1}:\t{output_str_1!r}")
+
+
+TokensTextLogprobs = Tuple[List[int], str, List[Dict[int, float]]]
+
+
+def check_logprobs_close(outputs_0_lst: List[TokensTextLogprobs],
+                         outputs_1_lst: List[TokensTextLogprobs], name_0: str,
+                         name_1: str):
+    """
+    Compare the logprobs of two sequences generated by different models,
     which should be similar but not necessarily equal.
     """
+    assert len(outputs_0_lst) == len(outputs_1_lst)
+
     # Loop through responses to each prompt.
     for prompt_idx, (outputs_0,
                      outputs_1) in enumerate(zip(outputs_0_lst,
-- 
GitLab


From 9d47f64eb6f35e42840b1e4ca6dc68167014abcd Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sun, 30 Jun 2024 12:58:49 +0800
Subject: [PATCH 213/376] [CI/Build] [3/3] Reorganize entrypoints tests (#5966)

---
 .buildkite/test-pipeline.yaml                       |  4 ++--
 pyproject.toml                                      |  2 --
 tests/entrypoints/llm/__init__.py                   |  0
 .../{test_llm_encode.py => llm/test_encode.py}      |  4 +---
 .../{test_llm_generate.py => llm/test_generate.py}  |  4 +---
 .../test_generate_multiple_loras.py}                |  4 +---
 tests/entrypoints/openai/__init__.py                |  0
 .../{test_openai_chat.py => openai/test_chat.py}    |  4 +---
 .../test_completion.py}                             |  4 +---
 .../test_embedding.py}                              |  4 +---
 .../{ => openai}/test_guided_processors.py          |  2 --
 .../test_models.py}                                 |  4 +---
 .../test_oot_registration.py}                       |  3 ---
 .../test_run_batch.py}                              |  0
 tests/entrypoints/openai/test_serving_chat.py       |  4 ----
 .../test_vision.py}                                 | 13 +++----------
 tests/utils.py                                      | 11 +++++++----
 17 files changed, 19 insertions(+), 48 deletions(-)
 create mode 100644 tests/entrypoints/llm/__init__.py
 rename tests/entrypoints/{test_llm_encode.py => llm/test_encode.py} (98%)
 rename tests/entrypoints/{test_llm_generate.py => llm/test_generate.py} (98%)
 rename tests/entrypoints/{test_llm_generate_multiple_loras.py => llm/test_generate_multiple_loras.py} (96%)
 create mode 100644 tests/entrypoints/openai/__init__.py
 rename tests/entrypoints/{test_openai_chat.py => openai/test_chat.py} (99%)
 rename tests/entrypoints/{test_openai_completion.py => openai/test_completion.py} (99%)
 rename tests/entrypoints/{test_openai_embedding.py => openai/test_embedding.py} (97%)
 rename tests/entrypoints/{ => openai}/test_guided_processors.py (99%)
 rename tests/entrypoints/{test_openai_server.py => openai/test_models.py} (96%)
 rename tests/entrypoints/{test_server_oot_registration.py => openai/test_oot_registration.py} (97%)
 rename tests/entrypoints/{test_openai_run_batch.py => openai/test_run_batch.py} (100%)
 rename tests/entrypoints/{test_openai_vision.py => openai/test_vision.py} (96%)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 307ada611..c102a5321 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -89,8 +89,8 @@ steps:
   mirror_hardwares: [amd]
 
   commands:
-  - pytest -v -s entrypoints -m llm
-  - pytest -v -s entrypoints -m openai
+  - pytest -v -s entrypoints/llm
+  - pytest -v -s entrypoints/openai
 
 - label: Examples Test
   working_dir: "/vllm-workspace/examples"
diff --git a/pyproject.toml b/pyproject.toml
index 4958aae02..790e01362 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -69,7 +69,5 @@ skip_gitignore = true
 [tool.pytest.ini_options]
 markers = [
     "skip_global_cleanup",
-    "llm: run tests for vLLM API only",
-    "openai: run tests for OpenAI API only",
     "vlm: run tests for vision language models only",
 ]
diff --git a/tests/entrypoints/llm/__init__.py b/tests/entrypoints/llm/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/entrypoints/test_llm_encode.py b/tests/entrypoints/llm/test_encode.py
similarity index 98%
rename from tests/entrypoints/test_llm_encode.py
rename to tests/entrypoints/llm/test_encode.py
index 7c3fbe43a..d1056a049 100644
--- a/tests/entrypoints/test_llm_encode.py
+++ b/tests/entrypoints/llm/test_encode.py
@@ -5,7 +5,7 @@ import pytest
 
 from vllm import LLM, EmbeddingRequestOutput, PoolingParams
 
-from ..conftest import cleanup
+from ...conftest import cleanup
 
 MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
 
@@ -25,8 +25,6 @@ TOKEN_IDS = [
     [1000, 1003, 1001, 1002],
 ]
 
-pytestmark = pytest.mark.llm
-
 
 @pytest.fixture(scope="module")
 def llm():
diff --git a/tests/entrypoints/test_llm_generate.py b/tests/entrypoints/llm/test_generate.py
similarity index 98%
rename from tests/entrypoints/test_llm_generate.py
rename to tests/entrypoints/llm/test_generate.py
index a00fff91a..57ac37f7e 100644
--- a/tests/entrypoints/test_llm_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -5,7 +5,7 @@ import pytest
 
 from vllm import LLM, RequestOutput, SamplingParams
 
-from ..conftest import cleanup
+from ...conftest import cleanup
 
 MODEL_NAME = "facebook/opt-125m"
 
@@ -23,8 +23,6 @@ TOKEN_IDS = [
     [0, 3, 1, 2],
 ]
 
-pytestmark = pytest.mark.llm
-
 
 @pytest.fixture(scope="module")
 def llm():
diff --git a/tests/entrypoints/test_llm_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py
similarity index 96%
rename from tests/entrypoints/test_llm_generate_multiple_loras.py
rename to tests/entrypoints/llm/test_generate_multiple_loras.py
index 176daa472..35eabf079 100644
--- a/tests/entrypoints/test_llm_generate_multiple_loras.py
+++ b/tests/entrypoints/llm/test_generate_multiple_loras.py
@@ -7,7 +7,7 @@ from huggingface_hub import snapshot_download
 from vllm import LLM
 from vllm.lora.request import LoRARequest
 
-from ..conftest import cleanup
+from ...conftest import cleanup
 
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 
@@ -20,8 +20,6 @@ PROMPTS = [
 
 LORA_NAME = "typeof/zephyr-7b-beta-lora"
 
-pytestmark = pytest.mark.llm
-
 
 @pytest.fixture(scope="module")
 def llm():
diff --git a/tests/entrypoints/openai/__init__.py b/tests/entrypoints/openai/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/entrypoints/test_openai_chat.py b/tests/entrypoints/openai/test_chat.py
similarity index 99%
rename from tests/entrypoints/test_openai_chat.py
rename to tests/entrypoints/openai/test_chat.py
index 52e647170..f4c0af1ad 100644
--- a/tests/entrypoints/test_openai_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -14,7 +14,7 @@ import torch
 from huggingface_hub import snapshot_download
 from openai import BadRequestError
 
-from ..utils import RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -69,8 +69,6 @@ TEST_CHOICE = [
     "Swift", "Kotlin"
 ]
 
-pytestmark = pytest.mark.openai
-
 
 @pytest.fixture(scope="module")
 def zephyr_lora_files():
diff --git a/tests/entrypoints/test_openai_completion.py b/tests/entrypoints/openai/test_completion.py
similarity index 99%
rename from tests/entrypoints/test_openai_completion.py
rename to tests/entrypoints/openai/test_completion.py
index da5de3666..b05035713 100644
--- a/tests/entrypoints/test_openai_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -16,7 +16,7 @@ from openai import BadRequestError
 
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from ..utils import RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -71,8 +71,6 @@ TEST_CHOICE = [
     "Swift", "Kotlin"
 ]
 
-pytestmark = pytest.mark.openai
-
 
 @pytest.fixture(scope="module")
 def zephyr_lora_files():
diff --git a/tests/entrypoints/test_openai_embedding.py b/tests/entrypoints/openai/test_embedding.py
similarity index 97%
rename from tests/entrypoints/test_openai_embedding.py
rename to tests/entrypoints/openai/test_embedding.py
index 45f701733..82a5627aa 100644
--- a/tests/entrypoints/test_openai_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -2,12 +2,10 @@ import openai
 import pytest
 import ray
 
-from ..utils import RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
 
 EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
 
-pytestmark = pytest.mark.openai
-
 
 @pytest.fixture(scope="module")
 def ray_ctx():
diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/openai/test_guided_processors.py
similarity index 99%
rename from tests/entrypoints/test_guided_processors.py
rename to tests/entrypoints/openai/test_guided_processors.py
index fb32a9d15..27568d3e7 100644
--- a/tests/entrypoints/test_guided_processors.py
+++ b/tests/entrypoints/openai/test_guided_processors.py
@@ -52,8 +52,6 @@ TEST_SCHEMA = {
 TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
               r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
 
-pytestmark = pytest.mark.openai
-
 
 def test_guided_logits_processors():
     """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/openai/test_models.py
similarity index 96%
rename from tests/entrypoints/test_openai_server.py
rename to tests/entrypoints/openai/test_models.py
index ef0d30131..fddfd7550 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/openai/test_models.py
@@ -6,7 +6,7 @@ import ray
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
 
-from ..utils import RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -14,8 +14,6 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 # generation quality here
 LORA_NAME = "typeof/zephyr-7b-beta-lora"
 
-pytestmark = pytest.mark.openai
-
 
 @pytest.fixture(scope="module")
 def zephyr_lora_files():
diff --git a/tests/entrypoints/test_server_oot_registration.py b/tests/entrypoints/openai/test_oot_registration.py
similarity index 97%
rename from tests/entrypoints/test_server_oot_registration.py
rename to tests/entrypoints/openai/test_oot_registration.py
index 3e55d7f42..dbbda6de1 100644
--- a/tests/entrypoints/test_server_oot_registration.py
+++ b/tests/entrypoints/openai/test_oot_registration.py
@@ -1,7 +1,6 @@
 import sys
 import time
 
-import pytest
 import torch
 from openai import OpenAI, OpenAIError
 
@@ -10,8 +9,6 @@ from vllm.model_executor.models.opt import OPTForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.utils import get_open_port
 
-pytestmark = pytest.mark.openai
-
 
 class MyOPTForCausalLM(OPTForCausalLM):
 
diff --git a/tests/entrypoints/test_openai_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
similarity index 100%
rename from tests/entrypoints/test_openai_run_batch.py
rename to tests/entrypoints/openai/test_run_batch.py
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index c45f02fe5..74b497267 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -1,15 +1,11 @@
 import asyncio
 from dataclasses import dataclass
 
-import pytest
-
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 
 MODEL_NAME = "openai-community/gpt2"
 CHAT_TEMPLATE = "Dummy chat template for testing {}"
 
-pytestmark = pytest.mark.openai
-
 
 @dataclass
 class MockModelConfig:
diff --git a/tests/entrypoints/test_openai_vision.py b/tests/entrypoints/openai/test_vision.py
similarity index 96%
rename from tests/entrypoints/test_openai_vision.py
rename to tests/entrypoints/openai/test_vision.py
index df092680a..dbaaa349a 100644
--- a/tests/entrypoints/test_openai_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -1,4 +1,3 @@
-from pathlib import Path
 from typing import Dict, List
 
 import openai
@@ -8,12 +7,12 @@ import ray
 
 from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64
 
-from ..utils import RemoteOpenAIServer
+from ...utils import VLLM_PATH, RemoteOpenAIServer
 
 MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
-LLAVA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent /
-                       "examples/template_llava.jinja")
+LLAVA_CHAT_TEMPLATE = VLLM_PATH / "examples/template_llava.jinja"
 assert LLAVA_CHAT_TEMPLATE.exists()
+
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
 TEST_IMAGE_URLS = [
     "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
@@ -22,8 +21,6 @@ TEST_IMAGE_URLS = [
     "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
 ]
 
-pytestmark = pytest.mark.openai
-
 
 @pytest.fixture(scope="module")
 def ray_ctx():
@@ -279,7 +276,3 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
     )
     completion = completion.choices[0].text
     assert completion is not None and len(completion) >= 0
-
-
-if __name__ == "__main__":
-    pytest.main([__file__])
diff --git a/tests/utils.py b/tests/utils.py
index 2a5f82b91..09107b5e7 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -4,7 +4,8 @@ import sys
 import time
 import warnings
 from contextlib import contextmanager
-from typing import Dict, List
+from pathlib import Path
+from typing import Any, Dict, List
 
 import openai
 import ray
@@ -40,8 +41,8 @@ else:
             nvmlShutdown()
 
 
-# Path to root of repository so that utilities can be imported by ray workers
-VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))
+VLLM_PATH = Path(__file__).parent.parent
+"""Path to root of the vLLM repository."""
 
 
 class RemoteOpenAIServer:
@@ -153,10 +154,12 @@ def init_test_distributed_environment(
 def multi_process_parallel(
     tp_size: int,
     pp_size: int,
-    test_target,
+    test_target: Any,
 ) -> None:
     # Using ray helps debugging the error when it failed
     # as compared to multiprocessing.
+    # NOTE: We need to set working_dir for distributed tests,
+    # otherwise we may get import errors on ray workers
     ray.init(runtime_env={"working_dir": VLLM_PATH})
 
     distributed_init_port = get_open_port()
-- 
GitLab


From 2be6955a3fd596b33be92a2927f55ee0779a4690 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 30 Jun 2024 01:06:13 -0700
Subject: [PATCH 214/376] [ci][distributed] fix device count call

[ci][distributed] fix some cuda init that makes it necessary to use spawn (#5991)
---
 .buildkite/test-pipeline.yaml                 | 12 +-----
 tests/conftest.py                             | 21 +++++++---
 .../test_basic_distributed_correctness.py     | 15 ++++---
 .../test_chunked_prefill_distributed.py       | 14 +++++--
 tests/models/test_llava.py                    | 30 ++++++++-----
 tests/models/test_phi3v.py                    | 42 ++++++++++++-------
 6 files changed, 83 insertions(+), 51 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index c102a5321..6931659db 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -45,9 +45,6 @@ steps:
   num_gpus: 2
   commands:
   - bash ../.buildkite/download-images.sh
-  # FIXIT: find out which code initialize cuda before running the test
-  # before the fix, we need to use spawn to test it
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
@@ -60,8 +57,7 @@ steps:
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
-  # FIXIT: find out why TP is failing with mp backend on phi3-v 
-  # - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
+  - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
@@ -71,9 +67,6 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   commands:
-  # FIXIT: find out which code initialize cuda before running the test
-  # before the fix, we need to use spawn to test it
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s distributed/test_pynccl.py
   # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
   # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
@@ -225,9 +218,6 @@ steps:
   gpu: a100
   num_gpus: 4
   commands: 
-  # FIXIT: find out which code initialize cuda before running the test
-  # before the fix, we need to use spawn to test it
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   # NOTE: don't test llama model here, it seems hf implementation is buggy
   # see https://github.com/vllm-project/vllm/pull/5689 for details
   - pytest -v -s distributed/test_custom_all_reduce.py
diff --git a/tests/conftest.py b/tests/conftest.py
index b429d8d0b..0bd24905e 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,8 +5,8 @@ from collections import UserList
 from dataclasses import dataclass
 from functools import cached_property
 from pathlib import Path
-from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict,
-                    TypeVar)
+from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple,
+                    TypedDict, TypeVar)
 
 import pytest
 import torch
@@ -14,7 +14,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from PIL import Image
 from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
-                          AutoProcessor, AutoTokenizer, BatchEncoding)
+                          AutoTokenizer, BatchEncoding)
 
 from vllm import LLM, SamplingParams
 from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
@@ -22,8 +22,12 @@ from vllm.distributed import (destroy_distributed_environment,
                               destroy_model_parallel)
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
-from vllm.multimodal import MultiModalData
-from vllm.multimodal.image import ImageFeatureData, ImagePixelData
+
+if TYPE_CHECKING:
+    from vllm.multimodal import MultiModalData
+else:
+    # it will call torch.cuda.device_count()
+    MultiModalData = None
 from vllm.sequence import SampleLogprobs
 from vllm.utils import cuda_device_count_stateless, is_cpu
 
@@ -63,6 +67,10 @@ class ImageAsset:
         return self.pil_image
 
     def for_vllm(self, vision_config: VisionLanguageConfig) -> MultiModalData:
+        # don't put this import at the top level
+        # it will call torch.cuda.device_count()
+        from vllm.multimodal.image import ImageFeatureData  # noqa: F401
+        from vllm.multimodal.image import ImagePixelData
         image_input_type = vision_config.image_input_type
         ImageInputType = VisionLanguageConfig.ImageInputType
 
@@ -216,6 +224,9 @@ class HfRunner:
         )
 
         try:
+            # don't put this import at the top level
+            # it will call torch.cuda.device_count()
+            from transformers import AutoProcessor  # noqa: F401
             self.processor = AutoProcessor.from_pretrained(
                 model_name,
                 torch_dtype=torch_dtype,
diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py
index 1f5fff3e1..7a0e5673b 100644
--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
@@ -15,7 +15,8 @@ TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
 import os
 
 import pytest
-import torch
+
+from vllm.utils import cuda_device_count_stateless
 
 from ..models.utils import check_outputs_equal
 
@@ -25,7 +26,7 @@ MODELS = [
 DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND"
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
+@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                     reason="Need at least 2 GPUs to run the test.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
@@ -40,9 +41,10 @@ def test_models(
 ) -> None:
     distributed_executor_backend = os.getenv(DISTRIBUTED_EXECUTOR_BACKEND)
 
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
     with vllm_runner(model,
                      dtype=dtype,
                      tensor_parallel_size=2,
@@ -50,6 +52,9 @@ def test_models(
                      ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
     check_outputs_equal(
         outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_outputs,
diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py
index fd89147ac..1ef085b93 100644
--- a/tests/distributed/test_chunked_prefill_distributed.py
+++ b/tests/distributed/test_chunked_prefill_distributed.py
@@ -14,7 +14,8 @@ TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
 import os
 
 import pytest
-import torch
+
+from vllm.utils import cuda_device_count_stateless
 
 from ..models.utils import check_outputs_equal
 
@@ -24,7 +25,7 @@ MODELS = [
 DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND"
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
+@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                     reason="Need at least 2 GPUs to run the test.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
@@ -47,8 +48,10 @@ def test_models(
     enable_chunked_prefill = True
     max_num_batched_tokens = chunked_prefill_token_size
 
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
 
     with vllm_runner(
             model,
@@ -61,6 +64,9 @@ def test_models(
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
     check_outputs_equal(
         outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_outputs,
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index c60b15afc..b4220dc59 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -88,17 +88,11 @@ def run_test(
     """
     model_id, vlm_config = model_and_config
     hf_images = [asset.for_hf() for asset in image_assets]
-    vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
 
-    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
-        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
-                                              max_tokens,
-                                              images=hf_images)
-
-    vllm_image_prompts = [
-        p.replace("<image>", "<image>" * vlm_config.image_feature_size)
-        for p in HF_IMAGE_PROMPTS
-    ]
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
 
     with vllm_runner(model_id,
                      dtype=dtype,
@@ -106,10 +100,26 @@ def run_test(
                      distributed_executor_backend=distributed_executor_backend,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
+
+        # NOTE: `asset.for_vllm` will call `torch.cuda.device_count()`
+        # we must put it inside the vllm_runner context manager
+        # i.e. after creating vLLM instance.
+        vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
+
+        vllm_image_prompts = [
+            p.replace("<image>", "<image>" * vlm_config.image_feature_size)
+            for p in HF_IMAGE_PROMPTS
+        ]
+
         vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
                                                   max_tokens,
                                                   images=vllm_images)
 
+    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
+        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
+                                              max_tokens,
+                                              images=hf_images)
+
     check_outputs_equal(
         hf_outputs,
         [
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 2e34fa8c1..ba71763f9 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -96,23 +96,11 @@ def run_test(
     """
     model_id, vlm_config = model_and_config
     hf_images = [asset.for_hf() for asset in image_assets]
-    vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
 
-    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
-    hf_model_kwargs = {"_attn_implementation": "eager"}
-    with hf_runner(model_id, dtype=dtype,
-                   model_kwargs=hf_model_kwargs) as hf_model:
-        hf_outputs = hf_model.generate_greedy(
-            HF_IMAGE_PROMPTS,
-            max_tokens,
-            images=hf_images,
-            eos_token_id=hf_model.processor.tokenizer.eos_token_id)
-
-    vllm_image_prompts = [
-        p.replace("<|image_1|>",
-                  "<|image|>" * vlm_config.image_feature_size + "<s>")
-        for p in HF_IMAGE_PROMPTS
-    ]
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
 
     with vllm_runner(model_id,
                      max_model_len=2048,
@@ -121,10 +109,32 @@ def run_test(
                      enforce_eager=True,
                      distributed_executor_backend=distributed_executor_backend,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
+        # NOTE: `asset.for_vllm` will call `torch.cuda.device_count()`
+        # we must put it inside the vllm_runner context manager
+        # i.e. after creating vLLM instance.
+
+        vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
+
+        vllm_image_prompts = [
+            p.replace("<|image_1|>",
+                      "<|image|>" * vlm_config.image_feature_size + "<s>")
+            for p in HF_IMAGE_PROMPTS
+        ]
+
         vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
                                                   max_tokens,
                                                   images=vllm_images)
 
+    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
+    hf_model_kwargs = {"_attn_implementation": "eager"}
+    with hf_runner(model_id, dtype=dtype,
+                   model_kwargs=hf_model_kwargs) as hf_model:
+        hf_outputs = hf_model.generate_greedy(
+            HF_IMAGE_PROMPTS,
+            max_tokens,
+            images=hf_images,
+            eos_token_id=hf_model.processor.tokenizer.eos_token_id)
+
     check_outputs_equal(
         hf_outputs,
         [
-- 
GitLab


From c6c240aa0a2d4bed821282a07d50f6710cd99eed Mon Sep 17 00:00:00 2001
From: llmpros <10524065+llmpros@users.noreply.github.com>
Date: Sun, 30 Jun 2024 08:53:00 -0700
Subject: [PATCH 215/376] [Frontend]: Support base64 embedding (#5935)

Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 tests/entrypoints/openai/test_embedding.py   | 33 ++++++++++++++++++++
 vllm/entrypoints/openai/protocol.py          |  2 +-
 vllm/entrypoints/openai/serving_embedding.py | 26 +++++++--------
 3 files changed, 47 insertions(+), 14 deletions(-)

diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py
index 82a5627aa..7c7232dbc 100644
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -1,3 +1,6 @@
+import base64
+
+import numpy as np
 import openai
 import pytest
 import ray
@@ -109,3 +112,33 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
     assert embeddings.usage.completion_tokens == 0
     assert embeddings.usage.prompt_tokens == 17
     assert embeddings.usage.total_tokens == 17
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [EMBEDDING_MODEL_NAME],
+)
+async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
+                                      model_name: str):
+    input_texts = [
+        "Hello my name is",
+        "The best thing about vLLM is that it supports many different models"
+    ]
+
+    responses_float = await embedding_client.embeddings.create(
+        input=input_texts, model=model_name, encoding_format="float")
+
+    responses_base64 = await embedding_client.embeddings.create(
+        input=input_texts, model=model_name, encoding_format="base64")
+
+    decoded_responses_base64_data = []
+    for data in responses_base64.data:
+        decoded_responses_base64_data.append(
+            np.frombuffer(base64.b64decode(data.embedding),
+                          dtype="float").tolist())
+
+    assert responses_float.data[0].embedding == decoded_responses_base64_data[
+        0]
+    assert responses_float.data[1].embedding == decoded_responses_base64_data[
+        1]
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 0ad46cbea..d1568cb3a 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -580,7 +580,7 @@ class CompletionStreamResponse(OpenAIBaseModel):
 class EmbeddingResponseData(BaseModel):
     index: int
     object: str = "embedding"
-    embedding: List[float]
+    embedding: Union[List[float], str]
 
 
 class EmbeddingResponse(BaseModel):
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index cbf09f173..4838cb7d0 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -1,6 +1,8 @@
+import base64
 import time
 from typing import AsyncIterator, List, Optional, Tuple
 
+import numpy as np
 from fastapi import Request
 
 from vllm.config import ModelConfig
@@ -20,19 +22,18 @@ TypeTokenIDs = List[int]
 
 
 def request_output_to_embedding_response(
-    final_res_batch: List[EmbeddingRequestOutput],
-    request_id: str,
-    created_time: int,
-    model_name: str,
-) -> EmbeddingResponse:
+        final_res_batch: List[EmbeddingRequestOutput], request_id: str,
+        created_time: int, model_name: str,
+        encoding_format: str) -> EmbeddingResponse:
     data: List[EmbeddingResponseData] = []
     num_prompt_tokens = 0
     for idx, final_res in enumerate(final_res_batch):
         assert final_res is not None
         prompt_token_ids = final_res.prompt_token_ids
-
-        embedding_data = EmbeddingResponseData(
-            index=idx, embedding=final_res.outputs.embedding)
+        embedding = final_res.outputs.embedding
+        if encoding_format == "base64":
+            embedding = base64.b64encode(np.array(embedding))
+        embedding_data = EmbeddingResponseData(index=idx, embedding=embedding)
         data.append(embedding_data)
 
         num_prompt_tokens += len(prompt_token_ids)
@@ -72,10 +73,8 @@ class OpenAIServingEmbedding(OpenAIServing):
         if error_check_ret is not None:
             return error_check_ret
 
-        # Return error for unsupported features.
-        if request.encoding_format == "base64":
-            return self.create_error_response(
-                "base64 encoding is not currently supported")
+        encoding_format = (request.encoding_format
+                           if request.encoding_format else "float")
         if request.dimensions is not None:
             return self.create_error_response(
                 "dimensions is currently not supported")
@@ -129,7 +128,8 @@ class OpenAIServingEmbedding(OpenAIServing):
                     return self.create_error_response("Client disconnected")
                 final_res_batch[i] = res
             response = request_output_to_embedding_response(
-                final_res_batch, request_id, created_time, model_name)
+                final_res_batch, request_id, created_time, model_name,
+                encoding_format)
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
-- 
GitLab


From f5e73c9f1bfcffdac4ec97f038443c053ee6fed8 Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Mon, 1 Jul 2024 02:11:15 +0900
Subject: [PATCH 216/376] [Lora] Use safetensor keys instead of
 adapter_config.json to find unexpected modules.  (#5909)

Co-authored-by: sang <sangcho@anyscale.com>
---
 .buildkite/test-pipeline.yaml |  1 +
 tests/lora/conftest.py        |  4 ++-
 tests/lora/test_mixtral.py    |  4 +--
 vllm/lora/models.py           | 63 +++++++++++++++++++++++++----------
 4 files changed, 52 insertions(+), 20 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 6931659db..d96e3c6d1 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -226,3 +226,4 @@ steps:
   - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.5/flashinfer-0.0.5+cu121torch2.3-cp310-cp310-linux_x86_64.whl
   - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - pytest -v -s -x lora/test_mixtral.py
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 4eab73a71..bda123bf1 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -165,7 +165,9 @@ def sql_lora_files():
 
 @pytest.fixture(scope="session")
 def mixtral_lora_files():
-    return snapshot_download(repo_id="terrysun/mixtral-lora-adapter")
+    # Note: this module has incorrect adapter_config.json to test
+    # https://github.com/vllm-project/vllm/pull/5909/files.
+    return snapshot_download(repo_id="SangBinCho/mixtral-lora")
 
 
 @pytest.fixture(scope="session")
diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py
index e7e7724fc..b5b4a79eb 100644
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -40,14 +40,14 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
                    enable_lora=True,
                    max_num_seqs=16,
                    max_loras=4,
+                   distributed_executor_backend="ray",
                    tensor_parallel_size=tp_size)
 
     expected_lora_output = [
         "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])",  # noqa: E501
-        "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])",  # noqa: E501
+        "give_opinion(name[SpellForce 3], developer[Grimlore Games], release_year[2017], rating[poor])",  # noqa: E501
         "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])",  # noqa: E501
     ]
-
     assert do_sample(llm, mixtral_lora_files,
                      lora_id=1) == expected_lora_output
     assert do_sample(llm, mixtral_lora_files,
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 0a1fc7c02..689835def 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -303,25 +303,54 @@ class LoRAModel:
                                                     "new_embeddings.bin")
         with open(lora_config_path) as f:
             config = json.load(f)
-        target_modules = config["target_modules"]
-        unexpected_modules = []
-        for module in target_modules:
-            # Compatible with more modules, such as:layers.11.self_attn.k_proj
-            part_name = module.split(".")[-1]
-            if part_name not in expected_lora_modules:
-                unexpected_modules.append(module)
-        # loaded lora's target modules must be a subset of expected_lora_modules
-
-        if unexpected_modules:
-            print(unexpected_modules, "modules")
-            raise ValueError(
-                f"While loading {lora_dir}, expected"
-                f" target modules in {expected_lora_modules}"
-                f" but received {unexpected_modules}."
-                f" Please verify that the loaded LoRA module is correct")
         if os.path.isfile(lora_tensor_path):
-            tensors = safetensors.torch.load_file(lora_tensor_path)
+            tensors: Dict[str, torch.Tensor] = {}
+            # Find unexpected modules.
+            # Use safetensor key as a source of truth to find expected modules.
+            # in peft if you have target_modules A, B, C and C does not exist
+            # in the model it won’t error and model will be trained with A, B
+            # loraified. C won’t exist in the safetensor but it will exist in
+            # the target_modules of the adapter_config.json.
+            unexpected_modules = []
+            with safetensors.safe_open(lora_tensor_path,
+                                       framework="pt") as f:  # type: ignore
+                for lora_module in f.keys():  # noqa
+                    module_name, _ = parse_fine_tuned_lora_name(lora_module)
+                    part_name = module_name.split(".")[-1]
+                    if part_name not in expected_lora_modules:
+                        unexpected_modules.append(module_name)
+                if unexpected_modules:
+                    raise ValueError(
+                        f"While loading {lora_dir}, expected"
+                        f" target modules in {expected_lora_modules}"
+                        f" but received {unexpected_modules}."
+                        f" Please verify that the loaded LoRA module is correct"
+                    )
+                # Load tensors if there are only expected modules.
+                for module in f.keys():  # noqa
+                    tensors[module] = f.get_tensor(module)
         elif os.path.isfile(lora_bin_file_path):
+            # When a bin file is provided, we rely on config to find unexpected
+            # modules.
+            unexpected_modules = []
+            target_modules = config["target_modules"]
+            for module in target_modules:
+                # Compatible with more modules,
+                # such as:layers.11.self_attn.k_proj
+                part_name = module.split(".")[-1]
+                if part_name not in expected_lora_modules:
+                    unexpected_modules.append(module)
+            # loaded lora's target modules must be a subset of
+            # expected_lora_modules. It is not reliable. See
+            # https://github.com/vllm-project/vllm/pull/5909. But there's no
+            # other better mechanism.
+            if unexpected_modules:
+                print(unexpected_modules, "modules")
+                raise ValueError(
+                    f"While loading {lora_dir}, expected"
+                    f" target modules in {expected_lora_modules}"
+                    f" but received {unexpected_modules}."
+                    f" Please verify that the loaded LoRA module is correct")
             tensors = torch.load(lora_bin_file_path)
         else:
             raise ValueError(f"{lora_dir} doesn't contain tensors")
-- 
GitLab


From deacb7ec44cd816648eb856959472f1fef01f883 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sun, 30 Jun 2024 14:56:56 -0400
Subject: [PATCH 217/376] [ CI ] Temporarily Disable Large LM-Eval Tests
 (#6005)

Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic>
---
 .buildkite/test-pipeline.yaml | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d96e3c6d1..c51702886 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -198,15 +198,6 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - bash ./run-tests.sh -c configs/models-small.txt -t 1
 
-- label: LM Eval Large Models
-  gpu: a100
-  num_gpus: 4
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  commands:
-  - pip install lm-eval
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - bash ./run-tests.sh -c configs/models-large.txt -t 4
-
 - label: Documentation Build
   working_dir: "/vllm-workspace/test_docs/docs"
   no_gpu: True
-- 
GitLab


From 7836fdcc11aef8c4494a4470522c685c2190eddc Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Sun, 30 Jun 2024 16:15:16 -0400
Subject: [PATCH 218/376] [Misc] Fix `get_min_capability` (#5971)

---
 vllm/model_executor/layers/quantization/awq.py       |  3 ++-
 .../layers/quantization/base_config.py               |  3 ++-
 .../layers/quantization/bitsandbytes.py              |  2 +-
 .../compressed_tensors/compressed_tensors.py         | 12 ++++++++++--
 .../model_executor/layers/quantization/squeezellm.py |  3 ++-
 5 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py
index f4fc7ce02..a3854f70b 100644
--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@@ -43,7 +43,8 @@ class AWQConfig(QuantizationConfig):
     def get_supported_act_dtypes(self) -> List[torch.dtype]:
         return [torch.half]
 
-    def get_min_capability(self) -> int:
+    @classmethod
+    def get_min_capability(cls) -> int:
         # The AWQ kernel only supports Turing or newer GPUs.
         return 75
 
diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
index e7de283b5..c23b66161 100644
--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -44,8 +44,9 @@ class QuantizationConfig(ABC):
         """List of supported activation dtypes."""
         raise NotImplementedError
 
+    @classmethod
     @abstractmethod
-    def get_min_capability(self) -> int:
+    def get_min_capability(cls) -> int:
         """Minimum GPU capability to support the quantization method.
 
         E.g., 70 for Volta, 75 for Turing, 80 for Ampere.
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 969958d9b..e76714a7b 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -38,7 +38,7 @@ class BitsAndBytesConfig(QuantizationConfig):
         return [torch.float32, torch.float16, torch.bfloat16]
 
     @classmethod
-    def get_min_capability(self) -> int:
+    def get_min_capability(cls) -> int:
         return 70
 
     @staticmethod
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 0cf224cc0..a451427ec 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -33,10 +33,9 @@ class CompressedTensorsConfig(QuantizationConfig):
     def get_supported_act_dtypes(cls) -> List[torch.dtype]:
         return [torch.float16, torch.bfloat16]
 
-    # Need to figure it out
     @classmethod
     def get_min_capability(cls) -> int:
-        return 60
+        return 75
 
     def get_name(self) -> str:
         return "compressed_tensors"
@@ -84,6 +83,14 @@ class CompressedTensorsConfig(QuantizationConfig):
     def get_config_filenames(cls) -> List[str]:
         return []
 
+    def _check_gptq_and_marlin_can_run(self):
+        capability = torch.cuda.get_device_capability()
+        capability = capability[0] * 10 + capability[1]
+        if capability < 80:
+            raise RuntimeError("The quantization config is not supported for ",
+                               "the current GPU. Minimum capability: 80. ",
+                               f"Current capability: {capability}.")
+
     def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
                                input_quant: BaseModel) -> bool:
         is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
@@ -126,6 +133,7 @@ class CompressedTensorsConfig(QuantizationConfig):
                     input_quant: BaseModel) -> "CompressedTensorsScheme":
 
         if self._is_wNa16_group_channel(weight_quant, input_quant):
+            self._check_gptq_and_marlin_can_run()
             if (self.quant_format == CompressionFormat.marlin_24.value
                     and weight_quant.num_bits in W4A16SPARSE24_SUPPORTED_BITS):
                 return CompressedTensorsW4A16Sparse24(
diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py
index 207dbcee8..72ba55eb1 100644
--- a/vllm/model_executor/layers/quantization/squeezellm.py
+++ b/vllm/model_executor/layers/quantization/squeezellm.py
@@ -39,7 +39,8 @@ class SqueezeLLMConfig(QuantizationConfig):
     def get_supported_act_dtypes(self) -> List[torch.dtype]:
         return [torch.half]
 
-    def get_min_capability(self) -> int:
+    @classmethod
+    def get_min_capability(cls) -> int:
         return 70
 
     @staticmethod
-- 
GitLab


From af9ad46fca6e594797b83e5ecb2e1f31ca5e9fac Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sun, 30 Jun 2024 19:06:27 -0400
Subject: [PATCH 219/376] [ Misc ] Refactor w8a8 to use
 `process_weights_after_load` (Simplify Weight Loading) (#5940)

Co-authored-by: Robert Shaw <rshaw@neuralmagic>
---
 tests/quantization/test_compressed_tensors.py |  28 +++--
 tests/quantization/test_fp8.py                |  17 +++
 vllm/model_executor/layers/linear.py          | 110 +++++++-----------
 .../compressed_tensors/compressed_tensors.py  |   3 +
 .../schemes/compressed_tensors_scheme.py      |   8 ++
 .../schemes/compressed_tensors_unquantized.py |   3 +
 .../schemes/compressed_tensors_w4a16_24.py    |   3 +
 .../schemes/compressed_tensors_w8a8.py        |  91 +++++++--------
 .../schemes/compressed_tensors_wNa16.py       |   3 +
 .../model_executor/layers/quantization/fp8.py |  45 +++----
 10 files changed, 153 insertions(+), 158 deletions(-)

diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 6eb7ff72f..d5472f97a 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -11,14 +11,18 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
     CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
     CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor,
     CompressedTensorsWNA16)
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    QuantizationType)
 
 
 @pytest.mark.parametrize("model_args", [
-    ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", "tensor"),
-    ("nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", "channel"),
+    ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", "tensor",
+     QuantizationType.INT, 2560),
+    ("nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", "channel",
+     QuantizationType.INT, 2560),
 ])
 def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
-    model_path, strategy = model_args
+    model_path, strategy, quant_type, shape_0 = model_args
     with vllm_runner(model_path, enforce_eager=True) as llm:
         model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
         layer = model.model.layers[0]
@@ -34,17 +38,23 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
                           CompressedTensorsLinearMethod)
         assert isinstance(down_proj.quant_method,
                           CompressedTensorsLinearMethod)
-
         assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor)
 
         assert qkv_proj.scheme.strategy == strategy
-        assert qkv_proj.weight.dtype is torch.int8
-        assert o_proj.weight.dtype is torch.int8
-        assert gate_up_proj.weight.dtype is torch.int8
+        expected_type = (torch.int8 if quant_type == QuantizationType.INT else
+                         torch.float8_e4m3fn)
+
+        assert qkv_proj.weight.dtype is expected_type
+        assert o_proj.weight.dtype is expected_type
+        assert gate_up_proj.weight.dtype is expected_type
 
         if qkv_proj.scheme.strategy == "tensor":
-            assert qkv_proj.weight_scale.shard_splitter is not None
-            assert qkv_proj.weight_scale.logical_widths is not None
+            # Make sure it is a channelwise buffer
+            # After running process_weights_after_loading
+            assert len(qkv_proj.weight_scale.shape) == 2
+            assert qkv_proj.weight_scale.shape[0] == shape_0
+            assert qkv_proj.weight_scale.shape[1] == 1
+        assert qkv_proj.weight_scale.dtype is torch.float32
         assert qkv_proj.input_scale.dtype is torch.float32
 
 
diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index 74d21ead0..4d76ae707 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -9,6 +9,23 @@ from tests.quantization.utils import is_quant_method_supported
 from vllm._custom_ops import scaled_fp8_quant
 from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
 
+MODELS = [
+    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
+    "nm-testing/Phi-3-mini-128k-instruct-FP8",
+]
+
+
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
+                    reason="FP8 is not supported on this GPU type.")
+@pytest.mark.parametrize("model", MODELS)
+def test_model_load_and_run(vllm_runner, model: str):
+    with vllm_runner(model) as llm:
+        # note: this does not test accuracy, just that we can run through
+        # see lm-eval tests for accuracy
+        outputs = llm.generate_greedy(prompts=["Hello my name is"],
+                                      max_tokens=10)
+        print(outputs[0][1])
+
 
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
                     reason="FP8 is not supported on this GPU type.")
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index d221fecd6..3cc257834 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -41,6 +41,29 @@ def adjust_bitsandbytes_shard(param: Parameter,
     return quantized_size, quantized_offset
 
 
+def adjust_scalar_to_fused_array(param, loaded_weight, shard_id):
+    """For fused modules (QKV and MLP) we have an array of length
+    N that holds 1 scale for each "logical" matrix. So the param
+    is an array of length N. The loaded_weight corresponds to 
+    one of the shards on disk. Here, we slice the param based on 
+    the shard_id for loading.
+    """
+    qkv_idxs = {"q": 0, "k": 1, "v": 2}
+
+    if isinstance(shard_id, str):
+        shard_id = qkv_idxs[shard_id]
+    elif not isinstance(shard_id, int):
+        raise ValueError(f"Unknown Shard Id {shard_id}")
+
+    # AutoFP8 scales do not have a shape
+    # compressed-tensors scales do have a shape
+    if len(loaded_weight.shape) != 0:
+        assert loaded_weight.shape[0] == 1
+        loaded_weight = loaded_weight[0]
+
+    return param[shard_id], loaded_weight
+
+
 class LinearMethodBase(QuantizeMethodBase):
     """Base class for different (maybe quantized) linear methods."""
 
@@ -358,37 +381,15 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         output_dim = getattr(param, "output_dim", None)
         # Special case for AQLM codebooks.
         is_metadata = getattr(param, "is_metadata", False)
-
-        param_shard_splitter = getattr(param, "shard_splitter", None)
-
-        if output_dim is not None and param_shard_splitter is not None:
-            raise NotImplementedError(
-                "We do not currently support output_dim != None and "
-                "shard_splitter != None for a parameter. Please open an issue."
-            )
-        # If a parameter has defined a shard_splitter to be used for
-        # the weight, it should be applied before the weight is
-        # loaded/copied to the parameter. The shard_splitter applies
-        # logic by using the loaded_shard_id to ensure that the loaded
-        # param is loaded to the correct location
-        # within the parameter defined by the linear method.
-        if loaded_shard_id is None and param_shard_splitter is not None:
-            raise NotImplementedError(
-                "We do not currently support loaded_shard_id == None and "
-                "shard_splitter != None for a parameter. Please open an issue."
-            )
-
-        # Special case for Fp8 scales.
-        fp8_scales_shard_indexer = getattr(param, "fp8_scales_shard_indexer",
-                                           None)
+        # Special case for per-tensor scale to load scalar into fused array.
+        needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False)
 
         if loaded_shard_id is None:
             # Loaded weight is already fused on disk (qkv/mlp).
             if output_dim is None:
-                # If fp8 + scale, need to send to each shard.
-                if fp8_scales_shard_indexer is not None:
-                    param_data, loaded_weight = fp8_scales_shard_indexer(
-                        param_data, loaded_weight, loaded_shard_id)
+                if needs_scalar_to_array is not None:
+                    param_data, loaded_weight = adjust_scalar_to_fused_array(
+                        param_data, loaded_weight, 0)
 
                 assert param_data.shape == loaded_weight.shape
                 param_data.copy_(loaded_weight)
@@ -450,15 +451,9 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             shard_offset = loaded_shard_id * shard_size
             param_data = param_data.narrow(0, shard_offset, shard_size)
 
-        # If a param_shard_splitter is defined by the LinearMethod, use it.
-        elif param_shard_splitter is not None:
-            logical_widths = getattr(param, "logical_widths", None)
-            param_data, loaded_weight = param_shard_splitter(
-                param_data, loaded_weight, loaded_shard_id, logical_widths)
-
-        # Special case for Fp8 scales.
-        elif fp8_scales_shard_indexer is not None:
-            param_data, loaded_weight = fp8_scales_shard_indexer(
+        # Special case for per-tensor scales in fused case.
+        elif needs_scalar_to_array:
+            param_data, loaded_weight = adjust_scalar_to_fused_array(
                 param_data, loaded_weight, loaded_shard_id)
 
         else:
@@ -548,36 +543,15 @@ class QKVParallelLinear(ColumnParallelLinear):
         # Special case for AQLM codebooks.
         is_metadata = getattr(param, "is_metadata", False)
 
-        param_shard_splitter = getattr(param, "shard_splitter", None)
-
-        if output_dim is not None and param_shard_splitter is not None:
-            raise NotImplementedError(
-                "We do not currently support output_dim != None and "
-                "shard_splitter != None for a parameter. Please open an issue."
-            )
-        # If a parameter has defined a shard_splitter to be used for
-        # the weight, it should be applied before the weight is
-        # loaded/copied to the parameter. The shard_splitter applies
-        # logic by using the loaded_shard_id to ensure that the loaded
-        # param is loaded to the correct location
-        # within the parameter defined by the linear method.
-        if loaded_shard_id is None and param_shard_splitter is not None:
-            raise NotImplementedError(
-                "We do not currently support loaded_shard_id == None and "
-                "shard_splitter != None for a parameter. Please open an issue."
-            )
-
-        # Special case for Fp8 scales.
-        fp8_scales_shard_indexer = getattr(param, "fp8_scales_shard_indexer",
-                                           None)
+        # Special case for per-tensor scales in fused case.
+        needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False)
 
         if loaded_shard_id is None:
             # Loaded weight is already fused on disk (qkv/mlp).
             if output_dim is None:
-                # If fp8 + scale, need to send to each shard.
-                if fp8_scales_shard_indexer is not None:
-                    param_data, loaded_weight = fp8_scales_shard_indexer(
-                        param_data, loaded_weight, loaded_shard_id)
+                if needs_scalar_to_array is not None:
+                    param_data, loaded_weight = adjust_scalar_to_fused_array(
+                        param_data, loaded_weight, 0)
 
                 assert param_data.shape == loaded_weight.shape
                 param_data.copy_(loaded_weight)
@@ -667,15 +641,9 @@ class QKVParallelLinear(ColumnParallelLinear):
             shard_index = ["q", "k", "v"].index(loaded_shard_id)
             param_data = param_data.narrow(0, shard_index * shard_size,
                                            shard_size)
-        # If a param_shard_splitter is defined by the LinearMethod, use it.
-        elif param_shard_splitter is not None:
-            logical_widths = getattr(param, "logical_widths", None)
-            param_data, loaded_weight = param_shard_splitter(
-                param_data, loaded_weight, loaded_shard_id, logical_widths)
-
-        # Special case for Fp8 scales.
-        elif fp8_scales_shard_indexer is not None:
-            param_data, loaded_weight = fp8_scales_shard_indexer(
+        # Special case for per-tensor scales in fused case.
+        elif needs_scalar_to_array:
+            param_data, loaded_weight = adjust_scalar_to_fused_array(
                 param_data, loaded_weight, loaded_shard_id)
         else:
             ignore_warning = getattr(param, "ignore_warning", False)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index a451427ec..664eac3f9 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -186,6 +186,9 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
     def __init__(self, quantization_config: CompressedTensorsConfig):
         self.quantization_config = quantization_config
 
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        return layer.scheme.process_weights_after_loading(layer)
+
     def create_weights(self, layer: torch.nn.Module,
                        input_size_per_partition: int,
                        output_partition_sizes: List[int], input_size: int,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
index 3a5904208..119f6cd91 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
@@ -31,3 +31,11 @@ class CompressedTensorsScheme(ABC):
 
         """
         raise NotImplementedError
+
+    @abstractmethod
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """
+        Called after weight loading is complete for any cleanup that
+        needs to occur.
+        """
+        raise NotImplementedError
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py
index 0cfac13d1..f5911bc3d 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py
@@ -18,6 +18,9 @@ class CompressedTensorsUnquantized(CompressedTensorsScheme):
     in a linear transformation.
     """
 
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        pass
+
     def create_weights(self, layer: torch.nn.Module,
                        output_partition_sizes: List[int],
                        input_size_per_partition: int,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
index 607029c81..3c07d6b6f 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
@@ -29,6 +29,9 @@ class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme):
             raise ValueError(
                 "group_size must be given when using strategy group")
 
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        pass
+
     def create_weights(self, layer: torch.nn.Module, input_size: int,
                        output_partition_sizes: List[int],
                        input_size_per_partition: int,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py
index efed79ec7..497790576 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py
@@ -15,70 +15,63 @@ class CompressedTensorsW8A8(CompressedTensorsScheme):
     def __init__(self, strategy: str):
         self.strategy = strategy
 
-    def _shard_id_as_int(self, shard_id: Union[str, int]) -> int:
-        if isinstance(shard_id, int):
-            return shard_id
-
-        assert isinstance(shard_id, str)
-        qkv_idxs = {"q": 0, "k": 1, "v": 2}
-        assert shard_id in qkv_idxs
-        return qkv_idxs[shard_id]
-
-    def scales_shard_splitter(
-            self, param: torch.Tensor, loaded_weight: torch.Tensor,
-            shard_id: Union[str, int],
-            logical_widths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        shard_id = self._shard_id_as_int(shard_id)
-        offset = sum(logical_widths[:shard_id])
-        size = logical_widths[shard_id]
-        # update loaded weight with copies for broadcast.
-        loaded_weight = loaded_weight.repeat(size)
-        return param[offset:offset + size], loaded_weight
+    # Cutlass kernels support only per-tensor and per-channel cases.
+    # So if we have a fused module (QKV, MLP) with per tensor scales (thus N
+    # scales being passed to the kernel), we convert to the per-channel case.
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if (self.strategy == QuantizationStrategy.TENSOR
+                and len(self.logical_widths) > 1):
+
+            # Load the N per-tensor scales into the channelwise buffer.
+            weight_scale_channel = torch.empty(
+                (sum(self.logical_widths), 1),
+                dtype=torch.float32,
+                device=layer.weight_scale.device)
+            start = 0
+            for idx, logical_width in enumerate(self.logical_widths):
+                end = start + logical_width
+                weight_scale_channel[start:end, :] = layer.weight_scale[idx]
+                start = end
+
+            layer.weight_scale = Parameter(weight_scale_channel,
+                                           requires_grad=False)
 
     def create_weights(self, layer: torch.nn.Module,
                        output_partition_sizes: List[int],
                        input_size_per_partition: int,
                        params_dtype: torch.dtype, weight_loader: Callable,
                        **kwargs):
+        self.logical_widths = output_partition_sizes
 
-        is_tensor_partitioned = len(output_partition_sizes) != 1
-        weight_scale_dim = sum(output_partition_sizes) if (
-            is_tensor_partitioned
-            or self.strategy == QuantizationStrategy.CHANNEL) else 1
-
-        shape: Union[Tuple[int], Tuple[int, int]] = (weight_scale_dim, )
+        # WEIGHT SCALE
+        shape: Union[Tuple[int], Tuple[int, int]]
         if self.strategy == QuantizationStrategy.CHANNEL:
-            shape = (weight_scale_dim, 1)
+            shape = (sum(self.logical_widths), 1)
+        else:
+            shape = (len(self.logical_widths), )
 
         weight_scale = Parameter(torch.empty(*shape, dtype=torch.float32),
                                  requires_grad=False)
-
         layer.register_parameter("weight_scale", weight_scale)
-        set_weight_attrs(weight_scale, {"weight_loader": weight_loader})
+        if self.strategy == QuantizationStrategy.CHANNEL:
+            set_weight_attrs(weight_scale, {
+                "weight_loader": weight_loader,
+                "output_dim": 0,
+            })
+        else:
+            set_weight_attrs(weight_scale, {
+                "weight_loader": weight_loader,
+                "needs_scalar_to_array": True,
+            })
 
+        # WEIGHT
         weight = Parameter(torch.empty(sum(output_partition_sizes),
                                        input_size_per_partition,
                                        dtype=torch.int8),
                            requires_grad=False)
-
         layer.register_parameter("weight", weight)
-        set_weight_attrs(
-            weight, {
-                "input_dim": 1,
-                "output_dim": 0,
-                "weight_loader": weight_loader,
-                "logical_widths": output_partition_sizes
-            })
-
-        # Don't need a shard_splitter for channel-wise quantization
-        # Use the default loading method
-        if self.strategy == QuantizationStrategy.CHANNEL:
-            set_weight_attrs(weight_scale, {
-                "output_dim": 0,
-            })
-        else:
-            set_weight_attrs(
-                weight_scale, {
-                    "logical_widths": output_partition_sizes,
-                    "shard_splitter": self.scales_shard_splitter,
-                })
+        set_weight_attrs(weight, {
+            "input_dim": 1,
+            "output_dim": 0,
+            "weight_loader": weight_loader,
+        })
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index 7707ea6ee..224326005 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -29,6 +29,9 @@ class CompressedTensorsWNA16(CompressedTensorsScheme):
             raise ValueError(
                 "group_size must be given when using strategy group")
 
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        pass
+
     def create_weights(self, layer: torch.nn.Module, input_size: int,
                        output_partition_sizes: List[int],
                        input_size_per_partition: int,
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 1c760566c..df6fe4c3d 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Union
 
 import torch
 from torch.nn import Module
@@ -98,7 +98,6 @@ class Fp8LinearMethod(LinearMethodBase):
     """
 
     def __init__(self, quant_config: Fp8Config):
-        self.fused_module_in_checkpoint = False
         self.quant_config = quant_config
         self.cutlass_fp8_supported = cutlass_fp8_supported()
 
@@ -114,12 +113,10 @@ class Fp8LinearMethod(LinearMethodBase):
                           requires_grad=False)
         scale[:] = torch.finfo(torch.float8_e4m3fn).min
         layer.register_parameter(scale_name, scale)
-        set_weight_attrs(
-            scale, {
-                **extra_weight_attrs,
-                "fp8_scales_shard_indexer":
-                self.scales_shard_indexer,
-            })
+        set_weight_attrs(scale, {
+            **extra_weight_attrs,
+            "needs_scalar_to_array": True,
+        })
 
     def create_weights(
         self,
@@ -170,26 +167,6 @@ class Fp8LinearMethod(LinearMethodBase):
                     output_partition_sizes=output_partition_sizes,
                     **extra_weight_attrs)
 
-    def scales_shard_indexer(
-        self, param: torch.Tensor, loaded_weight: torch.Tensor,
-        shard_id: Optional[Union[str,
-                                 int]]) -> Tuple[torch.Tensor, torch.Tensor]:
-        qkv_idxs = {"q": 0, "k": 1, "v": 2}
-
-        if shard_id is None:
-            shard_id = 0
-            self.fused_module_in_checkpoint = True
-        elif isinstance(shard_id, int):
-            pass
-        elif isinstance(shard_id, str):
-            if shard_id not in qkv_idxs:
-                raise ValueError(f"Unknown shard_id: {shard_id}")
-            shard_id = qkv_idxs[shard_id]
-        else:
-            ValueError(f"Shard id must be int or str but got {type(shard_id)}")
-
-        return param[shard_id], loaded_weight
-
     def process_weights_after_loading(self, layer: Module) -> None:
         if (not hasattr(layer, "process_after_load")
                 or not layer.process_after_load):
@@ -212,7 +189,17 @@ class Fp8LinearMethod(LinearMethodBase):
             #   Loop over logical weights, requantizing with single scale.
             max_w_scale = layer.weight_scale.max()
 
-            if not self.fused_module_in_checkpoint:
+            # QKV / MLP is fused in the on disk checkpoint if any of the
+            # weight scales are still set to the default since we initialize
+            # N weight scales for N shards but we only load 1 weight scale
+            # from disk in this case. As a result, we skip dequant -> requant
+            # since we already have quantized QKV together.
+            # Sample Model with fused checkpoint:
+            #   * nm-testing/Phi-3-mini-128k-instruct-FP8
+            unfused_module_in_checkpoint = (
+                layer.weight_scale[-1] > torch.finfo(torch.float8_e4m3fn).min)
+
+            if unfused_module_in_checkpoint:
                 start = 0
                 for idx, logical_width in enumerate(layer.logical_widths):
                     end = start + logical_width
-- 
GitLab


From 614aa5120303ab09be78fb1db669da198cc43b02 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 30 Jun 2024 20:07:34 -0700
Subject: [PATCH 220/376] [misc][cuda] use nvml to avoid accidentally cuda
 initialization (#6007)

---
 tests/kernels/test_cutlass.py                 |  3 +-
 tests/quantization/utils.py                   |  3 +-
 .../ops/blocksparse_attention/interface.py    |  6 +-
 vllm/attention/ops/prefix_prefill.py          |  4 +-
 .../device_communicators/custom_all_reduce.py | 58 ++-----------------
 vllm/lora/punica.py                           |  3 +-
 .../compressed_tensors/compressed_tensors.py  |  3 +-
 .../model_executor/layers/quantization/fp8.py |  4 +-
 .../layers/quantization/gptq_marlin.py        |  3 +-
 .../layers/quantization/utils/marlin_utils.py |  3 +-
 vllm/model_executor/model_loader/loader.py    |  4 +-
 vllm/utils.py                                 | 57 ++++++++++++++++++
 vllm/worker/worker.py                         |  3 +-
 13 files changed, 86 insertions(+), 68 deletions(-)

diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py
index d8e6d27b8..9cc18a0ea 100644
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -8,12 +8,13 @@ import pytest
 import torch
 
 from vllm import _custom_ops as ops
+from vllm.utils import get_device_capability_stateless
 
 CUDA_DEVICES = [
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ]
 
-capability = torch.cuda.get_device_capability()
+capability = get_device_capability_stateless()
 capability = capability[0] * 10 + capability[1]
 
 
diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py
index 29085916a..5c1b5ad9b 100644
--- a/tests/quantization/utils.py
+++ b/tests/quantization/utils.py
@@ -1,6 +1,7 @@
 import torch
 
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.utils import get_device_capability_stateless
 
 
 def is_quant_method_supported(quant_method: str) -> bool:
@@ -8,7 +9,7 @@ def is_quant_method_supported(quant_method: str) -> bool:
     if not torch.cuda.is_available():
         return False
 
-    capability = torch.cuda.get_device_capability()
+    capability = get_device_capability_stateless()
     capability = capability[0] * 10 + capability[1]
     return (capability >=
             QUANTIZATION_METHODS[quant_method].get_min_capability())
diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py
index 300211e70..637cfda21 100644
--- a/vllm/attention/ops/blocksparse_attention/interface.py
+++ b/vllm/attention/ops/blocksparse_attention/interface.py
@@ -2,13 +2,13 @@ import math
 
 import torch
 
-from vllm.utils import is_cpu, is_hip
+from vllm.utils import get_device_capability_stateless, is_cpu, is_hip
 
 from .utils import (dense_to_crow_col, get_head_sliding_step,
                     get_sparse_attn_mask)
 
 IS_COMPUTE_8_OR_ABOVE = (torch.cuda.is_available()
-                         and torch.cuda.get_device_capability()[0] >= 8)
+                         and get_device_capability_stateless()[0] >= 8)
 
 if IS_COMPUTE_8_OR_ABOVE:
     from .blocksparse_attention_kernel import blocksparse_flash_attn_varlen_fwd
@@ -235,4 +235,4 @@ class LocalStridedBlockSparseAttn(torch.nn.Module):
                                 v,
                                 cu_seqlens_k,
                                 cu_seqlens_q=cu_seqlens_q,
-                                sm_scale=sm_scale)
\ No newline at end of file
+                                sm_scale=sm_scale)
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index b99cf9a50..ca9f28fcb 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -5,6 +5,8 @@ import torch
 import triton
 import triton.language as tl
 
+from vllm.utils import get_device_capability_stateless
+
 if triton.__version__ >= "2.1.0":
 
     @triton.jit
@@ -683,7 +685,7 @@ if triton.__version__ >= "2.1.0":
                               alibi_slopes=None,
                               sliding_window=None):
 
-        cap = torch.cuda.get_device_capability()
+        cap = get_device_capability_stateless()
         BLOCK = 128 if cap[0] >= 8 else 64
         # shape constraints
         Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index b0cb21a02..a303d0bd2 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -11,66 +11,18 @@ from vllm.distributed.device_communicators.custom_all_reduce_utils import (
     gpu_p2p_access_check)
 from vllm.distributed.parallel_state import is_in_the_same_node
 from vllm.logger import init_logger
-from vllm.utils import cuda_device_count_stateless
+from vllm.utils import cuda_device_count_stateless, is_full_nvlink
 
 try:
-    import pynvml
-
-    # Simulate ImportError if custom_ar ops are not supported.
-    if not ops.is_custom_op_supported("_C_custom_ar::meta_size"):
-        raise ImportError("custom_ar", __file__)
-
+    assert ops.is_custom_op_supported("_C_custom_ar::meta_size")
     custom_ar = True
-
-    @contextmanager
-    def _nvml():
-        try:
-            pynvml.nvmlInit()
-            yield
-        finally:
-            pynvml.nvmlShutdown()
-
-except ImportError:
-    # For AMD GPUs
+except Exception:
+    # For AMD GPUs and CPUs
     custom_ar = False
-    pynvml = None
-
-    @contextmanager
-    def _nvml():
-        try:
-            yield
-        finally:
-            pass
-
 
 logger = init_logger(__name__)
 
 
-@_nvml()
-def _is_full_nvlink(device_ids: List[int]) -> bool:
-    """
-    query if the set of gpus are fully connected by nvlink (1 hop)
-    Note that `pynvml` is not affected by `CUDA_VISIBLE_DEVICES`,
-    so it works on real physical device ids.
-    """
-    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in device_ids]
-    for i, handle in enumerate(handles):
-        for j, peer_handle in enumerate(handles):
-            if i < j:
-                try:
-                    p2p_status = pynvml.nvmlDeviceGetP2PStatus(
-                        handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
-                    if p2p_status != pynvml.NVML_P2P_STATUS_OK:
-                        return False
-                except pynvml.NVMLError as error:
-                    logger.error(
-                        "NVLink detection failed. This is normal if your"
-                        " machine has no NVLink equipped.",
-                        exc_info=error)
-                    return False
-    return True
-
-
 def _can_p2p(rank: int, world_size: int) -> bool:
     for i in range(world_size):
         if i == rank:
@@ -161,7 +113,7 @@ class CustomAllreduce:
         # test nvlink first, this will filter out most of the cases
         # where custom allreduce is not supported
         # this checks hardware and driver support for NVLink
-        full_nvlink = _is_full_nvlink(physical_device_ids)
+        full_nvlink = is_full_nvlink(physical_device_ids)
         if world_size > 2 and not full_nvlink:
             logger.warning(
                 "Custom allreduce is disabled because it's not supported on"
diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py
index 7ecaa450f..f30b2c13f 100644
--- a/vllm/lora/punica.py
+++ b/vllm/lora/punica.py
@@ -5,13 +5,14 @@ from typing import Optional
 import torch
 
 from vllm import _custom_ops as ops
+from vllm.utils import get_device_capability_stateless
 
 
 def _check_punica_support():
     if ops.is_custom_op_supported("_punica_C::dispatch_bgmv"):
         return
 
-    if torch.cuda.get_device_capability() < (8, 0):
+    if get_device_capability_stateless() < (8, 0):
         raise ImportError(
             "punica LoRA kernels require compute capability >= 8.0")
     else:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 664eac3f9..491396c3d 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -14,6 +14,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     CompressionFormat, QuantizationArgs, QuantizationStrategy,
     find_first_name_or_class_match)
+from vllm.utils import get_device_capability_stateless
 
 
 class CompressedTensorsConfig(QuantizationConfig):
@@ -84,7 +85,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         return []
 
     def _check_gptq_and_marlin_can_run(self):
-        capability = torch.cuda.get_device_capability()
+        capability = get_device_capability_stateless()
         capability = capability[0] * 10 + capability[1]
         if capability < 80:
             raise RuntimeError("The quantization config is not supported for ",
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index df6fe4c3d..5d503a221 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -10,7 +10,7 @@ from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.utils import print_warning_once
+from vllm.utils import get_device_capability_stateless, print_warning_once
 
 ACTIVATION_SCHEMES = ["static", "dynamic"]
 
@@ -18,7 +18,7 @@ logger = init_logger(__name__)
 
 
 def cutlass_fp8_supported() -> bool:
-    capability = torch.cuda.get_device_capability()
+    capability = get_device_capability_stateless()
     capability = capability[0] * 10 + capability[1]
 
     return ops.cutlass_scaled_mm_supports_fp8(capability)
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 599070f15..c6e9279c8 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -11,6 +11,7 @@ from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
+from vllm.utils import get_device_capability_stateless
 
 logger = init_logger(__name__)
 
@@ -165,7 +166,7 @@ class GPTQMarlinConfig(QuantizationConfig):
             return False
 
         # If the capability of the device is too low, cannot convert.
-        major, minor = torch.cuda.get_device_capability()
+        major, minor = get_device_capability_stateless()
         device_capability = major * 10 + minor
         if device_capability < cls.get_min_capability():
             return False
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index 0d027d062..6f4aa2d77 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -12,8 +12,9 @@ from vllm.model_executor.layers.quantization.utils.marlin_perms import (
     marlin_perm, marlin_scale_perm, marlin_scale_perm_single)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     get_pack_factor, quantize_weights, sort_weights)
+from vllm.utils import get_device_capability_stateless
 
-__cuda_arch = torch.cuda.get_device_capability()
+__cuda_arch = get_device_capability_stateless()
 
 MARLIN_TILE = 16
 
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index e91bf7cf3..361dc7322 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -35,7 +35,7 @@ from vllm.model_executor.model_loader.weight_utils import (
 from vllm.model_executor.models.interfaces import (supports_lora,
                                                    supports_vision)
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.utils import is_tpu
+from vllm.utils import get_device_capability_stateless, is_tpu
 
 logger = init_logger(__name__)
 
@@ -46,7 +46,7 @@ def _get_quantization_config(
     """Get the quantization config."""
     if model_config.quantization is not None:
         quant_config = get_quant_config(model_config, load_config)
-        capability = torch.cuda.get_device_capability()
+        capability = get_device_capability_stateless()
         capability = capability[0] * 10 + capability[1]
         if capability < quant_config.get_min_capability():
             raise ValueError(
diff --git a/vllm/utils.py b/vllm/utils.py
index 6e8d4624c..1977bc05d 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -816,6 +816,63 @@ def cuda_device_count_stateless() -> int:
     return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
 
 
+# NVML utils
+# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
+# all the related functions work on real physical device ids.
+# the major benefit of using NVML is that it will not initialize CUDA
+
+try:
+    import pynvml
+except ImportError:
+    # For non-NV devices
+    pynvml = None
+
+
+def with_nvml_context(fn):
+
+    @wraps(fn)
+    def wrapper(*args, **kwargs):
+        if pynvml is not None:
+            pynvml.nvmlInit()
+        try:
+            return fn(*args, **kwargs)
+        finally:
+            if pynvml is not None:
+                pynvml.nvmlShutdown()
+
+    return wrapper
+
+
+@with_nvml_context
+def is_full_nvlink(device_ids: List[int]) -> bool:
+    """
+    query if the set of gpus are fully connected by nvlink (1 hop)
+    """
+    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in device_ids]
+    for i, handle in enumerate(handles):
+        for j, peer_handle in enumerate(handles):
+            if i < j:
+                try:
+                    p2p_status = pynvml.nvmlDeviceGetP2PStatus(
+                        handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
+                    if p2p_status != pynvml.NVML_P2P_STATUS_OK:
+                        return False
+                except pynvml.NVMLError as error:
+                    logger.error(
+                        "NVLink detection failed. This is normal if your"
+                        " machine has no NVLink equipped.",
+                        exc_info=error)
+                    return False
+    return True
+
+
+@lru_cache(maxsize=8)
+@with_nvml_context
+def get_device_capability_stateless(device_id: int = 0) -> Tuple[int, int]:
+    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
+    return pynvml.nvmlDeviceGetCudaComputeCapability(handle)
+
+
 #From: https://stackoverflow.com/a/4104188/2749989
 def run_once(f):
 
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 156d5278a..cc27d06b5 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -16,6 +16,7 @@ from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 from vllm.sequence import ExecuteModelRequest
+from vllm.utils import get_device_capability_stateless
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.embedding_model_runner import EmbeddingModelRunner
 from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner
@@ -322,7 +323,7 @@ def init_worker_distributed_environment(
 def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
     # Check if the GPU supports the dtype.
     if torch_dtype == torch.bfloat16:
-        compute_capability = torch.cuda.get_device_capability()
+        compute_capability = get_device_capability_stateless()
         if compute_capability[0] < 8:
             gpu_name = torch.cuda.get_device_name()
             raise ValueError(
-- 
GitLab


From 80ca1e6a3a28a0373dc00c5b4fe956c16de952fa Mon Sep 17 00:00:00 2001
From: sroy745 <142070531+sroy745@users.noreply.github.com>
Date: Mon, 1 Jul 2024 00:33:05 -0700
Subject: [PATCH 221/376] [Speculative Decoding 2/2 ] Integrate typical
 acceptance sampler into Spec Decode Worker (#5348)

---
 .../test_typical_acceptance_sampler.py        |  96 +++++++----
 .../e2e/test_multistep_correctness.py         |  54 +++++-
 tests/spec_decode/test_dynamic_spec_decode.py |  12 +-
 tests/spec_decode/test_metrics.py             |  94 +++++------
 tests/spec_decode/test_spec_decode_worker.py  | 154 ++++++++++--------
 tests/spec_decode/test_utils.py               |  22 +++
 vllm/config.py                                |  75 ++++++++-
 vllm/engine/arg_utils.py                      |  42 ++++-
 vllm/engine/metrics.py                        |   2 +-
 .../layers/rejection_sampler.py               |  18 +-
 .../layers/spec_decode_base_sampler.py        |  15 +-
 .../layers/typical_acceptance_sampler.py      |  22 ++-
 vllm/spec_decode/metrics.py                   |  24 +--
 vllm/spec_decode/spec_decode_worker.py        |  62 ++++---
 14 files changed, 482 insertions(+), 210 deletions(-)

diff --git a/tests/samplers/test_typical_acceptance_sampler.py b/tests/samplers/test_typical_acceptance_sampler.py
index 87cf37bc9..4f6290795 100644
--- a/tests/samplers/test_typical_acceptance_sampler.py
+++ b/tests/samplers/test_typical_acceptance_sampler.py
@@ -52,6 +52,19 @@ def get_draft_token_ids(batch_size: int, k: int, vocab_size: int,
     return draft_token_ids
 
 
+def get_acceptance_sampler(
+    posterior_threshold: float = 0.03,
+    posterior_alpha: float = 0.9,
+    disable_bonus_tokens: bool = False,
+    strict_mode: bool = False,
+) -> TypicalAcceptanceSampler:
+    """
+    Initializes and returns a TypicalAcceptanceSampler.
+    """
+    return TypicalAcceptanceSampler(posterior_threshold, posterior_alpha,
+                                    disable_bonus_tokens, strict_mode)
+
+
 @pytest.mark.parametrize("k", list(range(1, 6)))
 @pytest.mark.parametrize("vocab_size", [30_000, 50_000])
 @pytest.mark.parametrize("batch_size", list(range(1, 32)))
@@ -64,7 +77,7 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
     different combinations of k, vocab_size, batch_size and num devices.
     """
     torch.set_default_device(device)
-    typical_acceptance_sampler = TypicalAcceptanceSampler()
+    typical_acceptance_sampler = get_acceptance_sampler()
     typical_acceptance_sampler.init_gpu_tensors(rank=0)
     target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
     bonus_token_ids = torch.randint(low=0,
@@ -76,7 +89,10 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
                                     size=(batch_size, k),
                                     dtype=torch.int64)
     # Verify that sampling succeeds for all cases.
-    typical_acceptance_sampler(target_probs, bonus_token_ids, draft_token_ids)
+    typical_acceptance_sampler(target_probs,
+                               bonus_token_ids,
+                               draft_probs=None,
+                               draft_token_ids=draft_token_ids)
 
 
 @pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"])
@@ -94,7 +110,7 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
     batch_size = 5
     vocab_size = 30_000
     torch.set_default_device(device)
-    typical_acceptance_sampler = TypicalAcceptanceSampler(strict_mode=True)
+    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
     typical_acceptance_sampler.init_gpu_tensors(rank=0)
     target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
     bonus_token_ids = torch.randint(low=0,
@@ -125,8 +141,10 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
     oob_token_ids[0][0] = rogue_token_id
 
     with pytest.raises(AssertionError):
-        typical_acceptance_sampler(target_probs, bonus_token_ids,
-                                   draft_token_ids)
+        typical_acceptance_sampler(target_probs,
+                                   bonus_token_ids,
+                                   draft_probs=None,
+                                   draft_token_ids=draft_token_ids)
 
 
 @pytest.mark.parametrize("seed", list(range(10)))
@@ -151,7 +169,7 @@ def test_uniform_target_distribution_accepts_all_tokens(
     batch_size = 5
     vocab_size = 30_000
     torch.set_default_device(device)
-    typical_acceptance_sampler = TypicalAcceptanceSampler(
+    typical_acceptance_sampler = get_acceptance_sampler(
         strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
     typical_acceptance_sampler.init_gpu_tensors(rank=0)
     target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
@@ -163,9 +181,11 @@ def test_uniform_target_distribution_accepts_all_tokens(
                                     high=vocab_size,
                                     size=(batch_size, 1),
                                     dtype=torch.int64)
-    output_token_ids = typical_acceptance_sampler(target_probs,
-                                                  bonus_token_ids,
-                                                  draft_token_ids)
+    output_token_ids = typical_acceptance_sampler(
+        target_probs,
+        bonus_token_ids,
+        draft_probs=None,
+        draft_token_ids=draft_token_ids)
     # We are using a uniform target probability distribution.
     # For a uniform distribution the entropy is very high and it
     # should lead to all draft tokens being accepted. Verify that.
@@ -203,7 +223,7 @@ def test_temperature_zero_target_distribution(seed: int,
     vocab_size = 30_000
     torch.set_default_device(device)
 
-    typical_acceptance_sampler = TypicalAcceptanceSampler(
+    typical_acceptance_sampler = get_acceptance_sampler(
         strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
     typical_acceptance_sampler.init_gpu_tensors(rank=0)
     # Simulate temperature 0 probability distribution for target probabilities
@@ -224,9 +244,11 @@ def test_temperature_zero_target_distribution(seed: int,
     # 1.0 tokens in the target distribution we will reject all of them and
     # fallback to the greedy sampling for selecting 1 token for each sequence.
     # Verify the same.
-    output_token_ids = typical_acceptance_sampler(target_probs,
-                                                  bonus_token_ids,
-                                                  draft_token_ids)
+    output_token_ids = typical_acceptance_sampler(
+        target_probs,
+        bonus_token_ids,
+        draft_probs=None,
+        draft_token_ids=draft_token_ids)
     assert output_token_ids.shape[0] == batch_size
     assert output_token_ids.shape[1] == (k + 1)
     assert torch.all(output_token_ids[:, -1] == -1)
@@ -261,7 +283,7 @@ def test_mixed_target_distribution(seed: int, disable_bonus_tokens: bool,
     batch_size = 4
     vocab_size = 30_000
     torch.set_default_device(device)
-    typical_acceptance_sampler = TypicalAcceptanceSampler(
+    typical_acceptance_sampler = get_acceptance_sampler(
         strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
     typical_acceptance_sampler.init_gpu_tensors(rank=0)
     # For sequences 0 and 2 set the distribution to a temperature
@@ -277,9 +299,11 @@ def test_mixed_target_distribution(seed: int, disable_bonus_tokens: bool,
                                     high=vocab_size,
                                     size=(batch_size, 1),
                                     dtype=torch.int64)
-    output_token_ids = typical_acceptance_sampler(target_probs,
-                                                  bonus_token_ids,
-                                                  draft_token_ids)
+    output_token_ids = typical_acceptance_sampler(
+        target_probs,
+        bonus_token_ids,
+        draft_probs=None,
+        draft_token_ids=draft_token_ids)
     # verify the shape of output_token_ids
     assert output_token_ids.shape[0] == batch_size
     assert output_token_ids.shape[1] == (k + 1)
@@ -326,7 +350,7 @@ def test_accept_tokens_partially(seed: int, disable_bonus_tokens: bool,
     batch_size = 1
     vocab_size = 30_000
     torch.set_default_device(device)
-    typical_acceptance_sampler = TypicalAcceptanceSampler(
+    typical_acceptance_sampler = get_acceptance_sampler(
         strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
     typical_acceptance_sampler.init_gpu_tensors(rank=0)
     # Create a temperature zero target probability distribution and ensure
@@ -339,9 +363,11 @@ def test_accept_tokens_partially(seed: int, disable_bonus_tokens: bool,
                                     high=vocab_size,
                                     size=(batch_size, 1),
                                     dtype=torch.int64)
-    output_token_ids = typical_acceptance_sampler(target_probs,
-                                                  bonus_token_ids,
-                                                  draft_token_ids)
+    output_token_ids = typical_acceptance_sampler(
+        target_probs,
+        bonus_token_ids,
+        draft_probs=None,
+        draft_token_ids=draft_token_ids)
     assert output_token_ids.shape[0] == batch_size
     assert output_token_ids.shape[1] == (k + 1)
     assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids)
@@ -357,9 +383,11 @@ def test_accept_tokens_partially(seed: int, disable_bonus_tokens: bool,
         batch_size, k, vocab_size, zero_temperature_token_ids)
     draft_token_ids = torch.cat(
         (draft_token_ids[:, :2], draft_token_ids_to_replace[:, -3:]), dim=1)
-    output_token_ids = typical_acceptance_sampler(target_probs,
-                                                  bonus_token_ids,
-                                                  draft_token_ids)
+    output_token_ids = typical_acceptance_sampler(
+        target_probs,
+        bonus_token_ids,
+        draft_probs=None,
+        draft_token_ids=draft_token_ids)
     assert output_token_ids.shape[0] == batch_size
     assert output_token_ids.shape[1] == (k + 1)
     assert torch.all(output_token_ids[:, :2] == draft_token_ids[:, :2])
@@ -384,7 +412,7 @@ def test_accept_tokens_set_non_default_posteriors(seed: int,
     batch_size = 1
     vocab_size = 30_000
     torch.set_default_device(device)
-    typical_acceptance_sampler = TypicalAcceptanceSampler(
+    typical_acceptance_sampler = get_acceptance_sampler(
         strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
     typical_acceptance_sampler.init_gpu_tensors(rank=0)
     # Simulate temperature 0 probability distribution for target
@@ -402,9 +430,11 @@ def test_accept_tokens_set_non_default_posteriors(seed: int,
                                     high=vocab_size,
                                     size=(batch_size, 1),
                                     dtype=torch.int64)
-    output_token_ids = typical_acceptance_sampler(target_probs,
-                                                  bonus_token_ids,
-                                                  draft_token_ids)
+    output_token_ids = typical_acceptance_sampler(
+        target_probs,
+        bonus_token_ids,
+        draft_probs=None,
+        draft_token_ids=draft_token_ids)
     assert output_token_ids.shape[0] == batch_size
     assert output_token_ids.shape[1] == (k + 1)
     assert torch.all(output_token_ids[:, 1:-1] == -1)
@@ -418,9 +448,11 @@ def test_accept_tokens_set_non_default_posteriors(seed: int,
         posterior_threshold=0.0,
         posterior_alpha=0.0)
     typical_acceptance_sampler.init_gpu_tensors(rank=0)
-    output_token_ids = typical_acceptance_sampler(target_probs,
-                                                  bonus_token_ids,
-                                                  draft_token_ids)
+    output_token_ids = typical_acceptance_sampler(
+        target_probs,
+        bonus_token_ids,
+        draft_probs=None,
+        draft_token_ids=draft_token_ids)
     assert output_token_ids.shape[0] == batch_size
     assert output_token_ids.shape[1] == (k + 1)
     assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids)
@@ -451,7 +483,7 @@ def test_replacement_token_ids(seed: int, disable_bonus_tokens: bool,
     batch_size = 5
     vocab_size = 30_000
     torch.set_default_device(device)
-    typical_acceptance_sampler = TypicalAcceptanceSampler(
+    typical_acceptance_sampler = get_acceptance_sampler(
         strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
     typical_acceptance_sampler.init_gpu_tensors(rank=0)
     target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py
index 94d71fb01..94cc36f22 100644
--- a/tests/spec_decode/e2e/test_multistep_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
@@ -11,9 +11,15 @@ distribution matches the target model's output distribution (up to hardware
 numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
 equality. This gives us good coverage of temp=0.
 
+At temp=0, the TypicalAcceptanceSampler ensures that only the tokens with the
+highest probability in the target distribution are accepted. Therefore, we can 
+expect greedy equality for the TypicalAcceptanceSampler at temp=0.
+
 For temp>0, we rely on unit tests on the rejection sampler to verify that the
 output distribution is the same with spec decode vs. no spec decode (this would
-be prohibitively expensive to run with a real model).
+be prohibitively expensive to run with a real model). Similarly, for the
+TypicalAcceptance sampler also, we rely on unit tests to validate temp>0
+test cases.
 
 NOTE: Speculative decoding's distribution equality requires that the measured
 distributions of the target model and proposal model be deterministic given the
@@ -611,3 +617,49 @@ def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int,
                                          batch_size,
                                          max_output_len=output_len,
                                          force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-160m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize(
+    "test_llm_kwargs",
+    [
+        {
+            "speculative_model": "JackFram/llama-68m",
+            "num_speculative_tokens": k,
+            "spec_decoding_acceptance_method": "typical_acceptance_sampler"
+        }
+        # Try a range of common k.
+        for k in [1, 2, 3]
+    ])
+@pytest.mark.parametrize("batch_size", [1, 32])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_typical_acceptance_sampling(baseline_llm_generator,
+                                     test_llm_generator, batch_size: int,
+                                     output_len: int):
+    """Verify that speculative decoding produces exact equality to without spec
+    decode with TypicalAcceptanceSampler as the draft token acceptance
+    sampling method.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
diff --git a/tests/spec_decode/test_dynamic_spec_decode.py b/tests/spec_decode/test_dynamic_spec_decode.py
index bb6d1c23a..29ed96999 100644
--- a/tests/spec_decode/test_dynamic_spec_decode.py
+++ b/tests/spec_decode/test_dynamic_spec_decode.py
@@ -3,33 +3,35 @@ from unittest.mock import MagicMock, patch
 import pytest
 import torch
 
-from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.sequence import ExecuteModelRequest
 from vllm.spec_decode.metrics import AsyncMetricsCollector
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker
 from vllm.spec_decode.top1_proposer import Top1Proposer
 
+from .test_utils import mock_spec_decode_sampler
 from .utils import create_batch, mock_worker
 
 
 @pytest.mark.parametrize('queue_size', [4])
 @pytest.mark.parametrize('batch_size', [1])
 @pytest.mark.parametrize('k', [1])
+@pytest.mark.parametrize("acceptance_sampler_method",
+                         ["rejection_sampler", "typical_acceptance_sampler"])
 @torch.inference_mode()
-def test_disable_spec_tokens(queue_size: int, batch_size: int, k: int):
+def test_disable_spec_tokens(queue_size: int, batch_size: int, k: int,
+                             acceptance_sampler_method: str):
     """Verify that speculative tokens are disabled when the batch size
     exceeds the threshold.
     """
     disable_by_batch_size = 3
-
     draft_worker = mock_worker(cls=MultiStepWorker)
     target_worker = mock_worker()
-    rejection_sampler = MagicMock(spec=RejectionSampler)
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
     worker = SpecDecodeWorker(proposer_worker=draft_worker,
                               scorer_worker=target_worker,
-                              rejection_sampler=rejection_sampler,
+                              spec_decode_sampler=mock_spec_decode_sampler(
+                                  acceptance_sampler_method),
                               metrics_collector=metrics_collector,
                               disable_by_batch_size=disable_by_batch_size)
 
diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py
index 312878804..2918fabdd 100644
--- a/tests/spec_decode/test_metrics.py
+++ b/tests/spec_decode/test_metrics.py
@@ -10,16 +10,16 @@ from vllm.spec_decode.metrics import AsyncMetricsCollector
 def test_initial_call_returns_none():
     """Expect first call to get metrics to return None.
     """
-    rej_sampler = MagicMock()
-    rej_sampler.num_accepted_tokens = torch.tensor(0,
-                                                   dtype=torch.long,
-                                                   device='cuda')
-    rej_sampler.num_emitted_tokens = torch.tensor(0,
-                                                  dtype=torch.long,
-                                                  device='cuda')
-    rej_sampler.num_draft_tokens = 0
-
-    collector = AsyncMetricsCollector(rej_sampler)
+    spec_decode_sampler = MagicMock()
+    spec_decode_sampler.num_accepted_tokens = torch.tensor(0,
+                                                           dtype=torch.long,
+                                                           device='cuda')
+    spec_decode_sampler.num_emitted_tokens = torch.tensor(0,
+                                                          dtype=torch.long,
+                                                          device='cuda')
+    spec_decode_sampler.num_draft_tokens = 0
+
+    collector = AsyncMetricsCollector(spec_decode_sampler)
     collector.init_gpu_tensors(rank=0)
     maybe_metrics = collector.maybe_collect_rejsample_metrics(k=5)
     assert maybe_metrics is None
@@ -28,14 +28,14 @@ def test_initial_call_returns_none():
 def test_second_call_returns_metrics():
     """Expect second call to not return None.
     """
-    rej_sampler = MagicMock()
-    rej_sampler.num_accepted_tokens = torch.tensor(0,
-                                                   dtype=torch.long,
-                                                   device='cuda')
-    rej_sampler.num_emitted_tokens = torch.tensor(0,
-                                                  dtype=torch.long,
-                                                  device='cuda')
-    rej_sampler.num_draft_tokens = 0
+    spec_decode_sampler = MagicMock()
+    spec_decode_sampler.num_accepted_tokens = torch.tensor(0,
+                                                           dtype=torch.long,
+                                                           device='cuda')
+    spec_decode_sampler.num_emitted_tokens = torch.tensor(0,
+                                                          dtype=torch.long,
+                                                          device='cuda')
+    spec_decode_sampler.num_draft_tokens = 0
 
     collect_interval_s = 5.0
     timer = MagicMock()
@@ -43,7 +43,7 @@ def test_second_call_returns_metrics():
         0.0, collect_interval_s + 0.1, collect_interval_s + 0.2
     ]
 
-    collector = AsyncMetricsCollector(rejection_sampler=rej_sampler,
+    collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler,
                                       timer=timer,
                                       collect_interval_s=collect_interval_s)
     collector.init_gpu_tensors(rank=0)
@@ -56,16 +56,16 @@ def test_second_call_returns_metrics():
 def test_nonzero_rank_noop(rank):
     """Verify nonzero ranks don't collect metrics.
     """
-    rej_sampler = MagicMock()
-    rej_sampler.num_accepted_tokens = torch.tensor(0,
-                                                   dtype=torch.long,
-                                                   device='cuda')
-    rej_sampler.num_emitted_tokens = torch.tensor(0,
-                                                  dtype=torch.long,
-                                                  device='cuda')
-    rej_sampler.num_draft_tokens = 0
-
-    collector = AsyncMetricsCollector(rej_sampler)
+    spec_decode_sampler = MagicMock()
+    spec_decode_sampler.num_accepted_tokens = torch.tensor(0,
+                                                           dtype=torch.long,
+                                                           device='cuda')
+    spec_decode_sampler.num_emitted_tokens = torch.tensor(0,
+                                                          dtype=torch.long,
+                                                          device='cuda')
+    spec_decode_sampler.num_draft_tokens = 0
+
+    collector = AsyncMetricsCollector(spec_decode_sampler)
     collector.init_gpu_tensors(rank=rank)
     _ = collector.maybe_collect_rejsample_metrics(k=5)
     metrics = collector.maybe_collect_rejsample_metrics(k=5)
@@ -75,14 +75,14 @@ def test_nonzero_rank_noop(rank):
 def test_noop_until_time():
     """Verify metrics aren't collected until enough time passes.
     """
-    rej_sampler = MagicMock()
-    rej_sampler.num_accepted_tokens = torch.tensor(0,
-                                                   dtype=torch.long,
-                                                   device='cuda')
-    rej_sampler.num_emitted_tokens = torch.tensor(0,
-                                                  dtype=torch.long,
-                                                  device='cuda')
-    rej_sampler.num_draft_tokens = 0
+    spec_decode_sampler = MagicMock()
+    spec_decode_sampler.num_accepted_tokens = torch.tensor(0,
+                                                           dtype=torch.long,
+                                                           device='cuda')
+    spec_decode_sampler.num_emitted_tokens = torch.tensor(0,
+                                                          dtype=torch.long,
+                                                          device='cuda')
+    spec_decode_sampler.num_draft_tokens = 0
 
     collect_interval_s = 5.0
     timer = MagicMock()
@@ -91,7 +91,7 @@ def test_noop_until_time():
         collect_interval_s + 0.1, collect_interval_s + 0.1
     ]
 
-    collector = AsyncMetricsCollector(rejection_sampler=rej_sampler,
+    collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler,
                                       timer=timer,
                                       collect_interval_s=collect_interval_s)
     collector.init_gpu_tensors(rank=0)
@@ -122,14 +122,14 @@ def test_initial_metrics_has_correct_values(has_data: bool):
     max_num_emitted_tokens = AsyncMetricsCollector.get_max_num_emitted_tokens(
         num_draft_tokens, k)
 
-    rej_sampler = MagicMock()
-    rej_sampler.num_accepted_tokens = torch.tensor(num_accepted_tokens,
-                                                   dtype=torch.long,
-                                                   device='cuda')
-    rej_sampler.num_emitted_tokens = torch.tensor(num_emitted_tokens,
-                                                  dtype=torch.long,
-                                                  device='cuda')
-    rej_sampler.num_draft_tokens = num_draft_tokens
+    spec_decode_sampler = MagicMock()
+    spec_decode_sampler.num_accepted_tokens = torch.tensor(num_accepted_tokens,
+                                                           dtype=torch.long,
+                                                           device='cuda')
+    spec_decode_sampler.num_emitted_tokens = torch.tensor(num_emitted_tokens,
+                                                          dtype=torch.long,
+                                                          device='cuda')
+    spec_decode_sampler.num_draft_tokens = num_draft_tokens
 
     collect_interval_s = 5.0
     timer = MagicMock()
@@ -137,7 +137,7 @@ def test_initial_metrics_has_correct_values(has_data: bool):
         0.0, collect_interval_s + 0.1, collect_interval_s + 0.2
     ]
 
-    collector = AsyncMetricsCollector(rejection_sampler=rej_sampler,
+    collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler,
                                       timer=timer,
                                       collect_interval_s=collect_interval_s)
     collector.init_gpu_tensors(rank=0)
diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py
index a20c793c9..527e7eddd 100644
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -6,7 +6,6 @@ from unittest.mock import MagicMock
 import pytest
 import torch
 
-from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import ExecuteModelRequest, SamplerOutput, SequenceOutput
 from vllm.spec_decode.interfaces import SpeculativeProposals
@@ -16,23 +15,26 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker,
                                                  split_num_cache_blocks_evenly)
 
+from .test_utils import mock_spec_decode_sampler
 from .utils import create_batch, create_sampler_output_list, mock_worker
 
 
 @pytest.mark.parametrize('k', [1, 2, 6])
 @pytest.mark.parametrize('batch_size', [1, 2, 32])
+@pytest.mark.parametrize("acceptance_sampler_method",
+                         ["rejection_sampler", "typical_acceptance_sampler"])
 @torch.inference_mode()
-def test_correctly_calls_draft_model(k: int, batch_size: int):
+def test_correctly_calls_draft_model(k: int, batch_size: int,
+                                     acceptance_sampler_method: str):
     """Verify SpecDecodeWorker calls the draft worker with correct
     inputs. Everything else is mocked out.
     """
     draft_worker = mock_worker(cls=MultiStepWorker)
     target_worker = mock_worker()
-    rejection_sampler = MagicMock(spec=RejectionSampler)
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
-    worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler,
-                              metrics_collector)
-
+    worker = SpecDecodeWorker(
+        draft_worker, target_worker,
+        mock_spec_decode_sampler(acceptance_sampler_method), metrics_collector)
     exception_secret = 'artificial stop'
     draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret)
 
@@ -53,15 +55,16 @@ def test_correctly_calls_draft_model(k: int, batch_size: int):
 
 @pytest.mark.parametrize('k', [1, 2, 6])
 @pytest.mark.parametrize('batch_size', [1, 2, 32])
+@pytest.mark.parametrize("acceptance_sampler_method",
+                         ["rejection_sampler", "typical_acceptance_sampler"])
 @torch.inference_mode()
-def test_correctly_calls_target_model(k: int, batch_size: int):
+def test_correctly_calls_target_model(k: int, batch_size: int,
+                                      acceptance_sampler_method: str):
     """Verify SpecDecodeWorker calls the target model with correct
     inputs. Everything else is mocked out.
     """
     draft_worker = mock_worker(cls=MultiStepWorker, use_spec=False)
     target_worker = mock_worker(use_spec=False)
-    rejection_sampler = MagicMock(spec=RejectionSampler)
-    rejection_sampler.token_id_dtype = torch.int64
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
 
     draft_worker.device = 'cuda'
@@ -69,8 +72,9 @@ def test_correctly_calls_target_model(k: int, batch_size: int):
 
     set_random_seed(1)
 
-    worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler,
-                              metrics_collector)
+    worker = SpecDecodeWorker(
+        draft_worker, target_worker,
+        mock_spec_decode_sampler(acceptance_sampler_method), metrics_collector)
     worker.init_device()
 
     vocab_size = 32_000
@@ -133,8 +137,11 @@ def test_correctly_calls_target_model(k: int, batch_size: int):
 
 @pytest.mark.parametrize('k', [1, 2, 6])
 @pytest.mark.parametrize('batch_size', [1, 2, 32])
+@pytest.mark.parametrize("acceptance_sampler_method",
+                         ["rejection_sampler", "typical_acceptance_sampler"])
 @torch.inference_mode()
-def test_correctly_calls_rejection_sampler(k: int, batch_size: int):
+def test_correctly_calls_spec_decode_sampler(k: int, batch_size: int,
+                                             acceptance_sampler_method: str):
     """Verify SpecDecodeWorker calls the rejection sampler with
     correct inputs. Everything else is mocked out.
     """
@@ -144,15 +151,14 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int):
                                vocab_size=vocab_size,
                                use_spec=False)
     target_worker = mock_worker(vocab_size=vocab_size, use_spec=False)
-    rejection_sampler = MagicMock(spec=RejectionSampler)
-    rejection_sampler.token_id_dtype = torch.int64
+    spec_decode_sampler = mock_spec_decode_sampler(acceptance_sampler_method)
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
     draft_worker.device = 'cuda'
     target_worker.device = 'cuda'
 
     set_random_seed(1)
 
-    worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler,
+    worker = SpecDecodeWorker(draft_worker, target_worker, spec_decode_sampler,
                               metrics_collector)
     worker.init_device()
 
@@ -199,15 +205,16 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int):
     target_worker.execute_model.return_value = [target_output[0]]
 
     exception_secret = 'artificial stop'
-    rejection_sampler.side_effect = ValueError(exception_secret)
+
+    spec_decode_sampler.side_effect = ValueError(exception_secret)
 
     with pytest.raises(ValueError, match=exception_secret):
         worker.execute_model(execute_model_req=ExecuteModelRequest(
             seq_group_metadata_list=seq_group_metadata_list,
             num_lookahead_slots=k))
 
-    assert len(rejection_sampler.call_args_list) == 1
-    _, kwargs = rejection_sampler.call_args_list[0]
+    assert len(spec_decode_sampler.call_args_list) == 1
+    _, kwargs = spec_decode_sampler.call_args_list[0]
     actual = SimpleNamespace(**kwargs)
 
     assert torch.equal(actual.bonus_token_ids,
@@ -221,8 +228,11 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int):
 
 @pytest.mark.parametrize('k', [1, 2, 6])
 @pytest.mark.parametrize('batch_size', [1, 2, 32])
+@pytest.mark.parametrize("acceptance_sampler_method",
+                         ["rejection_sampler", "typical_acceptance_sampler"])
 @torch.inference_mode()
-def test_correctly_formats_output(k: int, batch_size: int):
+def test_correctly_formats_output(k: int, batch_size: int,
+                                  acceptance_sampler_method: str):
     """Verify SpecDecodeWorker formats sampler output correctly.
     Everything else is mocked out.
     """
@@ -232,15 +242,13 @@ def test_correctly_formats_output(k: int, batch_size: int):
                                vocab_size=vocab_size,
                                use_spec=False)
     target_worker = mock_worker(vocab_size=vocab_size, use_spec=False)
-    rejection_sampler = MagicMock(spec=RejectionSampler)
-    rejection_sampler.token_id_dtype = torch.int64
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
     draft_worker.device = 'cuda'
     target_worker.device = 'cuda'
 
     set_random_seed(1)
-
-    worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler,
+    spec_decode_sampler = mock_spec_decode_sampler(acceptance_sampler_method)
+    worker = SpecDecodeWorker(draft_worker, target_worker, spec_decode_sampler,
                               metrics_collector)
     worker.init_device()
 
@@ -286,24 +294,23 @@ def test_correctly_formats_output(k: int, batch_size: int):
 
     target_worker.execute_model.return_value = [target_output[0]]
 
-    rejection_sampler_output = torch.randint(low=0,
-                                             high=vocab_size,
-                                             size=(batch_size, k + 1),
-                                             dtype=torch.int64,
-                                             device='cuda')
+    spec_decode_sampler_output = torch.randint(low=0,
+                                               high=vocab_size,
+                                               size=(batch_size, k + 1),
+                                               dtype=torch.int64,
+                                               device='cuda')
     for i in range(batch_size):
         minimum_accepted_tokens = 1
-        rejection_sampler_output[i][
+        spec_decode_sampler_output[i][
             -random.randint(minimum_accepted_tokens, k + 1):] = -1
 
-    rejection_sampler.return_value = rejection_sampler_output
-
+    spec_decode_sampler.return_value = spec_decode_sampler_output
     output = worker.execute_model(execute_model_req=ExecuteModelRequest(
         seq_group_metadata_list=seq_group_metadata_list,
         num_lookahead_slots=k))
 
     expected_output = create_sampler_output_list(
-        token_ids=rejection_sampler_output.transpose(0, 1),
+        token_ids=spec_decode_sampler_output.transpose(0, 1),
         probs=[None for _ in range(k + 1)],
         logprobs=[None for _ in range(k + 1)])
 
@@ -350,8 +357,11 @@ def test_correctly_formats_output(k: int, batch_size: int):
 @pytest.mark.parametrize('k', [1, 2])
 @pytest.mark.parametrize('batch_size', [1])
 @pytest.mark.parametrize('returns_metrics', [True, False])
+@pytest.mark.parametrize("acceptance_sampler_method",
+                         ["rejection_sampler", "typical_acceptance_sampler"])
 @torch.inference_mode()
-def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool):
+def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool,
+                          acceptance_sampler_method: str):
     """Verify SpecDecodeWorker collects metrics.
     """
     vocab_size = 32_000
@@ -360,15 +370,14 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool):
                                vocab_size=vocab_size,
                                use_spec=False)
     target_worker = mock_worker(vocab_size=vocab_size, use_spec=False)
-    rejection_sampler = MagicMock(spec=RejectionSampler)
-    rejection_sampler.token_id_dtype = torch.int64
+    spec_decode_sampler = mock_spec_decode_sampler(acceptance_sampler_method)
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
     draft_worker.device = 'cuda'
     target_worker.device = 'cuda'
 
     set_random_seed(1)
 
-    worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler,
+    worker = SpecDecodeWorker(draft_worker, target_worker, spec_decode_sampler,
                               metrics_collector)
     worker.init_device()
 
@@ -414,17 +423,16 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool):
 
     target_worker.execute_model.return_value = [target_output[0]]
 
-    rejection_sampler_output = torch.randint(low=0,
-                                             high=vocab_size,
-                                             size=(batch_size, k + 1),
-                                             dtype=torch.int64,
-                                             device='cuda')
+    spec_decode_sampler_output = torch.randint(low=0,
+                                               high=vocab_size,
+                                               size=(batch_size, k + 1),
+                                               dtype=torch.int64,
+                                               device='cuda')
     for i in range(batch_size):
         minimum_accepted_tokens = 1
-        rejection_sampler_output[i][
+        spec_decode_sampler_output[i][
             -random.randint(minimum_accepted_tokens, k + 1):] = -1
-
-    rejection_sampler.return_value = rejection_sampler_output
+    spec_decode_sampler.return_value = spec_decode_sampler_output
 
     mock_rejsample_metrics = MagicMock(
         spec=SpecDecodeWorkerMetrics) if returns_metrics else None
@@ -445,15 +453,16 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool):
 
 @pytest.mark.parametrize('k', [0])
 @pytest.mark.parametrize('batch_size', [1, 2, 32])
+@pytest.mark.parametrize("acceptance_sampler_method",
+                         ["rejection_sampler", "typical_acceptance_sampler"])
 @torch.inference_mode()
-def test_k_equals_zero(k: int, batch_size: int):
+def test_k_equals_zero(k: int, batch_size: int,
+                       acceptance_sampler_method: str):
     """Verify that the SpecDecodeWorker calls the draft and target workers
     when k is zero. This happens during prefill.
     """
     draft_worker = mock_worker(cls=MultiStepWorker)
     target_worker = mock_worker()
-    rejection_sampler = MagicMock(spec=RejectionSampler)
-    rejection_sampler.token_id_dtype = torch.int64
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
 
     sampler_output = MagicMock(spec=SamplerOutput)
@@ -465,8 +474,9 @@ def test_k_equals_zero(k: int, batch_size: int):
 
     set_random_seed(1)
 
-    worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler,
-                              metrics_collector)
+    worker = SpecDecodeWorker(
+        draft_worker, target_worker,
+        mock_spec_decode_sampler(acceptance_sampler_method), metrics_collector)
 
     seq_group_metadata_list, _, _ = create_batch(batch_size,
                                                  k,
@@ -487,16 +497,17 @@ def test_k_equals_zero(k: int, batch_size: int):
 
 @pytest.mark.parametrize('k', [0, 5])
 @pytest.mark.parametrize('batch_size', [0])
+@pytest.mark.parametrize("acceptance_sampler_method",
+                         ["rejection_sampler", "typical_acceptance_sampler"])
 @torch.inference_mode()
-def test_empty_input_batch(k: int, batch_size: int):
+def test_empty_input_batch(k: int, batch_size: int,
+                           acceptance_sampler_method: str):
     """Verify that the SpecDecodeWorker calls the draft and target workers
     when the input batch is empty. This can happen if the engine communicates
     to the workers information without scheduling a batch.
     """
     draft_worker = mock_worker(cls=MultiStepWorker)
     target_worker = mock_worker()
-    rejection_sampler = MagicMock(spec=RejectionSampler)
-    rejection_sampler.token_id_dtype = torch.int64
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
 
     sampler_output = MagicMock(spec=SamplerOutput)
@@ -508,8 +519,9 @@ def test_empty_input_batch(k: int, batch_size: int):
 
     set_random_seed(1)
 
-    worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler,
-                              metrics_collector)
+    worker = SpecDecodeWorker(
+        draft_worker, target_worker,
+        mock_spec_decode_sampler(acceptance_sampler_method), metrics_collector)
 
     seq_group_metadata_list, _, _ = create_batch(batch_size,
                                                  k,
@@ -528,18 +540,19 @@ def test_empty_input_batch(k: int, batch_size: int):
     target_worker.execute_model.assert_called_once_with(execute_model_req)
 
 
+@pytest.mark.parametrize("acceptance_sampler_method",
+                         ["rejection_sampler", "typical_acceptance_sampler"])
 @pytest.mark.skip_global_cleanup
-def test_init_device():
+def test_init_device(acceptance_sampler_method: str):
     """Verify SpecDecodeWorker invokes proposer/scorer worker init_device, as
     well as other GPU initialization.
     """
     draft_worker = mock_worker(cls=MultiStepWorker, use_spec=False)
     target_worker = mock_worker(use_spec=False)
-    rejection_sampler = MagicMock(spec=RejectionSampler)
-    rejection_sampler.token_id_dtype = torch.int64
+    spec_decode_sampler = mock_spec_decode_sampler(acceptance_sampler_method)
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
 
-    worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler,
+    worker = SpecDecodeWorker(draft_worker, target_worker, spec_decode_sampler,
                               metrics_collector)
 
     worker.init_device()
@@ -549,22 +562,23 @@ def test_init_device():
     target_worker.init_device.assert_called_once()
 
     metrics_collector.init_gpu_tensors.assert_called_once()
-    rejection_sampler.init_gpu_tensors.assert_called_once()
+    spec_decode_sampler.init_gpu_tensors.assert_called_once()
 
 
+@pytest.mark.parametrize("acceptance_sampler_method",
+                         ["rejection_sampler", "typical_acceptance_sampler"])
 @torch.inference_mode()
-def test_initialize_cache():
+def test_initialize_cache(acceptance_sampler_method):
     """Verify SpecDecodeWorker invokes initialize_cache on proposer/scorer
     workers.
     """
     draft_worker = mock_worker(cls=MultiStepWorker)
     target_worker = mock_worker()
-    rejection_sampler = MagicMock(spec=RejectionSampler)
-    rejection_sampler.token_id_dtype = torch.int64
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
 
-    worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler,
-                              metrics_collector)
+    worker = SpecDecodeWorker(
+        draft_worker, target_worker,
+        mock_spec_decode_sampler(acceptance_sampler_method), metrics_collector)
 
     kwargs = {"num_gpu_blocks": 1024, "num_cpu_blocks": 1023}
     worker.initialize_cache(**kwargs)
@@ -577,19 +591,20 @@ def test_initialize_cache():
 @pytest.mark.parametrize('available_cpu_blocks', [500])
 @pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096])
 @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096])
+@pytest.mark.parametrize("acceptance_sampler_method",
+                         ["rejection_sampler", "typical_acceptance_sampler"])
 @pytest.mark.skip_global_cleanup
 def test_determine_num_available_blocks(available_gpu_blocks: int,
                                         available_cpu_blocks: int,
                                         target_cache_block_size_bytes: int,
-                                        draft_kv_size_bytes: int):
+                                        draft_kv_size_bytes: int,
+                                        acceptance_sampler_method: str):
     """Verify SpecDecodeWorker correctly profiles num available GPU blocks.
     Specifically, it should run profiling in the scorer worker, and then evenly
     split the blocks between proposer and scorer worker.
     """
     draft_worker = mock_worker(cls=MultiStepWorker)
     target_worker = mock_worker()
-    rejection_sampler = MagicMock(spec=RejectionSampler)
-    rejection_sampler.token_id_dtype = torch.int64
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
 
     target_worker.determine_num_available_blocks.return_value = (
@@ -598,8 +613,9 @@ def test_determine_num_available_blocks(available_gpu_blocks: int,
         target_cache_block_size_bytes)
     draft_worker.get_cache_block_size_bytes.return_value = draft_kv_size_bytes
 
-    worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler,
-                              metrics_collector)
+    worker = SpecDecodeWorker(
+        draft_worker, target_worker,
+        mock_spec_decode_sampler(acceptance_sampler_method), metrics_collector)
 
     num_gpu_blocks, num_cpu_blocks = worker.determine_num_available_blocks()
 
diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py
index bccbf9a6a..18dbdd5bc 100644
--- a/tests/spec_decode/test_utils.py
+++ b/tests/spec_decode/test_utils.py
@@ -1,7 +1,11 @@
 from unittest.mock import MagicMock
 
 import pytest
+import torch
 
+from vllm.model_executor.layers.rejection_sampler import RejectionSampler
+from vllm.model_executor.layers.typical_acceptance_sampler import (
+    TypicalAcceptanceSampler)
 from vllm.sequence import SequenceGroupMetadata, get_all_seq_ids
 from vllm.spec_decode.util import split_batch_by_proposal_len
 
@@ -109,3 +113,21 @@ def test_all_non_zero_with_zero_filter(fake_sequence_group_metadata):
 
     assert filtered_groups == []
     assert indices == []
+
+
+def mock_spec_decode_sampler(acceptance_sampler_method):
+    """
+    Returns either a RejectionSampler or TypicalAcceptanceSampler
+    object depending on whether acceptance_sampler_method is 
+    'rejection_sampler' or 'typical_acceptance_sampler' respectively.
+    """
+    if acceptance_sampler_method == "rejection_sampler":
+        sampler = MagicMock(spec=RejectionSampler)
+        sampler.token_id_dtype = torch.int64
+        return sampler
+    elif acceptance_sampler_method == "typical_acceptance_sampler":
+        sampler = MagicMock(spec=TypicalAcceptanceSampler)
+        sampler.token_id_dtype = torch.int64
+        return sampler
+    else:
+        raise ValueError(f"Invalid sampler name {acceptance_sampler_method}")
diff --git a/vllm/config.py b/vllm/config.py
index 3551e8f6f..9854f1750 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -753,7 +753,6 @@ class SchedulerConfig:
         self.chunked_prefill_enabled = enable_chunked_prefill
         self.embedding_mode = embedding_mode
         self.preemption_mode = preemption_mode
-
         self._verify_args()
 
     def _verify_args(self) -> None:
@@ -834,6 +833,9 @@ class SpeculativeConfig:
         speculative_disable_by_batch_size: Optional[int],
         ngram_prompt_lookup_max: Optional[int],
         ngram_prompt_lookup_min: Optional[int],
+        draft_token_acceptance_method: str,
+        typical_acceptance_sampler_posterior_threshold: Optional[float],
+        typical_acceptance_sampler_posterior_alpha: Optional[float],
     ) -> Optional["SpeculativeConfig"]:
         """Create a SpeculativeConfig if possible, else return None.
 
@@ -870,7 +872,20 @@ class SpeculativeConfig:
                 window, if provided.
             ngram_prompt_lookup_min (Optional[int]): Min size of ngram token
                 window, if provided.
-
+            draft_token_acceptance_method (str): The method to use for
+                accepting draft tokens. This can take two possible
+                values 'rejection_sampler' and 'typical_acceptance_sampler'
+                for RejectionSampler and TypicalAcceptanceSampler
+                respectively.
+            typical_acceptance_sampler_posterior_threshold (Optional[float]):
+                A threshold value that sets a lower bound on the posterior
+                probability of a token in the target model for it to be
+                accepted. This threshold is used only when we use the 
+                TypicalAcceptanceSampler for token acceptance.
+            typical_acceptance_sampler_posterior_alpha (Optional[float]):
+                A scaling factor for the entropy-based threshold in the
+                TypicalAcceptanceSampler.
+    
         Returns:
             Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if
                 the necessary conditions are met, else None.
@@ -984,6 +999,11 @@ class SpeculativeConfig:
                 "speculative_model unless the draft model config contains an "
                 "n_predict parameter.")
 
+        if typical_acceptance_sampler_posterior_threshold is None:
+            typical_acceptance_sampler_posterior_threshold = 0.09
+        if typical_acceptance_sampler_posterior_alpha is None:
+            typical_acceptance_sampler_posterior_alpha = 0.3
+
         return SpeculativeConfig(
             draft_model_config,
             draft_parallel_config,
@@ -991,6 +1011,11 @@ class SpeculativeConfig:
             speculative_disable_by_batch_size,
             ngram_prompt_lookup_max,
             ngram_prompt_lookup_min,
+            draft_token_acceptance_method=draft_token_acceptance_method,
+            typical_acceptance_sampler_posterior_threshold=\
+                typical_acceptance_sampler_posterior_threshold,
+            typical_acceptance_sampler_posterior_alpha=\
+                typical_acceptance_sampler_posterior_alpha,
         )
 
     @staticmethod
@@ -1072,6 +1097,9 @@ class SpeculativeConfig:
         speculative_disable_by_batch_size: Optional[int],
         ngram_prompt_lookup_max: Optional[int],
         ngram_prompt_lookup_min: Optional[int],
+        draft_token_acceptance_method: str,
+        typical_acceptance_sampler_posterior_threshold: float,
+        typical_acceptance_sampler_posterior_alpha: float,
     ):
         """Create a SpeculativeConfig object.
 
@@ -1085,6 +1113,19 @@ class SpeculativeConfig:
                 enqueue requests is larger than this value.
             ngram_prompt_lookup_max: Max size of ngram token window.
             ngram_prompt_lookup_min: Min size of ngram token window.
+            draft_token_acceptance_method (str): The method to use for
+                accepting draft tokens. This can take two possible
+                values 'rejection_sampler' and 'typical_acceptance_sampler'
+                for RejectionSampler and TypicalAcceptanceSampler
+                respectively.
+            typical_acceptance_sampler_posterior_threshold (Optional[float]):
+                A threshold value that sets a lower bound on the posterior
+                probability of a token in the target model for it to be
+                accepted. This threshold is used only when we use the 
+                TypicalAcceptanceSampler for token acceptance.
+            typical_acceptance_sampler_posterior_alpha (Optional[float]):
+                A scaling factor for the entropy-based threshold in the
+                TypicalAcceptanceSampler.
         """
         self.draft_model_config = draft_model_config
         self.draft_parallel_config = draft_parallel_config
@@ -1093,6 +1134,11 @@ class SpeculativeConfig:
             speculative_disable_by_batch_size
         self.ngram_prompt_lookup_max = ngram_prompt_lookup_max or 0
         self.ngram_prompt_lookup_min = ngram_prompt_lookup_min or 0
+        self.draft_token_acceptance_method = draft_token_acceptance_method
+        self.typical_acceptance_sampler_posterior_threshold = \
+            typical_acceptance_sampler_posterior_threshold
+        self.typical_acceptance_sampler_posterior_alpha = \
+            typical_acceptance_sampler_posterior_alpha
 
         self._verify_args()
 
@@ -1104,6 +1150,31 @@ class SpeculativeConfig:
         if self.draft_model_config:
             self.draft_model_config.verify_with_parallel_config(
                 self.draft_parallel_config)
+            # Validate and set draft token acceptance related settings.
+
+        if (self.draft_token_acceptance_method is None):
+            raise ValueError("draft_token_acceptance_method is not set. "
+                             "Expected values are rejection_sampler or "
+                             "typical_acceptance_sampler.")
+
+        if (self.draft_token_acceptance_method != 'rejection_sampler'
+                and self.draft_token_acceptance_method !=
+                'typical_acceptance_sampler'):
+            raise ValueError(
+                "Expected draft_token_acceptance_method to be either "
+                "rejection_sampler or typical_acceptance_sampler. Instead it "
+                f"is {self.draft_token_acceptance_method}")
+
+        if (self.typical_acceptance_sampler_posterior_threshold < 0
+                or self.typical_acceptance_sampler_posterior_alpha < 0):
+            raise ValueError(
+                "Expected typical_acceptance_sampler_posterior_threshold "
+                "and typical_acceptance_sampler_posterior_alpha to be > 0. "
+                "Instead found "
+                f"typical_acceptance_sampler_posterior_threshold = "
+                f"{self.typical_acceptance_sampler_posterior_threshold} and "
+                f"typical_acceptance_sampler_posterior_alpha = "
+                f"{self.typical_acceptance_sampler_posterior_alpha}")
 
     @property
     def num_lookahead_slots(self) -> int:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index f9d089091..d4044adfc 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -100,7 +100,9 @@ class EngineArgs:
     speculative_disable_by_batch_size: Optional[int] = None
     ngram_prompt_lookup_max: Optional[int] = None
     ngram_prompt_lookup_min: Optional[int] = None
-
+    spec_decoding_acceptance_method: str = 'rejection_sampler'
+    typical_acceptance_sampler_posterior_threshold: Optional[float] = None
+    typical_acceptance_sampler_posterior_alpha: Optional[float] = None
     qlora_adapter_name_or_path: Optional[str] = None
 
     otlp_traces_endpoint: Optional[str] = None
@@ -577,6 +579,38 @@ class EngineArgs:
             help='Min size of window for ngram prompt lookup in speculative '
             'decoding.')
 
+        parser.add_argument(
+            '--spec-decoding-acceptance-method',
+            type=str,
+            default=EngineArgs.spec_decoding_acceptance_method,
+            choices=['rejection_sampler', 'typical_acceptance_sampler'],
+            help='Specify the acceptance method to use during draft token '
+            'verification in speculative decoding. Two types of acceptance '
+            'routines are supported: '
+            '1) RejectionSampler which does not allow changing the '
+            'acceptance rate of draft tokens, '
+            '2) TypicalAcceptanceSampler which is configurable, allowing for '
+            'a higher acceptance rate at the cost of lower quality, '
+            'and vice versa.')
+
+        parser.add_argument(
+            '--typical-acceptance-sampler-posterior-threshold',
+            type=float,
+            default=EngineArgs.typical_acceptance_sampler_posterior_threshold,
+            help='Set the lower bound threshold for the posterior '
+            'probability of a token to be accepted. This threshold is '
+            'used by the TypicalAcceptanceSampler to make sampling decisions '
+            'during speculative decoding. Defaults to 0.09')
+
+        parser.add_argument(
+            '--typical-acceptance-sampler-posterior-alpha',
+            type=float,
+            default=EngineArgs.typical_acceptance_sampler_posterior_alpha,
+            help='A scaling factor for the entropy-based threshold for token '
+            'acceptance in the TypicalAcceptanceSampler. Typically defaults '
+            'to sqrt of --typical-acceptance-sampler-posterior-threshold '
+            'i.e. 0.3')
+
         parser.add_argument('--model-loader-extra-config',
                             type=nullable_str,
                             default=EngineArgs.model_loader_extra_config,
@@ -737,6 +771,12 @@ class EngineArgs:
             use_v2_block_manager=self.use_v2_block_manager,
             ngram_prompt_lookup_max=self.ngram_prompt_lookup_max,
             ngram_prompt_lookup_min=self.ngram_prompt_lookup_min,
+            draft_token_acceptance_method=\
+                self.spec_decoding_acceptance_method,
+            typical_acceptance_sampler_posterior_threshold=self.
+            typical_acceptance_sampler_posterior_threshold,
+            typical_acceptance_sampler_posterior_alpha=self.
+            typical_acceptance_sampler_posterior_alpha,
         )
 
         scheduler_config = SchedulerConfig(
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 2c1210c90..77de42bc0 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -457,4 +457,4 @@ class PrometheusStatLogger(StatLoggerBase):
 
 class RayPrometheusStatLogger(PrometheusStatLogger):
     """RayPrometheusStatLogger uses Ray metrics instead."""
-    _metrics_cls = RayMetrics
+    _metrics_cls = RayMetrics
\ No newline at end of file
diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py
index fe9b2fac1..e18961046 100644
--- a/vllm/model_executor/layers/rejection_sampler.py
+++ b/vllm/model_executor/layers/rejection_sampler.py
@@ -3,13 +3,12 @@ from typing import Tuple
 
 import torch
 import torch.jit
-import torch.nn as nn
 
 from vllm.model_executor.layers.spec_decode_base_sampler import (
     SpecDecodeBaseSampler)
 
 
-class RejectionSampler(SpecDecodeBaseSampler, nn.Module):
+class RejectionSampler(SpecDecodeBaseSampler):
     """Apply modified rejection sampling as described in "Accelerating Large
         Language Model Decoding with Speculative Sampling"
         https://arxiv.org/pdf/2302.01318.pdf.
@@ -28,8 +27,8 @@ class RejectionSampler(SpecDecodeBaseSampler, nn.Module):
             during sampling. This catches correctness issues but adds
             nontrivial latency.
         """
-        SpecDecodeBaseSampler.__init__(self, disable_bonus_tokens, strict_mode)
-        nn.Module.__init__(self)
+        super().__init__(disable_bonus_tokens=disable_bonus_tokens,
+                         strict_mode=strict_mode)
 
     def forward(
         self,
@@ -78,11 +77,12 @@ class RejectionSampler(SpecDecodeBaseSampler, nn.Module):
             self._raise_if_incorrect_input(target_probs, bonus_token_ids,
                                            draft_probs, draft_token_ids)
 
-        accepted, recovered_token_ids = self._batch_modified_rejection_sampling(
-            target_probs,
-            draft_probs,
-            draft_token_ids,
-        )
+        accepted, recovered_token_ids = (
+            self._batch_modified_rejection_sampling(
+                target_probs,
+                draft_probs,
+                draft_token_ids,
+            ))
 
         output_token_ids = self._create_output(
             accepted,
diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py
index 9856a7e7d..692024056 100644
--- a/vllm/model_executor/layers/spec_decode_base_sampler.py
+++ b/vllm/model_executor/layers/spec_decode_base_sampler.py
@@ -1,9 +1,12 @@
+from abc import abstractmethod
 from typing import Optional
 
 import torch
+import torch.jit
+import torch.nn as nn
 
 
-class SpecDecodeBaseSampler():
+class SpecDecodeBaseSampler(nn.Module):
     """Base class for samplers used for Speculative Decoding verification
         step.
     """
@@ -51,6 +54,16 @@ class SpecDecodeBaseSampler():
     def token_id_dtype(self):
         return torch.int64
 
+    @abstractmethod
+    def forward(
+        self,
+        target_probs: torch.Tensor,
+        bonus_token_ids: torch.Tensor,
+        draft_probs: torch.Tensor,
+        draft_token_ids: torch.Tensor,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
     def _create_output(
             self,
             accepted: torch.Tensor,  # [batch_size, k]
diff --git a/vllm/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py
index f12d6a03b..9bf3c84a1 100644
--- a/vllm/model_executor/layers/typical_acceptance_sampler.py
+++ b/vllm/model_executor/layers/typical_acceptance_sampler.py
@@ -1,12 +1,11 @@
 import torch
 import torch.jit
-import torch.nn as nn
 
 from vllm.model_executor.layers.spec_decode_base_sampler import (
     SpecDecodeBaseSampler)
 
 
-class TypicalAcceptanceSampler(SpecDecodeBaseSampler, nn.Module):
+class TypicalAcceptanceSampler(SpecDecodeBaseSampler):
     """Apply typical acceptance sampling as described in section 3.3.1 in 
         "MEDUSA: Simple LLM Inference Acceleration Framework with 
         Multiple Decoding Heads"
@@ -15,10 +14,10 @@ class TypicalAcceptanceSampler(SpecDecodeBaseSampler, nn.Module):
 
     def __init__(
         self,
+        posterior_threshold: float,
+        posterior_alpha: float,
         disable_bonus_tokens: bool = False,
         strict_mode: bool = False,
-        posterior_threshold: float = 0.09,
-        posterior_alpha: float = 0.3,
     ):
         """Create a Typical Acceptance Sampler.
 
@@ -31,23 +30,20 @@ class TypicalAcceptanceSampler(SpecDecodeBaseSampler, nn.Module):
             nontrivial latency.
             posterior_threshold : A threshold value that sets a lower bound 
             on the posterior probability of a token in target model for it
-            to be accepted. Default is 0.09
+            to be accepted.
             posterior_alpha : A scaling factor for the entropy-based
-            threshold in typical acceptance sampling. Typically defaults to
-            sqrt of posterior_threshold and is set to 0.3.
+            threshold in typical acceptance sampling.
         """
-        SpecDecodeBaseSampler.__init__(
-            self,
-            disable_bonus_tokens=disable_bonus_tokens,
-            strict_mode=strict_mode)
-        nn.Module.__init__(self)
         self._posterior_threshold = posterior_threshold
         self._posterior_alpha = posterior_alpha
+        super().__init__(disable_bonus_tokens=disable_bonus_tokens,
+                         strict_mode=strict_mode)
 
     def forward(
         self,
         target_probs: torch.Tensor,
         bonus_token_ids: torch.Tensor,
+        draft_probs: torch.Tensor,
         draft_token_ids: torch.Tensor,
     ) -> torch.Tensor:
         """Sample token ids using typical acceptance sampling. This accepts 
@@ -69,6 +65,8 @@ class TypicalAcceptanceSampler(SpecDecodeBaseSampler, nn.Module):
                 speculative tokens in a sequence are accepted.
             shape = [batch_size, num_bonus_tokens]
 
+            draft_probs: This parameter is unused by the acceptance sampler.
+
             draft_token_ids: The token ids that were sampled from the draft
                 probabilities.
             shape = [batch_size, num_speculative_tokens]
diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py
index ab1d96c55..2c4ae0b22 100644
--- a/vllm/spec_decode/metrics.py
+++ b/vllm/spec_decode/metrics.py
@@ -4,7 +4,8 @@ from typing import Callable, Optional
 
 import torch
 
-from vllm.model_executor.layers.rejection_sampler import RejectionSampler
+from vllm.model_executor.layers.spec_decode_base_sampler import (
+    SpecDecodeBaseSampler)
 from vllm.utils import is_pin_memory_available
 
 
@@ -46,15 +47,15 @@ Timer = Callable[[], float]
 
 
 class AsyncMetricsCollector:
-    """Class which copies rejection sampler metrics from the device to CPU on a
-    non-default Torch stream.
+    """Class which copies rejection/typical-acceptance sampler metrics
+    from the device to CPU on a non-default Torch stream.
     """
 
     def __init__(self,
-                 rejection_sampler: RejectionSampler,
+                 spec_decode_sampler: SpecDecodeBaseSampler,
                  timer: Optional[Timer] = None,
                  collect_interval_s: float = 5.0):
-        self._rejection_sampler = rejection_sampler
+        self.spec_decode_sampler = spec_decode_sampler
         self._timer = time.time if timer is None else timer
 
         self._rank: Optional[int] = None
@@ -95,7 +96,7 @@ class AsyncMetricsCollector:
         return None
 
     def _should_collect_rejsample_metrics(self, now: float) -> bool:
-        """Return whether or not this iteration should print rejection sampling
+        """Return whether or not this iteration should print sampling
         metrics.
         """
         if self._rank != 0:
@@ -107,8 +108,8 @@ class AsyncMetricsCollector:
         return True
 
     def _copy_rejsample_metrics_async(self) -> torch.cuda.Event:
-        """Copy rejection sampling metrics (number of accepted tokens, etc) to
-        CPU asynchronously.
+        """Copy rejection/typical-acceptance sampling metrics 
+        (number of accepted tokens, etc) to CPU asynchronously.
 
         Returns a CUDA event recording when the copy is complete.
         """
@@ -117,13 +118,14 @@ class AsyncMetricsCollector:
 
         with torch.cuda.stream(self._copy_stream):
             self._aggregate_num_accepted_tokens.copy_(
-                self._rejection_sampler.num_accepted_tokens, non_blocking=True)
+                self.spec_decode_sampler.num_accepted_tokens,
+                non_blocking=True)
             self._aggregate_num_emitted_tokens.copy_(
-                self._rejection_sampler.num_emitted_tokens, non_blocking=True)
+                self.spec_decode_sampler.num_emitted_tokens, non_blocking=True)
             # Number of draft tokens is calculated on CPU, so no copy is
             # required.
             self._aggregate_num_draft_tokens = (
-                self._rejection_sampler.num_draft_tokens)
+                self.spec_decode_sampler.num_draft_tokens)
 
         aggregate_metrics_ready = torch.cuda.Event()
         aggregate_metrics_ready.record(self._copy_stream)
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index f1e64cae8..ca470bee2 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -7,6 +7,10 @@ from vllm.config import ParallelConfig, SpeculativeConfig
 from vllm.distributed.communication_op import broadcast_tensor_dict
 from vllm.logger import init_logger
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
+from vllm.model_executor.layers.spec_decode_base_sampler import (
+    SpecDecodeBaseSampler)
+from vllm.model_executor.layers.typical_acceptance_sampler import (
+    TypicalAcceptanceSampler)
 from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest,
                            HiddenStates, SamplerOutput, SequenceGroupMetadata,
                            get_all_seq_ids)
@@ -56,7 +60,12 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
         draft_worker_kwargs=draft_worker_kwargs,
         disable_by_batch_size=speculative_config.
         speculative_disable_by_batch_size,
-    )
+        draft_token_acceptance_method=speculative_config.
+        draft_token_acceptance_method,
+        typical_acceptance_sampler_posterior_threshold=speculative_config.
+        typical_acceptance_sampler_posterior_threshold,
+        typical_acceptance_sampler_posterior_alpha=speculative_config.
+        typical_acceptance_sampler_posterior_alpha)
 
     return spec_decode_worker
 
@@ -78,8 +87,6 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
         welcome!).
     * Only top-1 proposal and scoring are implemented. Tree-attention is left as
         future work.
-    * Only lossless rejection sampling is supported. Contributions adding lossy
-        verification routines are welcome (e.g. Medusa's typical acceptance).
     * All sequences in a batch must have the same proposal length, or zero. This
         can be improved by having per-sequence speculation in the future.
     * The scoring forward pass is done without an MQA kernel, which is
@@ -95,6 +102,9 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
         scorer_worker: Worker,
         draft_worker_kwargs: Dict[str, Any],
         disable_by_batch_size: Optional[int],
+        draft_token_acceptance_method: str,
+        typical_acceptance_sampler_posterior_threshold: float,
+        typical_acceptance_sampler_posterior_alpha: float,
     ) -> "SpecDecodeWorker":
 
         ngram_prompt_lookup_max = (
@@ -127,17 +137,30 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
         logger.info("Configuring SpecDecodeWorker with proposer=%s",
                     type(proposer_worker))
 
+        spec_decode_sampler: SpecDecodeBaseSampler = None
+        if draft_token_acceptance_method == "rejection_sampler":
+            spec_decode_sampler = RejectionSampler(
+                disable_bonus_tokens=disable_bonus_tokens, )
+        elif draft_token_acceptance_method == "typical_acceptance_sampler":
+            spec_decode_sampler = TypicalAcceptanceSampler(
+                disable_bonus_tokens=disable_bonus_tokens,
+                posterior_threshold=\
+                    typical_acceptance_sampler_posterior_threshold,
+                posterior_alpha=typical_acceptance_sampler_posterior_alpha,
+            )
+        logger.info("Configuring SpecDecodeWorker with sampler=%s",
+                    type(spec_decode_sampler))
+
         return SpecDecodeWorker(proposer_worker,
                                 scorer_worker,
                                 disable_by_batch_size=disable_by_batch_size,
-                                rejection_sampler=RejectionSampler(
-                                    disable_bonus_tokens=disable_bonus_tokens))
+                                spec_decode_sampler=spec_decode_sampler)
 
     def __init__(
         self,
         proposer_worker: ProposerWorkerBase,
         scorer_worker: WorkerBase,
-        rejection_sampler: RejectionSampler,
+        spec_decode_sampler: SpecDecodeBaseSampler,
         metrics_collector: Optional[AsyncMetricsCollector] = None,
         disable_by_batch_size: Optional[int] = None,
     ):
@@ -150,8 +173,12 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
             scorer_worker: A worker that produces probabilities of speculative
                 tokens according to some base model. Typically a vanilla vLLM
                 Worker.
-            rejection_sampler: A Torch module used to perform modified rejection
-                sampling for speculative decoding.
+            spec_decode_sampler: A Torch module used to perform acceptance
+                sampling of the draft tokens in the verification step of
+                speculative decoding. Currently we support two different 
+                types of sampler namely RejectionSampler and
+                TypicalAcceptanceSampler. 'spec_decode_sampler' is either an
+                instance of RejectionSampler or TypicalAcceptanceSampler.
             disable_by_batch_size: If the batch size is larger than this,
                 disable speculative decoding for new incoming requests.
             metrics_collector: Helper class for collecting metrics; can be set
@@ -160,15 +187,12 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
         self.proposer_worker = proposer_worker
         self.scorer_worker = scorer_worker
         self.disable_by_batch_size = disable_by_batch_size or float("inf")
-        self.rejection_sampler = rejection_sampler
-
+        self.spec_decode_sampler = spec_decode_sampler
         self._metrics = AsyncMetricsCollector(
-            rejection_sampler
+            self.spec_decode_sampler
         ) if metrics_collector is None else metrics_collector
-
-        self.probs_dtype = self.rejection_sampler.probs_dtype
-        self.token_id_dtype = self.rejection_sampler.token_id_dtype
-
+        self.probs_dtype = self.spec_decode_sampler.probs_dtype
+        self.token_id_dtype = self.spec_decode_sampler.token_id_dtype
         # Lazy initiazliation.
         self.scorer: SpeculativeScorer
 
@@ -189,7 +213,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
         self.proposer_worker.load_model()
 
         self._metrics.init_gpu_tensors(self.rank)
-        self.rejection_sampler.init_gpu_tensors(self.rank)
+        self.spec_decode_sampler.init_gpu_tensors(self.rank)
+
         self.scorer = BatchExpansionTop1Scorer(
             scorer_worker=self.scorer_worker,
             device=self.device,
@@ -203,7 +228,7 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
     def _configure_model_sampler_for_spec_decode(self):
         """Configure model sampler to emit GPU tensors. This allows spec decode
         to keep data on device without transferring to CPU and serializing,
-        which significantly reduces overhead of rejection sampling.
+        which significantly reduces overhead of sampling during verification.
 
         NOTE(cade): This breaks abstraction boundaries pretty badly. The better
         design is to have the "move to CPU and serialize" sampling decision be
@@ -481,7 +506,7 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
         # Get proposed tokens.
         proposal_token_ids = proposals.proposal_token_ids[spec_indices]
 
-        accepted_token_ids = self.rejection_sampler(
+        accepted_token_ids = self.spec_decode_sampler(
             target_probs=proposal_verifier_probs,
             bonus_token_ids=bonus_token_ids,
             draft_probs=proposal_probs,
@@ -496,7 +521,6 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
         accepted_token_ids = torch.cat(
             [accepted_token_ids, non_spec_token_ids])
         logprobs = proposal_scores.logprobs
-
         # Rearrange so that results are in the order of the original seq group
         # metadata.
         accepted_token_ids[original_indices] = accepted_token_ids.clone()
-- 
GitLab


From d76084c12f0400a8f8364883a841405c903b61f6 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Mon, 1 Jul 2024 12:40:45 -0400
Subject: [PATCH 222/376] [ CI ] Re-enable Large Model LM Eval (#6031)

---
 .buildkite/test-pipeline.yaml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index c51702886..d96e3c6d1 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -198,6 +198,15 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - bash ./run-tests.sh -c configs/models-small.txt -t 1
 
+- label: LM Eval Large Models
+  gpu: a100
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  commands:
+  - pip install lm-eval
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - bash ./run-tests.sh -c configs/models-large.txt -t 4
+
 - label: Documentation Build
   working_dir: "/vllm-workspace/test_docs/docs"
   no_gpu: True
-- 
GitLab


From 4050d646e5221a516c93384b047e10b22d7167e7 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 1 Jul 2024 09:52:43 -0700
Subject: [PATCH 223/376] [doc][misc] remove deprecated api server in doc
 (#6037)

---
 docs/source/serving/distributed_serving.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst
index 2a7937a91..91f64ad2e 100644
--- a/docs/source/serving/distributed_serving.rst
+++ b/docs/source/serving/distributed_serving.rst
@@ -19,7 +19,7 @@ To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument wh
 
 .. code-block:: console
 
-    $ python -m vllm.entrypoints.api_server \
+    $ python -m vllm.entrypoints.openai.api_server \
     $     --model facebook/opt-13b \
     $     --tensor-parallel-size 4
 
-- 
GitLab


From bb6032683687b817b026a38c5a8729c2cd861246 Mon Sep 17 00:00:00 2001
From: zhyncs <me@zhyncs.com>
Date: Tue, 2 Jul 2024 01:20:33 +0800
Subject: [PATCH 224/376] [Misc] update benchmark backend for scalellm (#6018)

---
 benchmarks/backend_request_func.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 5b5067090..fd2461ce5 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -422,4 +422,5 @@ ASYNC_REQUEST_FUNCS = {
     "openai": async_request_openai_completions,
     "openai-chat": async_request_openai_chat_completions,
     "tensorrt-llm": async_request_trt_llm,
+    "scalellm": async_request_openai_completions,
 }
-- 
GitLab


From 8893130b63e6b2778b5d3db4dbc83ff6206ba44a Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 1 Jul 2024 10:50:56 -0700
Subject: [PATCH 225/376] [doc][misc] further lower visibility of simple api
 server (#6041)

Co-authored-by: Simon Mo <simon.mo@hey.com>
---
 examples/api_client.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/examples/api_client.py b/examples/api_client.py
index 70ec8c549..5f7daa14d 100644
--- a/examples/api_client.py
+++ b/examples/api_client.py
@@ -1,4 +1,9 @@
-"""Example Python client for vllm.entrypoints.api_server"""
+"""Example Python client for vllm.entrypoints.api_server
+NOTE: The API server is used only for demonstration and simple performance
+benchmarks. It is not intended for production use.
+For production use, we recommend vllm.entrypoints.openai.api_server
+and the OpenAI client API
+"""
 
 import argparse
 import json
-- 
GitLab


From dec6fc6f3bc56a487af4d8d23e18dd227360ef98 Mon Sep 17 00:00:00 2001
From: Antoni Baum <antoni.baum@protonmail.com>
Date: Mon, 1 Jul 2024 13:12:40 -0700
Subject: [PATCH 226/376] [Bugfix] Use RayActorError for older versions of Ray
 in  RayTokenizerGroupPool (#6039)

---
 .../tokenizer_group/ray_tokenizer_group.py                  | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
index 21ec2b52b..799ca7d3f 100644
--- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
@@ -2,7 +2,11 @@ import asyncio
 import os
 from typing import List, Optional
 
-from ray.exceptions import ActorDiedError
+try:
+    from ray.exceptions import ActorDiedError
+except ImportError:
+    # For older versions of Ray
+    from ray.exceptions import RayActorError as ActorDiedError
 from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
 from transformers import PreTrainedTokenizer
 
-- 
GitLab


From 12a59959ed3a78a50f9b48b0ec6ccd6c862b22b4 Mon Sep 17 00:00:00 2001
From: Avshalom Manevich <12231371+avshalomman@users.noreply.github.com>
Date: Tue, 2 Jul 2024 00:08:29 +0300
Subject: [PATCH 227/376] [Bugfix] adding chunking mechanism to fused_moe to
 handle large inputs (#6029)

---
 tests/kernels/test_moe.py                     |   2 +-
 vllm/envs.py                                  |   3 +
 .../layers/fused_moe/fused_moe.py             | 117 +++++++++++-------
 3 files changed, 74 insertions(+), 48 deletions(-)

diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index 2356b9ec1..22b6769ac 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -29,7 +29,7 @@ def torch_moe(a, w1, w2, score, topk):
             topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
 
 
-@pytest.mark.parametrize("m", [512, 222, 33, 1])
+@pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1])
 @pytest.mark.parametrize("n", [2048, 256, 1024])
 @pytest.mark.parametrize("k", [128, 511, 1024])
 @pytest.mark.parametrize("e", [8, 64])
diff --git a/vllm/envs.py b/vllm/envs.py
index e8257535f..c624510c7 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -32,6 +32,7 @@ if TYPE_CHECKING:
     VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None
     VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
     VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/"
+    VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
     VLLM_USE_RAY_COMPILED_DAG: bool = False
     VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
@@ -248,6 +249,8 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     # Only used for XLA devices such as TPUs.
     "VLLM_XLA_CACHE_PATH":
     lambda: os.getenv("VLLM_XLA_CACHE_PATH", "~/.vllm/xla_cache/"),
+    "VLLM_FUSED_MOE_CHUNK_SIZE":
+    lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "65536")),
 }
 
 # end-env-vars-definition
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index ecab77a8b..99a5c7d78 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -8,6 +8,7 @@ import torch
 import triton
 import triton.language as tl
 
+import vllm.envs as envs
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 
@@ -420,13 +421,12 @@ def fused_experts(hidden_states: torch.Tensor,
         torch.float32, torch.float16, torch.bfloat16
     ]
 
-    M, _ = hidden_states.shape
+    num_tokens, _ = hidden_states.shape
     E, N, _ = w1.shape
-
-    if M > 65536:
-        # https://github.com/vllm-project/vllm/issues/5938
-        raise ValueError("MoE kernel does not support more than 65536 tokens, "
-                         f"but got {M}")
+    # We execute the fused_moe kernel in chunks to circumvent this issue:
+    # https://github.com/vllm-project/vllm/issues/5938
+    CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE
+    M = min(num_tokens, CHUNK_SIZE)
 
     if override_config:
         config = override_config
@@ -455,51 +455,74 @@ def fused_experts(hidden_states: torch.Tensor,
                                       device=hidden_states.device,
                                       dtype=hidden_states.dtype)
 
-    sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
-        topk_ids, config['BLOCK_SIZE_M'], E)
     compute_type = (tl.bfloat16
                     if hidden_states.dtype == torch.bfloat16 else tl.float16)
 
-    invoke_fused_moe_kernel(hidden_states,
-                            w1,
-                            intermediate_cache1,
-                            a1_scale,
-                            w1_scale,
-                            topk_weights,
-                            topk_ids,
-                            sorted_token_ids,
-                            expert_ids,
-                            num_tokens_post_padded,
-                            False,
-                            topk_ids.shape[1],
-                            config,
-                            compute_type=compute_type,
-                            use_fp8=use_fp8)
-
-    ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N))
-
-    invoke_fused_moe_kernel(intermediate_cache2,
-                            w2,
-                            intermediate_cache3,
-                            a2_scale,
-                            w2_scale,
-                            topk_weights,
-                            topk_ids,
-                            sorted_token_ids,
-                            expert_ids,
-                            num_tokens_post_padded,
-                            True,
-                            1,
-                            config,
-                            compute_type=compute_type,
-                            use_fp8=use_fp8)
-
     if inplace:
-        return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
-                         dim=1,
-                         out=hidden_states)
-    return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
-                     dim=1)
+        out_hidden_states = hidden_states
+    else:
+        out_hidden_states = torch.empty_like(hidden_states)
+
+    for chunk in range((num_tokens // CHUNK_SIZE) + 1):
+        begin_chunk_idx, end_chunk_idx = (chunk * CHUNK_SIZE,
+                                          min((chunk + 1) * CHUNK_SIZE,
+                                              num_tokens))
+        curr_hidden_states = hidden_states[begin_chunk_idx:end_chunk_idx]
+        tokens_in_chunk, _ = curr_hidden_states.shape
+
+        if tokens_in_chunk == 0:
+            break
+
+        if tokens_in_chunk < CHUNK_SIZE:
+            # will only happen in the last chunk
+            intermediate_cache1 = intermediate_cache1[:tokens_in_chunk]
+            intermediate_cache2 = intermediate_cache2[:tokens_in_chunk]
+            intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]
+
+        curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx]
+        curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx]
+
+        sorted_token_ids, expert_ids, num_tokens_post_padded = (
+            moe_align_block_size(curr_topk_ids, config['BLOCK_SIZE_M'], E))
+
+        invoke_fused_moe_kernel(curr_hidden_states,
+                                w1,
+                                intermediate_cache1,
+                                a1_scale,
+                                w1_scale,
+                                curr_topk_weights,
+                                curr_topk_ids,
+                                sorted_token_ids,
+                                expert_ids,
+                                num_tokens_post_padded,
+                                False,
+                                topk_ids.shape[1],
+                                config,
+                                compute_type=compute_type,
+                                use_fp8=use_fp8)
+
+        ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N))
+
+        invoke_fused_moe_kernel(intermediate_cache2,
+                                w2,
+                                intermediate_cache3,
+                                a2_scale,
+                                w2_scale,
+                                curr_topk_weights,
+                                curr_topk_ids,
+                                sorted_token_ids,
+                                expert_ids,
+                                num_tokens_post_padded,
+                                True,
+                                1,
+                                config,
+                                compute_type=compute_type,
+                                use_fp8=use_fp8)
+
+        torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
+                  dim=1,
+                  out=out_hidden_states[begin_chunk_idx:end_chunk_idx])
+    return out_hidden_states
 
 
 def fused_moe(
-- 
GitLab


From 83bdcb6ac32f22f20fd3a60ba67064a0b462801d Mon Sep 17 00:00:00 2001
From: "ning.zhang" <10524065+llmpros@users.noreply.github.com>
Date: Mon, 1 Jul 2024 14:11:36 -0700
Subject: [PATCH 228/376] add FAQ doc under 'serving' (#5946)

---
 docs/source/index.rst       |  1 +
 docs/source/serving/faq.rst | 12 ++++++++++++
 2 files changed, 13 insertions(+)
 create mode 100644 docs/source/serving/faq.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 8fd25ce82..e99a0a9a1 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -84,6 +84,7 @@ Documentation
    serving/usage_stats
    serving/integrations
    serving/tensorizer
+   serving/faq
 
 .. toctree::
    :maxdepth: 1
diff --git a/docs/source/serving/faq.rst b/docs/source/serving/faq.rst
new file mode 100644
index 000000000..daa81d7c8
--- /dev/null
+++ b/docs/source/serving/faq.rst
@@ -0,0 +1,12 @@
+Frequently Asked Questions
+========================
+
+    Q: How can I serve multiple models on a single port using the OpenAI API?
+
+A: Assuming that you're referring to using OpenAI compatible server to serve multiple models at once, that is not currently supported, you can run multiple instances of the server (each serving a different model) at the same time, and have another layer to route the incoming request to the correct server accordingly.
+
+----------------------------------------
+
+    Q: Which model to use for offline inference embedding?
+
+A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. Instead models, such as Llama-3-8b, Mistral-7B-Instruct-v0.3, are generation models rather than an embedding model
\ No newline at end of file
-- 
GitLab


From 8e0817c262da5c104f651a0ce4ac9ee0cd76f4ce Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Mon, 1 Jul 2024 15:09:11 -0700
Subject: [PATCH 229/376] [Bugfix][Doc] Fix Doc Formatting (#6048)

---
 docs/source/serving/faq.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/serving/faq.rst b/docs/source/serving/faq.rst
index daa81d7c8..7b0374be8 100644
--- a/docs/source/serving/faq.rst
+++ b/docs/source/serving/faq.rst
@@ -1,5 +1,5 @@
 Frequently Asked Questions
-========================
+===========================
 
     Q: How can I serve multiple models on a single port using the OpenAI API?
 
@@ -9,4 +9,4 @@ A: Assuming that you're referring to using OpenAI compatible server to serve mul
 
     Q: Which model to use for offline inference embedding?
 
-A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. Instead models, such as Llama-3-8b, Mistral-7B-Instruct-v0.3, are generation models rather than an embedding model
\ No newline at end of file
+A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. Instead models, such as Llama-3-8b, Mistral-7B-Instruct-v0.3, are generation models rather than an embedding model
-- 
GitLab


From c4059ea54ff36e62b03f1a88baa41ca72dc695e4 Mon Sep 17 00:00:00 2001
From: Antoni Baum <antoni.baum@protonmail.com>
Date: Mon, 1 Jul 2024 16:08:58 -0700
Subject: [PATCH 230/376] [Bugfix] Add explicit `end_forward` calls to
 flashinfer (#6044)

---
 vllm/attention/backends/flashinfer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 4ecac7379..4d023282f 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -126,6 +126,7 @@ class FlashInferMetadata(AttentionMetadata):
             self.paged_kv_indptr = self.paged_kv_indptr.to(self.device)
             self.paged_kv_last_page_len = self.paged_kv_last_page_len.to(
                 self.device)
+            self.prefill_wrapper.end_forward()
             self.prefill_wrapper.begin_forward(
                 self.query_start_loc, self.paged_kv_indptr,
                 self.paged_kv_indices, self.paged_kv_last_page_len,
@@ -142,6 +143,7 @@ class FlashInferMetadata(AttentionMetadata):
                     self.device)
 
             assert self.decode_wrapper is not None
+            self.decode_wrapper.end_forward()
             self.decode_wrapper.begin_forward(
                 self.paged_kv_indptr,
                 self.paged_kv_indices,
-- 
GitLab


From c87ebc3ef9ae6e8d6babbca782510ff924b3abc7 Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Mon, 1 Jul 2024 16:17:58 -0700
Subject: [PATCH 231/376] [BugFix] Ensure worker model loop is always stopped
 at the right time (#5987)

---
 vllm/engine/llm_engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index f7e38c0e6..5886ebc24 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -838,7 +838,7 @@ class LLMEngine:
         # Tracing
         self.do_tracing(scheduler_outputs)
 
-        if not request_outputs:
+        if not self.has_unfinished_requests():
             # Stop the execute model loop in parallel workers until there are
             # more requests to process. This avoids waiting indefinitely in
             # torch.distributed ops which may otherwise timeout, and unblocks
-- 
GitLab


From e373853e12c890964e21fa0ac9b46fee48fa7c76 Mon Sep 17 00:00:00 2001
From: James Whedbee <jamesw@telnyx.com>
Date: Mon, 1 Jul 2024 18:39:10 -0500
Subject: [PATCH 232/376] [Frontend] Relax api url assertion for openai
 benchmarking (#6046)

---
 benchmarks/backend_request_func.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index fd2461ce5..fe29c6708 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -225,8 +225,8 @@ async def async_request_openai_completions(
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(
-        "v1/completions"
-    ), "OpenAI Completions API URL must end with 'v1/completions'."
+        "completions"
+    ), "OpenAI Completions API URL must end with 'completions'."
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
         assert not request_func_input.use_beam_search
@@ -304,8 +304,8 @@ async def async_request_openai_chat_completions(
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(
-        "v1/chat/completions"
-    ), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
+        "chat/completions"
+    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
         assert not request_func_input.use_beam_search
-- 
GitLab


From 54600709b6d419fb243ce718a48ab7d40f5c3eb7 Mon Sep 17 00:00:00 2001
From: Thomas Parnell <tpa@zurich.ibm.com>
Date: Tue, 2 Jul 2024 01:40:02 +0200
Subject: [PATCH 233/376] [Model] Changes to MLPSpeculator to support
 tie_weights and input_scale (#5965)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Joshua Rosenkranz <jmrosenk@us.ibm.com>
---
 vllm/model_executor/models/mlp_speculator.py  | 94 ++++++++++++++-----
 .../configs/mlp_speculator.py                 | 12 +++
 2 files changed, 81 insertions(+), 25 deletions(-)

diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py
index 6e6b2d8a7..290a703af 100644
--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -13,6 +13,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import SamplerOutput
 from vllm.transformers_utils.configs import MLPSpeculatorConfig
 
+SQRT2 = 2**0.5
+
 
 class MLPSpeculatorLayerNorm(nn.Module):
     """
@@ -26,24 +28,30 @@ class MLPSpeculatorLayerNorm(nn.Module):
         Safety term to prevent division by zero. Make sure the chosen value
          fits in the range of your encoding scheme
          (i.e. fp16 requires eps >= 6e-8).
+    elementwise_scale_and_shift : bool
+        Include a learned scaling and shift term after normalization.
     """
 
     def __init__(
         self,
         normalized_shape,
         eps=1e-06,
+        elementwise_scale_and_shift=True,
     ):
         super(MLPSpeculatorLayerNorm, self).__init__()
-        self.weight = nn.Parameter(torch.empty(normalized_shape))
-        self.bias = nn.Parameter(torch.empty(normalized_shape))
+        self.elementwise_scale_and_shift = elementwise_scale_and_shift
+        if self.elementwise_scale_and_shift:
+            self.weight = nn.Parameter(torch.empty(normalized_shape))
+            self.bias = nn.Parameter(torch.empty(normalized_shape))
         self.eps = eps
 
     def forward(self, x):
         xf = x
         xf = xf * torch.rsqrt(xf.pow(2).mean(-1, keepdim=True) + self.eps)
         x = xf.type_as(x)
-        x = self.weight * x
-        x = x + self.bias
+        if self.elementwise_scale_and_shift:
+            x = self.weight * x
+            x = x + self.bias
         return x
 
 
@@ -59,27 +67,60 @@ class MLPSpeculator(nn.Module):
 
         self.max_speculative_tokens = config.num_lookahead_tokens
 
-        self.emb = nn.ModuleList([
-            VocabParallelEmbedding(config.vocab_size,
-                                   self.inner_dim,
-                                   org_num_embeddings=config.vocab_size)
-            for _ in range(self.max_speculative_tokens)
-        ])
-
-        self.proj = nn.ModuleList([
-            nn.Linear((self.emb_dim if i == 0 else self.inner_dim),
-                      self.inner_dim,
-                      bias=False) for i in range(self.max_speculative_tokens)
-        ])
-
-        self.head = nn.ModuleList([
-            nn.Linear(self.inner_dim, self.vocab_size, bias=False)
-            for _ in range(self.max_speculative_tokens)
-        ])
-        self.ln = nn.ModuleList([
-            MLPSpeculatorLayerNorm(self.inner_dim)
-            for _ in range(self.max_speculative_tokens)
-        ])
+        self.tie_weights = config.tie_weights
+        self.scale_input = config.scale_input
+
+        if self.tie_weights:
+            assert (
+                self.n_predict >
+                1), "You cannot tie weights between stages when only 1 exists"
+            embedding = VocabParallelEmbedding(
+                config.vocab_size,
+                self.inner_dim,
+                org_num_embeddings=config.vocab_size)
+            self.emb = nn.ModuleList([embedding] * self.max_speculative_tokens)
+
+            # the initial projection from the base model may
+            # have a different size, so that stays separate.
+            proj_first = nn.Linear(self.emb_dim, self.inner_dim, bias=False)
+            proj_tied = nn.Linear(self.inner_dim, self.inner_dim, bias=False)
+            self.proj = nn.ModuleList([proj_first] + [proj_tied] *
+                                      (self.max_speculative_tokens - 1))
+
+            head = nn.Linear(self.inner_dim, self.vocab_size, bias=False)
+            self.head = nn.ModuleList([head] * self.max_speculative_tokens)
+
+            ln = MLPSpeculatorLayerNorm(self.inner_dim,
+                                        elementwise_scale_and_shift=True)
+            self.ln = nn.ModuleList([ln] * self.max_speculative_tokens)
+
+        else:
+            self.emb = nn.ModuleList([
+                VocabParallelEmbedding(config.vocab_size,
+                                       self.inner_dim,
+                                       org_num_embeddings=config.vocab_size)
+                for _ in range(self.max_speculative_tokens)
+            ])
+
+            self.proj = nn.ModuleList([
+                nn.Linear((self.emb_dim if i == 0 else self.inner_dim),
+                          self.inner_dim,
+                          bias=False)
+                for i in range(self.max_speculative_tokens)
+            ])
+
+            self.head = nn.ModuleList([
+                nn.Linear(self.inner_dim, self.vocab_size, bias=False)
+                for _ in range(self.max_speculative_tokens)
+            ])
+            self.ln = nn.ModuleList([
+                MLPSpeculatorLayerNorm(self.inner_dim,
+                                       elementwise_scale_and_shift=True)
+                for _ in range(self.max_speculative_tokens)
+            ])
+        if self.scale_input:
+            self.ln0 = MLPSpeculatorLayerNorm(
+                self.emb_dim, elementwise_scale_and_shift=False)
 
         self.state_weight = 0.5**(0.5 / config.n_predict)
         self.emb_weight = math.sqrt(
@@ -105,6 +146,9 @@ class MLPSpeculator(nn.Module):
         # b x 1 x d
         previous_hidden_states = previous_hidden_states.unsqueeze(1)
 
+        if self.scale_input:
+            previous_hidden_states = self.ln0(previous_hidden_states) / SQRT2
+
         # b x 1
         last_tokens = input_ids.unsqueeze(1)
 
diff --git a/vllm/transformers_utils/configs/mlp_speculator.py b/vllm/transformers_utils/configs/mlp_speculator.py
index e1c1f4a96..946af4e91 100644
--- a/vllm/transformers_utils/configs/mlp_speculator.py
+++ b/vllm/transformers_utils/configs/mlp_speculator.py
@@ -17,6 +17,8 @@ class MLPSpeculatorConfig(PretrainedConfig):
                  n_predict: int = 3,
                  top_k_tokens_per_head: Optional[List[int]] = None,
                  n_candidates: int = 5,
+                 tie_weights: bool = False,
+                 scale_input: bool = False,
                  **kwargs):
         """
         Initialize an MLPSpeculatorConfig
@@ -38,6 +40,14 @@ class MLPSpeculatorConfig(PretrainedConfig):
                 NOTE: This parameter is currently unused.
             n_candidates: int
                 number of child candidates to create per sequence
+            tie_weights: bool
+                If true, use a single set of weights for every model
+                head/stage after the first. The initial projection
+                from the base model may have a different size, so that
+                stays separate.
+            scale_input: bool
+                if True, will scale the initial hidden states from
+                the base model.
         """
         if top_k_tokens_per_head is None:
             top_k_tokens_per_head = [5, 4, 3]
@@ -49,5 +59,7 @@ class MLPSpeculatorConfig(PretrainedConfig):
         self.top_k_tokens_per_head = top_k_tokens_per_head
         self.n_candidates = n_candidates
         self.num_lookahead_tokens = n_predict
+        self.tie_weights = tie_weights
+        self.scale_input = scale_input
 
         super().__init__(**kwargs)
-- 
GitLab


From 3476ed0809ec91a3457da0cb90543133a4f4b519 Mon Sep 17 00:00:00 2001
From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com>
Date: Mon, 1 Jul 2024 23:10:37 -0400
Subject: [PATCH 234/376] [Core] Optimize block_manager_v2 vs block_manager_v1
 (to make V2 default)  (#5602)

---
 benchmarks/benchmark_latency.py               |   4 +
 tests/conftest.py                             |   2 +-
 tests/core/block/test_block_table.py          |   5 +-
 .../block/test_cpu_gpu_block_allocator.py     |  24 +-
 tests/core/block/test_naive_block.py          |   6 +-
 tests/core/block/test_prefix_caching_block.py | 106 ++-
 tests/spec_decode/test_batch_expansion.py     |   8 +-
 vllm/core/block/block_table.py                |  85 ++-
 vllm/core/block/common.py                     | 198 +++--
 vllm/core/block/cpu_gpu_block_allocator.py    |  84 ++-
 vllm/core/block/interfaces.py                 |  56 +-
 vllm/core/block/naive_block.py                | 216 ++++--
 vllm/core/block/prefix_caching_block.py       | 693 ++++++++++++------
 vllm/core/block_manager_v2.py                 | 150 ++--
 vllm/engine/llm_engine.py                     |   5 +-
 vllm/entrypoints/openai/serving_completion.py |   2 +-
 vllm/model_executor/sampling_metadata.py      |   4 +-
 vllm/outputs.py                               |   4 +-
 vllm/sequence.py                              |  69 +-
 19 files changed, 1189 insertions(+), 532 deletions(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index a46ee1581..8d0554b0f 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -46,6 +46,7 @@ def main(args: argparse.Namespace):
         load_format=args.load_format,
         distributed_executor_backend=args.distributed_executor_backend,
         otlp_traces_endpoint=args.otlp_traces_endpoint,
+        enable_prefix_caching=args.enable_prefix_caching,
     )
 
     sampling_params = SamplingParams(
@@ -220,6 +221,9 @@ if __name__ == '__main__':
         action='store_true',
         help='If True, the prefill requests can be chunked based on the '
         'max_num_batched_tokens')
+    parser.add_argument("--enable-prefix-caching",
+                        action='store_true',
+                        help="Enable automatic prefix caching")
     parser.add_argument('--use-v2-block-manager', action='store_true')
     parser.add_argument(
         "--ray-workers-use-nsight",
diff --git a/tests/conftest.py b/tests/conftest.py
index 0bd24905e..ac802d03b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -474,7 +474,7 @@ class VllmRunner:
             req_sample_output_strs: List[str] = []
             for sample in req_output.outputs:
                 output_str = sample.text
-                output_ids = sample.token_ids
+                output_ids = list(sample.token_ids)
                 req_sample_output_ids.append(prompt_ids + output_ids)
                 req_sample_output_strs.append(prompt_str + output_str)
             outputs.append((req_sample_output_ids, req_sample_output_strs))
diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py
index 496774c8d..e2391a568 100644
--- a/tests/core/block/test_block_table.py
+++ b/tests/core/block/test_block_table.py
@@ -373,8 +373,9 @@ def test_cow(block_size: int, sequence_len: int, append_len: int,
                                    block_size) - (sequence_len // block_size)
 
     original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
-    original_block_ids = original_block_table.physical_block_ids
+    original_block_ids = original_block_table.physical_block_ids[:]
 
+    print("original_block_ids = {}".format(original_block_ids))
     forked_block_table = original_block_table.fork()
 
     # Expect no additional allocation (copy on _write_).
@@ -457,7 +458,7 @@ def test_cow_lookahead_simple(block_size: int, sequence_len: int,
 
     # Allocate lookahead slots.
     original_block_table.ensure_num_empty_slots(lookahead_slots)
-    original_block_ids = original_block_table.physical_block_ids
+    original_block_ids = original_block_table.physical_block_ids[:]
 
     forked_block_table = original_block_table.fork()
 
diff --git a/tests/core/block/test_cpu_gpu_block_allocator.py b/tests/core/block/test_cpu_gpu_block_allocator.py
index 44a5be6c1..15b76d909 100644
--- a/tests/core/block/test_cpu_gpu_block_allocator.py
+++ b/tests/core/block/test_cpu_gpu_block_allocator.py
@@ -8,8 +8,8 @@ from vllm.utils import Device, chunk_list
 @pytest.mark.parametrize("num_gpu_blocks", [1024])
 @pytest.mark.parametrize("block_size", [16])
 @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
-def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int,
-                          block_size: int, allocator_type: str):
+def test_allocate_mutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
+                                block_size: int, allocator_type: str):
     allocator = CpuGpuBlockAllocator.create(
         allocator_type=allocator_type,
         num_gpu_blocks=num_gpu_blocks,
@@ -21,14 +21,14 @@ def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int,
     assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
 
     cpu_blocks = [
-        allocator.allocate_mutable(prev_block=None, device=Device.CPU)
+        allocator.allocate_mutable_block(prev_block=None, device=Device.CPU)
         for _ in range(num_cpu_blocks)
     ]
     assert allocator.get_num_free_blocks(Device.CPU) == 0
     assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
 
     gpu_blocks = [
-        allocator.allocate_mutable(prev_block=None, device=Device.GPU)
+        allocator.allocate_mutable_block(prev_block=None, device=Device.GPU)
         for _ in range(num_gpu_blocks)
     ]
     assert allocator.get_num_free_blocks(Device.CPU) == 0
@@ -47,8 +47,8 @@ def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int,
 @pytest.mark.parametrize("num_gpu_blocks", [1024])
 @pytest.mark.parametrize("block_size", [2])
 @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
-def test_allocate_immutable(num_cpu_blocks: int, num_gpu_blocks: int,
-                            block_size: int, allocator_type: str):
+def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
+                                  block_size: int, allocator_type: str):
     allocator = CpuGpuBlockAllocator.create(
         allocator_type=allocator_type,
         num_gpu_blocks=num_gpu_blocks,
@@ -67,18 +67,18 @@ def test_allocate_immutable(num_cpu_blocks: int, num_gpu_blocks: int,
     assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
 
     cpu_blocks = [
-        allocator.allocate_immutable(prev_block=None,
-                                     token_ids=token_ids,
-                                     device=Device.CPU)
+        allocator.allocate_immutable_block(prev_block=None,
+                                           token_ids=token_ids,
+                                           device=Device.CPU)
         for token_ids in cpu_token_ids
     ]
     assert allocator.get_num_free_blocks(Device.CPU) == 0
     assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
 
     gpu_blocks = [
-        allocator.allocate_immutable(prev_block=None,
-                                     token_ids=token_ids,
-                                     device=Device.GPU)
+        allocator.allocate_immutable_block(prev_block=None,
+                                           token_ids=token_ids,
+                                           device=Device.GPU)
         for token_ids in gpu_token_ids
     ]
     assert allocator.get_num_free_blocks(Device.CPU) == 0
diff --git a/tests/core/block/test_naive_block.py b/tests/core/block/test_naive_block.py
index edcdc0c7d..9821ac41b 100644
--- a/tests/core/block/test_naive_block.py
+++ b/tests/core/block/test_naive_block.py
@@ -14,11 +14,11 @@ class TestNaiveBlockAllocator:
                                prev_block: Optional[Block],
                                token_ids: List[int]):
         if allocate_type == "immutable":
-            allocate_block = lambda: allocator.allocate_immutable(
+            allocate_block = lambda: allocator.allocate_immutable_block(
                 prev_block=prev_block, token_ids=token_ids)
         elif allocate_type == "mutable":
-            allocate_block = lambda: allocator.allocate_mutable(prev_block=
-                                                                prev_block)
+            allocate_block = lambda: allocator.allocate_mutable_block(
+                prev_block=prev_block)
         else:
             raise ValueError()
 
diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py
index fcf32cbe9..95858268a 100644
--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -26,11 +26,10 @@ class TestPrefixCachingBlock:
         token_ids = list(range(num_to_fill))
         mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)
 
-        block_with_prev = PrefixCachingBlock(
-            prev_block=None,
-            token_ids=token_ids,
-            block_size=block_size,
-            prefix_caching_allocator=mock_allocator)
+        block_with_prev = PrefixCachingBlock(prev_block=None,
+                                             token_ids=token_ids,
+                                             block_size=block_size,
+                                             allocator=mock_allocator)
 
         if is_curr_block_full:
             # Expect hash since block is full.
@@ -71,7 +70,7 @@ class TestPrefixCachingBlock:
             prev_block=previous_block,
             token_ids=token_ids,
             block_size=block_size,
-            prefix_caching_allocator=mock_allocator,
+            allocator=mock_allocator,
         )
 
         if is_curr_block_full and prev_block_has_hash:
@@ -138,7 +137,7 @@ class TestPrefixCachingBlock:
                 prev_block=prev_block,
                 token_ids=[],
                 block_size=block_size,
-                prefix_caching_allocator=allocator,
+                allocator=allocator,
             )
 
             tokens_to_append = token_ids[block_number *
@@ -159,11 +158,11 @@ class TestPrefixCachingBlockAllocator:
                                prev_block: Optional[Block],
                                token_ids: List[int]):
         if allocate_type == "immutable":
-            allocate_block = lambda: allocator.allocate_immutable(
+            allocate_block = lambda: allocator.allocate_immutable_block(
                 prev_block=prev_block, token_ids=token_ids)
         elif allocate_type == "mutable":
-            allocate_block = lambda: allocator.allocate_mutable(prev_block=
-                                                                prev_block)
+            allocate_block = lambda: allocator.allocate_mutable_block(
+                prev_block=prev_block)
         else:
             raise ValueError()
 
@@ -233,12 +232,13 @@ class TestPrefixCachingBlockAllocator:
 
         # Expect allocation with unseen hash to fail.
         with pytest.raises(BlockAllocator.NoFreeBlocksError):
-            allocator.allocate_immutable(prev_block=chain[-1],
-                                         token_ids=list(range(block_size)))
+            allocator.allocate_immutable_block(prev_block=chain[-1],
+                                               token_ids=list(
+                                                   range(block_size)))
 
         # Expect mutable allocation to fail.
         with pytest.raises(BlockAllocator.NoFreeBlocksError):
-            allocator.allocate_mutable(prev_block=chain[-1])
+            allocator.allocate_mutable_block(prev_block=chain[-1])
 
         # Expect allocation of exact same chain to pass.
         second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
@@ -270,7 +270,7 @@ class TestPrefixCachingBlockAllocator:
 
         # Expect mutable allocation to fail.
         with pytest.raises(BlockAllocator.NoFreeBlocksError):
-            allocator.allocate_mutable(prev_block=None)
+            allocator.allocate_mutable_block(prev_block=None)
 
         block_to_free = chain[-1]
 
@@ -280,11 +280,11 @@ class TestPrefixCachingBlockAllocator:
             allocator.free(block_to_free)
             assert block_to_free.block_id is None, i
 
-            new_block = allocator.allocate_mutable(prev_block=None)
+            new_block = allocator.allocate_mutable_block(prev_block=None)
             assert new_block.block_id == block_id, i
 
             with pytest.raises(BlockAllocator.NoFreeBlocksError):
-                allocator.allocate_mutable(prev_block=None)
+                allocator.allocate_mutable_block(prev_block=None)
 
             block_to_free = new_block
 
@@ -376,7 +376,6 @@ class TestPrefixCachingBlockAllocator:
 
         # Create token ids that will exhaust all blocks.
         token_ids = list(range(num_blocks_to_consume * block_size))
-        blocks = list(range(num_blocks_to_consume))
 
         first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
             block_size=block_size,
@@ -384,9 +383,6 @@ class TestPrefixCachingBlockAllocator:
             allocator=allocator,
         )
 
-        # mark all blocks in first chain as computed
-        allocator.mark_blocks_as_computed(blocks)
-
         # After zero_point, second_chain's token_ids would be set -1, which
         # make it different from here comparing with first_chain
         zero_point = random.randint(1, len(token_ids) - 1)
@@ -424,15 +420,16 @@ class TestPrefixCachingBlockAllocator:
                                                 block_size=block_size)
         token_ids = list(range(block_size))
 
-        block = allocator.allocate_immutable(prev_block=None,
-                                             token_ids=token_ids)
+        block = allocator.allocate_immutable_block(prev_block=None,
+                                                   token_ids=token_ids)
 
         assert allocator._refcounter.get(block.block_id) == 1
-        m = allocator.allocate_mutable(prev_block=None)
+        m = allocator.allocate_mutable_block(prev_block=None)
 
         block_id = m.block_id
         for i in range(block_size):
             m.append_token_ids([i])
+
         # After block get promoted to immutable from mutable, if there is
         # already same content hash block, then it shall be released into
         # hashless_allocator
@@ -452,48 +449,79 @@ class TestPrefixCachingBlockAllocator:
 
         all_blocks_list = [i for i in range(num_blocks)]
         zero_ref = {i: 0 for i in range(num_blocks)}
+        one_ref = {i: 1 for i in range(num_blocks)}
         allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                 block_size=block_size)
         token_ids = list(range(num_blocks * block_size))
 
-        # now we have num_blocks free blocks in hashless allocator
-        # with internal tracking list _blocks _cached_blocks and evictor
-        # empty and block's ref shall be 0
+        # Verify initial/pre-alloc state
+
+        # Ensure all blocks are free inside hashless allocator
         assert list(allocator._hashless_allocator._free_block_indices
                     ) == all_blocks_list
-        assert len(allocator._blocks.keys()) == 0
+        # Ensure no tracked blocks
+        assert len(allocator._block_tracker.keys()) == num_blocks
+        for block_id in range(num_blocks):
+            assert not allocator._block_tracker[block_id].active
+        # Ensure no cached blocks
         assert len(allocator._cached_blocks.values()) == 0
+        # Ensure no evicted blocks
         assert len(allocator.evictor.free_table.keys()) == 0
+        # Ensure 0s ref counts for all blocks
         assert allocator._refcounter._refcounts == zero_ref
 
         # Allocate immutable chains with only one block residuled in
         new_block = []
         for i in range(num_blocks):
-            block = allocator.allocate_immutable(
+            block = allocator.allocate_immutable_block(
                 prev_block=None,
                 token_ids=token_ids[block_size * i:block_size * (i + 1)])
             new_block.append(block)
 
+        # Verify post-alloc state
+
+        # Ensure no blocks are free inside hashless allocator
+        assert (len(allocator._hashless_allocator._free_block_indices) == 0)
+        # Ensure all blocks are tracked
+        assert len(allocator._block_tracker.keys()) == num_blocks
+        for block_id in range(num_blocks):
+            assert allocator._block_tracker[block_id].active
+        # Ensure all blocks are cached (all promoted)
+        assert len(allocator._cached_blocks.values()) == num_blocks
+        # Ensure no evicted blocks
+        assert len(allocator.evictor.free_table.keys()) == 0
+        # Ensure 1s ref counts for all blocks
+        assert allocator._refcounter._refcounts == one_ref
+
         # Free all blocks, and now all blocks shall be in the evictor
-        # there shall be no tracking data left in _blocks
+        # there shall be no tracking data left in _block_tracker
         # all blocks shall be tracked in _cached_blocks
         # all blocks' ref shall be zero
         for block in new_block:
             allocator.free(block)
 
-        assert len(allocator._blocks.keys()) == 0
+        # Verify post-free state
+
+        # Ensure no tracked blocks
+        assert len(allocator._block_tracker.keys()) == num_blocks
+        for block_id in range(num_blocks):
+            assert not allocator._block_tracker[block_id].active
+        # Ensure no blocks in hashless allocator (all promoted)
         assert len(allocator._hashless_allocator._free_block_indices) == 0
+        # Ensure all blocks are cached
         assert list(allocator._cached_blocks.values()) == all_blocks_list
+        # Ensure all blocks are inside the evictor
         assert list(allocator.evictor.free_table.keys()) == all_blocks_list
+        # Ensure 0s refcounts
         assert allocator._refcounter._refcounts == zero_ref
 
         # Allocate a mutable block, and the first block shall be evicted
         # and set its content hash into None, ref to 1
-        mutable = allocator.allocate_mutable(prev_block=None)
+        mutable = allocator.allocate_mutable_block(prev_block=None)
 
         assert mutable.block_id == 0
         assert mutable.content_hash is None
-        assert 0 in allocator._blocks
+        assert allocator._block_tracker[0].active
         assert allocator._refcounter.get(0) == 1
         assert 0 not in allocator._cached_blocks
         assert 0 not in allocator.evictor
@@ -502,27 +530,27 @@ class TestPrefixCachingBlockAllocator:
         # hashless allocator
         allocator.free(mutable)
 
-        assert len(allocator._blocks.keys()) == 0
+        assert not allocator._block_tracker[0].active
         assert allocator._refcounter._refcounts == zero_ref
         assert 0 not in allocator._cached_blocks
         assert 0 not in allocator.evictor
         assert 0 in allocator._hashless_allocator._free_block_indices
 
-        # when allocate immutable with first block_size tokens, we
+        # When allocate immutable with first block_size tokens, we
         # shall get free block from hashless allocator, thus no block left
         # in hashless
-        block = allocator.allocate_immutable(prev_block=None,
-                                             token_ids=token_ids[:block_size])
+        block = allocator.allocate_immutable_block(
+            prev_block=None, token_ids=token_ids[:block_size])
 
         assert block.block_id == 0
         assert len(allocator._hashless_allocator._free_block_indices) == 0
-        assert 0 in allocator._blocks
+        assert allocator._block_tracker[0].active
         assert 0 in allocator._cached_blocks.values()
         assert allocator._refcounter.get(0) == 1
         assert 0 not in allocator.evictor
 
         # allocate mutable block again, it shall be popped from evictor
-        mutable = allocator.allocate_mutable(prev_block=None)
+        mutable = allocator.allocate_mutable_block(prev_block=None)
         assert len(allocator._hashless_allocator._free_block_indices) == 0
         assert mutable.block_id not in allocator.evictor.free_table
         assert allocator._refcounter.get(mutable.block_id) == 1
@@ -619,7 +647,7 @@ class TestPrefixCachingBlockAllocator:
             block_token_ids = token_ids[block_number *
                                         block_size:(block_number + 1) *
                                         block_size]
-            prev_block = allocator.allocate_immutable(
+            prev_block = allocator.allocate_immutable_block(
                 prev_block=prev_block, token_ids=block_token_ids)
             blocks.append(prev_block)
 
diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py
index 42dd90422..c350a2c55 100644
--- a/tests/spec_decode/test_batch_expansion.py
+++ b/tests/spec_decode/test_batch_expansion.py
@@ -90,10 +90,10 @@ def test_create_single_target_seq_group_metadata(k: int):
 
     assert output.request_id == input_seq_group_metadata.request_id
     assert len(output.seq_data) == 1
-    assert output.seq_data[target_seq_id].get_prompt_token_ids(
-    ) == prompt_tokens
-    assert output.seq_data[target_seq_id].get_output_token_ids(
-    ) == prev_output_tokens + token_ids
+    assert output.seq_data[target_seq_id].get_prompt_token_ids() == tuple(
+        prompt_tokens)
+    assert output.seq_data[target_seq_id].get_output_token_ids() == tuple(
+        prev_output_tokens + token_ids)
 
     assert len(output.block_tables) == 1
     assert output.block_tables[
diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index d705f3d91..49e63c231 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -1,5 +1,6 @@
 from typing import List, Optional
 
+from vllm.core.block.common import BlockList
 from vllm.core.block.interfaces import Block, DeviceAwareBlockAllocator
 from vllm.utils import Device, cdiv, chunk_list
 
@@ -47,12 +48,10 @@ class BlockTable:
         self._allocator = block_allocator
         if _blocks is None:
             _blocks = []
-        self._blocks: List[Block] = _blocks
+        self._blocks: BlockList = BlockList(_blocks)
 
         self._max_block_sliding_window = max_block_sliding_window
-        # Use helper method instead of directly calculating, as blocks
-        # may not be allocated.
-        self._num_full_slots = len(self._get_all_token_ids())
+        self._num_full_slots = self._get_num_token_ids()
 
     @staticmethod
     def get_num_required_blocks(token_ids: List[int], block_size: int) -> int:
@@ -88,11 +87,18 @@ class BlockTable:
         """
         assert not self._is_allocated
         assert token_ids
-        self._blocks = self._allocate_blocks_for_token_ids(prev_block=None,
-                                                           token_ids=token_ids,
-                                                           device=device)
+        blocks = self._allocate_blocks_for_token_ids(prev_block=None,
+                                                     token_ids=token_ids,
+                                                     device=device)
+        self.update(blocks)
         self._num_full_slots = len(token_ids)
 
+    def update(self, blocks: List[Block]) -> None:
+        """Resets the table to the newly provided blocks 
+        (with their corresponding block ids)
+        """
+        self._blocks.update(blocks)
+
     def append_token_ids(self,
                          token_ids: List[int],
                          num_lookahead_slots: int = 0,
@@ -140,11 +146,11 @@ class BlockTable:
                                     num_lookahead_slots)
 
         # Update the blocks with the new tokens
-        blocks = self._blocks[self._num_full_slots // self._block_size:]
+        first_block_idx = self._num_full_slots // self._block_size
         token_blocks = self._chunk_token_blocks_for_append(token_ids)
 
-        for block, token_block in zip(blocks, token_blocks):
-            block.append_token_ids(token_block)
+        for i, token_block in enumerate(token_blocks):
+            self._blocks.append_token_ids(first_block_idx + i, token_block)
 
         self._num_full_slots += len(token_ids)
 
@@ -174,8 +180,8 @@ class BlockTable:
         for _ in range(blocks_to_allocate):
             assert len(self._blocks) > 0
             self._blocks.append(
-                self._allocator.allocate_mutable(prev_block=self._blocks[-1],
-                                                 device=device))
+                self._allocator.allocate_mutable_block(
+                    prev_block=self._blocks[-1], device=device))
 
     def fork(self) -> "BlockTable":
         """Creates a new BlockTable instance with a copy of the blocks from the
@@ -209,12 +215,12 @@ class BlockTable:
         is set to `None`.
         """
         assert self._is_allocated
-        for block in self._blocks:
+        for block in self.blocks:
             self._allocator.free(block)
-        self._blocks = []
+        self._blocks.reset()
 
     @property
-    def physical_block_ids(self) -> List[Optional[int]]:
+    def physical_block_ids(self) -> List[int]:
         """Returns a list of physical block indices for the blocks in the
         BlockTable.
 
@@ -228,7 +234,7 @@ class BlockTable:
                 BlockTable.
         """
         assert self._is_allocated
-        return [block.block_id for block in self._blocks]
+        return self._blocks.ids()
 
     def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]:
         """Get the number of "unseen" tokens in the sequence.
@@ -253,17 +259,31 @@ class BlockTable:
                                        token_ids: List[int],
                                        device: Device) -> List[Block]:
         blocks: List[Block] = []
-        for block_token_ids in chunk_list(token_ids, self._block_size):
-            if len(block_token_ids) == self._block_size:
-                # If the block is full, create an immutable block.
-                prev_block = self._allocator.allocate_immutable(
-                    prev_block, token_ids=block_token_ids, device=device)
+
+        block_token_ids = []
+        tail_token_ids = []
+        for cur_token_ids in chunk_list(token_ids, self._block_size):
+            if len(cur_token_ids) == self._block_size:
+                block_token_ids.append(cur_token_ids)
             else:
-                # Else, partially fill a mutable block with token ids.
-                prev_block = self._allocator.allocate_mutable(
-                    prev_block=prev_block, device=device)
-                prev_block.append_token_ids(block_token_ids)
-            blocks.append(prev_block)
+                tail_token_ids.append(cur_token_ids)
+
+        if block_token_ids:
+            blocks.extend(
+                self._allocator.allocate_immutable_blocks(
+                    prev_block, block_token_ids=block_token_ids,
+                    device=device))
+            prev_block = blocks[-1]
+
+        if tail_token_ids:
+            assert len(tail_token_ids) == 1
+            cur_token_ids = tail_token_ids[0]
+
+            block = self._allocator.allocate_mutable_block(
+                prev_block=prev_block, device=device)
+            block.append_token_ids(cur_token_ids)
+
+            blocks.append(block)
 
         return blocks
 
@@ -274,18 +294,25 @@ class BlockTable:
         if not self._is_allocated:
             return token_ids
 
-        for block in self._blocks:
+        for block in self.blocks:
             token_ids.extend(block.token_ids)
 
         return token_ids
 
+    def _get_num_token_ids(self) -> int:
+        res = 0
+        for block in self.blocks:
+            res += len(block.token_ids)
+
+        return res
+
     @property
     def _is_allocated(self) -> bool:
         return len(self._blocks) > 0
 
     @property
-    def blocks(self) -> Optional[List[Block]]:
-        return self._blocks
+    def blocks(self) -> List[Block]:
+        return self._blocks.list()
 
     @property
     def _num_empty_slots(self) -> int:
diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py
index d2787d696..1e808e21b 100644
--- a/vllm/core/block/common.py
+++ b/vllm/core/block/common.py
@@ -1,4 +1,5 @@
-from typing import Dict, Iterable, List, Optional, Protocol, Tuple
+from collections import deque
+from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple
 
 from vllm.core.block.interfaces import Block, BlockAllocator
 
@@ -95,64 +96,40 @@ class CopyOnWriteTracker:
 
     The CopyOnWriteTracker class maintains a mapping of source block indices to
         their corresponding copy-on-write destination block indices. It works in
-        conjunction with a RefCounter and a BlockAllocator to handle reference
-        counting and block allocation.
+        conjunction with a RefCounter.
 
     Args:
         refcounter (RefCounter): The reference counter used to track block
             reference counts.
-        allocator (BlockAllocator): The block allocator used to allocate and
-            free blocks.
     """
 
-    def __init__(
-        self,
-        refcounter: RefCounterProtocol,
-        allocator: BlockAllocator,
-    ):
+    def __init__(self, refcounter: RefCounterProtocol):
         self._copy_on_writes: List[Tuple[BlockId, BlockId]] = []
         self._refcounter = refcounter
-        self._allocator = allocator
-
-    def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
-        """Performs a copy-on-write operation on the given block if it is not
-        appendable.
-
-        This method checks the reference count of the given block. If the
-        reference count is greater than 1, indicating that the block is shared,
-        a copy-on-write operation is performed. The original block is freed,
-        and a new block is allocated with the same content. The new block index
-        is returned.
-
-        Args:
-            block (Block): The block to check for copy-on-write.
 
-        Returns:
-            Optional[BlockId]: The block index of the new block if a copy-on
-                -write operation was performed, or the original block index if
-                no copy-on-write was necessary.
+    def is_appendable(self, block: Block) -> bool:
+        """Checks if the block is shared or not. If shared, then it cannot
+        be appended and needs to be duplicated via copy-on-write
         """
         block_id = block.block_id
         if block_id is None:
-            return block_id
+            return True
 
         refcount = self._refcounter.get(block_id)
-        assert refcount != 0
-        if refcount > 1:
-            src_block_id = block_id
-            # Decrement refcount of the old block.
-            self._allocator.free(block)
-
-            # Allocate a fresh new block.
-            block_id = self._allocator.allocate_mutable(
-                prev_block=block.prev_block).block_id
+        return refcount <= 1
 
-            # Track src/dst copy.
-            assert src_block_id is not None
-            assert block_id is not None
-            self._copy_on_writes.append((src_block_id, block_id))
-
-        return block_id
+    def record_cow(self, src_block_id: Optional[BlockId],
+                   trg_block_id: Optional[BlockId]) -> None:
+        """Records a copy-on-write operation from source to target block id
+        Args:
+            src_block_id (BlockId): The source block id from which to copy 
+                the data
+            trg_block_id (BlockId): The target block id to which the data
+                is copied
+        """
+        assert src_block_id is not None
+        assert trg_block_id is not None
+        self._copy_on_writes.append((src_block_id, trg_block_id))
 
     def clear_cows(self) -> List[Tuple[BlockId, BlockId]]:
         """Clears the copy-on-write tracking information and returns the current
@@ -172,6 +149,139 @@ class CopyOnWriteTracker:
         return cows
 
 
+class BlockPool:
+    """Used to pre-allocate block objects, in order to avoid excessive python
+    object allocations/deallocations.
+    The pool starts from "pool_size" objects and will increase to more objects
+    if necessary
+
+    Note that multiple block objects may point to the same physical block id,
+    which is why this pool is needed, so that it will be easier to support
+    prefix caching and more complicated sharing of physical blocks.
+    """
+
+    def __init__(self, block_size: int, create_block: Block.Factory,
+                 allocator: BlockAllocator, pool_size: int):
+        self._block_size = block_size
+        self._create_block = create_block
+        self._allocator = allocator
+        self._pool_size = pool_size
+        assert self._pool_size >= 0
+
+        self._free_ids: Deque[int] = deque(range(self._pool_size))
+        self._pool = []
+        for i in range(self._pool_size):
+            self._pool.append(
+                self._create_block(prev_block=None,
+                                   token_ids=[],
+                                   block_size=self._block_size,
+                                   allocator=self._allocator,
+                                   block_id=None))
+
+    def increase_pool(self):
+        """Doubles the internal pool size
+        """
+        cur_pool_size = self._pool_size
+        new_pool_size = cur_pool_size * 2
+        self._pool_size = new_pool_size
+
+        self._free_ids += deque(range(cur_pool_size, new_pool_size))
+
+        for i in range(cur_pool_size, new_pool_size):
+            self._pool.append(
+                self._create_block(prev_block=None,
+                                   token_ids=[],
+                                   block_size=self._block_size,
+                                   allocator=self._allocator,
+                                   block_id=None))
+
+    def init_block(self, prev_block: Optional[Block], token_ids: List[int],
+                   block_size: int, physical_block_id: Optional[int]) -> Block:
+        if len(self._free_ids) == 0:
+            self.increase_pool()
+            assert len(self._free_ids) > 0
+
+        pool_id = self._free_ids.popleft()
+
+        block = self._pool[pool_id]
+        block.__init__(  # type: ignore[misc]
+            prev_block=prev_block,
+            token_ids=token_ids,
+            block_size=block_size,
+            allocator=block._allocator,  # type: ignore[attr-defined] 
+            block_id=physical_block_id)
+        block.pool_id = pool_id  # type: ignore[attr-defined]
+        return block
+
+    def free_block(self, block: Block) -> None:
+        self._free_ids.appendleft(block.pool_id)  # type: ignore[attr-defined]
+
+
+class BlockList:
+    """This class is an optimization to allow fast-access to physical 
+    block ids. It maintains a block id list that is updated with the 
+    block list and this avoids the need to reconstruct the block id 
+    list on every iteration of the block manager
+    """
+
+    def __init__(self, blocks: List[Block]):
+        self._blocks: List[Block] = []
+        self._block_ids: List[int] = []
+
+        self.update(blocks)
+
+    def _add_block_id(self, block_id: Optional[BlockId]) -> None:
+        assert block_id is not None
+        self._block_ids.append(block_id)
+
+    def _update_block_id(self, block_index: int,
+                         new_block_id: Optional[BlockId]) -> None:
+        assert new_block_id is not None
+        self._block_ids[block_index] = new_block_id
+
+    def update(self, blocks: List[Block]):
+        self._blocks = blocks
+
+        # Cache block ids for fast query
+        self._block_ids = []
+        for block in self._blocks:
+            self._add_block_id(block.block_id)
+
+    def append_token_ids(self, block_index: int, token_ids: List[int]) -> None:
+        block = self._blocks[block_index]
+        prev_block_id = block.block_id
+
+        block.append_token_ids(token_ids)
+
+        # CoW or promotion may update the internal block_id
+        if prev_block_id != block.block_id:
+            self._update_block_id(block_index, block.block_id)
+
+    def append(self, new_block: Block):
+        self._blocks.append(new_block)
+        self._add_block_id(new_block.block_id)
+
+    def __len__(self) -> int:
+        return len(self._blocks)
+
+    def __getitem__(self, block_index: int) -> Block:
+        return self._blocks[block_index]
+
+    def __setitem__(self, block_index: int, new_block: Block) -> None:
+        self._blocks[block_index] = new_block
+        self._update_block_id(block_index, new_block.block_id)
+
+    def reset(self):
+        self._blocks = []
+        self._block_ids = []
+
+    def list(self) -> List[Block]:
+        return self._blocks
+
+    def ids(self) -> List[int]:
+        return self._block_ids
+
+
 def get_all_blocks_recursively(last_block: Block) -> List[Block]:
     """Retrieves all the blocks in a sequence starting from the last block.
 
diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py
index 255aae9d1..5287cd9c1 100644
--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -113,11 +113,11 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
     def allocate_or_get_null_block(self) -> Block:
         if self._null_block is None:
             self._null_block = NullBlock(
-                self.allocate_mutable(None, Device.GPU))
+                self.allocate_mutable_block(None, Device.GPU))
         return self._null_block
 
-    def allocate_mutable(self, prev_block: Optional[Block],
-                         device: Device) -> Block:
+    def allocate_mutable_block(self, prev_block: Optional[Block],
+                               device: Device) -> Block:
         """Allocates a new mutable block on the specified device.
 
         Args:
@@ -128,10 +128,31 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
         Returns:
             Block: The newly allocated mutable block.
         """
-        return self._allocators[device].allocate_mutable(prev_block)
+        return self._allocators[device].allocate_mutable_block(prev_block)
 
-    def allocate_immutable(self, prev_block: Optional[Block],
-                           token_ids: List[int], device: Device) -> Block:
+    def allocate_immutable_blocks(self, prev_block: Optional[Block],
+                                  block_token_ids: List[List[int]],
+                                  device: Optional[Device]) -> List[Block]:
+        """Allocates a new group of immutable blocks with the provided block 
+        token IDs on the specified device.
+
+        Args:
+            prev_block (Optional[Block]): The previous block in the sequence.
+                Used for prefix hashing.
+            block_token_ids (List[int]): The list of block token IDs to be 
+                stored in the new blocks.
+            device (Device): The device on which to allocate the new block.
+
+        Returns:
+            List[Block]: The newly allocated list of immutable blocks 
+                containing the provided block token IDs.
+        """
+        return self._allocators[device].allocate_immutable_blocks(
+            prev_block, block_token_ids)
+
+    def allocate_immutable_block(self, prev_block: Optional[Block],
+                                 token_ids: List[int],
+                                 device: Device) -> Block:
         """Allocates a new immutable block with the provided token IDs on the
         specified device.
 
@@ -146,7 +167,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
             Block: The newly allocated immutable block containing the provided
                 token IDs.
         """
-        return self._allocators[device].allocate_immutable(
+        return self._allocators[device].allocate_immutable_block(
             prev_block, token_ids)
 
     def free(self, block: Block) -> None:
@@ -161,7 +182,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
         block_id = block.block_id
         assert block_id is not None
         allocator = self._block_ids_to_allocator[block_id]
-        return allocator.free(block)
+        allocator.free(block)
 
     def fork(self, last_block: Block) -> List[Block]:
         """Creates a new sequence of blocks that shares the same underlying
@@ -210,8 +231,8 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
         """
         return self._allocators[device].get_physical_block_id(absolute_id)
 
-    def swap(self, blocks: List[Block], source_device: Device,
-             dest_device: Device) -> Dict[int, int]:
+    def swap(self, blocks: List[Block], src_device: Device,
+             dst_device: Device) -> Dict[int, int]:
         """Execute the swap for the given blocks from source_device
         on to dest_device, save the current swap mapping and append 
         them to the accumulated `self._swap_mapping` for each 
@@ -219,23 +240,23 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
 
         Args:
             blocks: List of blocks to be swapped.
-            source_device (Device): Device to swap the 'blocks' from.
-            dest_device (Device): Device to swap the 'blocks' to.
+            src_device (Device): Device to swap the 'blocks' from.
+            dst_device (Device): Device to swap the 'blocks' to.
         
         Returns:
             Dict[int, int]: Swap mapping from source_device
                 on to dest_device.
         """
-        source_block_ids = [block.block_id for block in blocks]
-        self._allocators[source_device].swap_out(blocks)
-        self._allocators[dest_device].swap_in(blocks)
-        dest_block_ids = [block.block_id for block in blocks]
+        src_block_ids = [block.block_id for block in blocks]
+        self._allocators[src_device].swap_out(blocks)
+        self._allocators[dst_device].swap_in(blocks)
+        dst_block_ids = [block.block_id for block in blocks]
 
         current_swap_mapping: Dict[int, int] = {}
-        for src, dest in zip(source_block_ids, dest_block_ids):
-            if src is not None and dest is not None:
-                self._swap_mapping[src] = dest
-                current_swap_mapping[src] = dest
+        for src_block_id, dst_block_id in zip(src_block_ids, dst_block_ids):
+            if src_block_id is not None and dst_block_id is not None:
+                self._swap_mapping[src_block_id] = dst_block_id
+                current_swap_mapping[src_block_id] = dst_block_id
         return current_swap_mapping
 
     def get_num_blocks_touched(self,
@@ -283,23 +304,25 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
         device = Device.GPU
         return self._allocators[device].mark_blocks_as_computed(block_ids)
 
+    def get_computed_block_ids(self, prev_computed_block_ids: List[int],
+                               block_ids: List[int],
+                               skip_last_block_id: bool) -> List[int]:
+        # Prefix caching only supported on GPU.
+        device = Device.GPU
+        return self._allocators[device].get_computed_block_ids(
+            prev_computed_block_ids, block_ids, skip_last_block_id)
+
     def get_common_computed_block_ids(
-            self, seq_block_ids: List[List[int]]) -> List[int]:
+            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
         # Prefix caching only supported on GPU.
         device = Device.GPU
         return self._allocators[device].get_common_computed_block_ids(
-            seq_block_ids)
+            computed_seq_block_ids)
 
     @property
     def all_block_ids(self) -> FrozenSet[int]:
         return frozenset(self._block_ids_to_allocator.keys())
 
-    def promote_to_immutable_block(self, block: Block) -> BlockId:
-        raise NotImplementedError
-
-    def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
-        raise NotImplementedError
-
     def get_and_reset_swaps(self) -> List[Tuple[int, int]]:
         """Returns and clears the mapping of source to destination block IDs.
         Will be called after every swapping operations for now, and after every
@@ -341,6 +364,11 @@ class NullBlock(Block):
     def token_ids(self) -> List[BlockId]:
         return self._proxy.token_ids
 
+    @property
+    def num_tokens_total(self) -> int:
+        raise NotImplementedError(
+            "num_tokens_total is not used for null block")
+
     @property
     def num_empty_slots(self) -> BlockId:
         return self._proxy.num_empty_slots
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py
index 4b20856a1..ab39832bc 100644
--- a/vllm/core/block/interfaces.py
+++ b/vllm/core/block/interfaces.py
@@ -28,6 +28,13 @@ class Block(ABC):
     def token_ids(self) -> List[int]:
         pass
 
+    @property
+    @abstractmethod
+    def num_tokens_total(self) -> int:
+        """The number of tokens till the current block (inclusive)
+        """
+        pass
+
     @property
     @abstractmethod
     def num_empty_slots(self) -> int:
@@ -92,12 +99,18 @@ class Block(ABC):
 class BlockAllocator(ABC):
 
     @abstractmethod
-    def allocate_mutable(self, prev_block: Optional[Block]) -> Block:
+    def allocate_mutable_block(self, prev_block: Optional[Block]) -> Block:
         pass
 
     @abstractmethod
-    def allocate_immutable(self, prev_block: Optional[Block],
-                           token_ids: List[int]) -> Block:
+    def allocate_immutable_block(self, prev_block: Optional[Block],
+                                 token_ids: List[int]) -> Block:
+        pass
+
+    @abstractmethod
+    def allocate_immutable_blocks(
+            self, prev_block: Optional[Block],
+            block_token_ids: List[List[int]]) -> List[Block]:
         pass
 
     @abstractmethod
@@ -146,13 +159,19 @@ class BlockAllocator(ABC):
     def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
         pass
 
+    @abstractmethod
+    def get_computed_block_ids(self, prev_computed_block_ids: List[int],
+                               block_ids: List[int],
+                               skip_last_block_id: bool) -> List[int]:
+        pass
+
     @abstractmethod
     def get_common_computed_block_ids(
-            self, seq_block_ids: List[List[int]]) -> List[int]:
+            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
         pass
 
     @abstractmethod
-    def cow_block_if_not_appendable(self, block: Block) -> Optional["BlockId"]:
+    def cow_block_if_not_appendable(self, block: Block) -> BlockId:
         """NOTE: This should not be used besides Block"""
         pass
 
@@ -174,13 +193,20 @@ class BlockAllocator(ABC):
 class DeviceAwareBlockAllocator(ABC):
 
     @abstractmethod
-    def allocate_mutable(self, prev_block: Optional[Block],
-                         device: Device) -> Block:
+    def allocate_mutable_block(self, prev_block: Optional[Block],
+                               device: Device) -> Block:
+        pass
+
+    @abstractmethod
+    def allocate_immutable_block(self, prev_block: Optional[Block],
+                                 token_ids: List[int],
+                                 device: Device) -> Block:
         pass
 
     @abstractmethod
-    def allocate_immutable(self, prev_block: Optional[Block],
-                           token_ids: List[int], device: Device) -> Block:
+    def allocate_immutable_blocks(self, prev_block: Optional[Block],
+                                  block_token_ids: List[List[int]],
+                                  device: Device) -> List[Block]:
         pass
 
     @abstractmethod
@@ -217,9 +243,15 @@ class DeviceAwareBlockAllocator(ABC):
     def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
         pass
 
+    @abstractmethod
+    def get_computed_block_ids(self, prev_computed_block_ids: List[int],
+                               block_ids: List[int],
+                               skip_last_block_id: bool) -> List[int]:
+        pass
+
     @abstractmethod
     def get_common_computed_block_ids(
-            self, seq_block_ids: List[List[int]]) -> List[int]:
+            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
         pass
 
     @abstractmethod
@@ -230,8 +262,8 @@ class DeviceAwareBlockAllocator(ABC):
         pass
 
     @abstractmethod
-    def swap(self, blocks: List[Block], source_device: Device,
-             dest_device: Device) -> Dict[int, int]:
+    def swap(self, blocks: List[Block], src_device: Device,
+             dst_device: Device) -> Dict[int, int]:
         pass
 
     @abstractmethod
diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py
index 50f27bab3..0c1e88314 100644
--- a/vllm/core/block/naive_block.py
+++ b/vllm/core/block/naive_block.py
@@ -1,6 +1,7 @@
-from typing import FrozenSet, Iterable, List, Optional, Set, Tuple
+from collections import deque
+from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple
 
-from vllm.core.block.common import (CopyOnWriteTracker, RefCounter,
+from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter,
                                     get_all_blocks_recursively)
 from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
 from vllm.utils import cdiv
@@ -31,28 +32,39 @@ class NaiveBlockAllocator(BlockAllocator):
         num_blocks: int,
         block_size: int,
         block_ids: Optional[Iterable[int]] = None,
+        block_pool: Optional[BlockPool] = None,
     ):
         if block_ids is None:
             block_ids = range(num_blocks)
 
-        self._free_block_indices: Set[BlockId] = set(block_ids)
+        self._free_block_indices: Deque[BlockId] = deque(block_ids)
         self._all_block_indices = frozenset(block_ids)
         assert len(self._all_block_indices) == num_blocks
 
         self._refcounter = RefCounter(
             all_block_indices=self._free_block_indices)
-        self._create_block = create_block
         self._block_size = block_size
 
         self._cow_tracker = CopyOnWriteTracker(
-            refcounter=self._refcounter.as_readonly(),
-            allocator=self,
-        )
-
-    def allocate_immutable(self,
-                           prev_block: Optional[Block],
-                           token_ids: List[int],
-                           device: Optional[Device] = None) -> Block:
+            refcounter=self._refcounter.as_readonly())
+
+        if block_pool is None:
+            extra_factor = 4
+            # Pre-allocate "num_blocks * extra_factor" block objects.
+            # The "* extra_factor" is a buffer to allow more block objects
+            # than physical blocks
+            self._block_pool = BlockPool(self._block_size, create_block, self,
+                                         num_blocks * extra_factor)
+        else:
+            # In this case, the block pool is provided by the caller,
+            # which means that there is most likely a need to share
+            # a block pool between allocators
+            self._block_pool = block_pool
+
+    def allocate_immutable_block(self,
+                                 prev_block: Optional[Block],
+                                 token_ids: List[int],
+                                 device: Optional[Device] = None) -> Block:
         """Allocates a new immutable block with the given token IDs, linked to
         the previous block.
 
@@ -66,13 +78,36 @@ class NaiveBlockAllocator(BlockAllocator):
             Block: The newly allocated immutable block.
         """
         assert device is None
-        block = self.allocate_mutable(prev_block=prev_block)
+        block = self.allocate_mutable_block(prev_block=prev_block)
         block.append_token_ids(token_ids)
         return block
 
-    def allocate_mutable(self,
-                         prev_block: Optional[Block],
-                         device: Optional[Device] = None) -> Block:
+    def allocate_immutable_blocks(
+            self,
+            prev_block: Optional[Block],
+            block_token_ids: List[List[int]],
+            device: Optional[Device] = None) -> List[Block]:
+        assert device is None
+        num_blocks = len(block_token_ids)
+
+        block_ids = []
+        for i in range(num_blocks):
+            block_ids.append(self._allocate_block_id())
+
+        blocks = []
+        for i in range(num_blocks):
+            prev_block = self._block_pool.init_block(
+                prev_block=prev_block,
+                token_ids=block_token_ids[i],
+                block_size=self._block_size,
+                physical_block_id=block_ids[i])
+            blocks.append(prev_block)
+
+        return blocks
+
+    def allocate_mutable_block(self,
+                               prev_block: Optional[Block],
+                               device: Optional[Device] = None) -> Block:
         """Allocates a new mutable block, linked to the previous block.
 
         Args:
@@ -84,20 +119,39 @@ class NaiveBlockAllocator(BlockAllocator):
             Block: The newly allocated mutable block.
         """
         assert device is None
-        block_id = self._allocate_new_block_id()
-        return self._create_block(
-            prev_block=prev_block,
-            token_ids=[],
-            block_id=block_id,
-            block_size=self._block_size,
-            allocator=self,
-        )
-
-    def free(self, block: Block) -> None:
-        assert block.block_id is not None
-        self._free_block_id(block.block_id)
+        block_id = self._allocate_block_id()
+        block = self._block_pool.init_block(prev_block=prev_block,
+                                            token_ids=[],
+                                            block_size=self._block_size,
+                                            physical_block_id=block_id)
+        return block
+
+    def _allocate_block_id(self) -> BlockId:
+        if not self._free_block_indices:
+            raise BlockAllocator.NoFreeBlocksError()
+
+        block_id = self._free_block_indices.popleft()
+        self._refcounter.incr(block_id)
+        return block_id
+
+    def _free_block_id(self, block: Block) -> None:
+        block_id = block.block_id
+        assert block_id is not None
+
+        refcount = self._refcounter.decr(block_id)
+        if refcount == 0:
+            self._free_block_indices.appendleft(block_id)
+
         block.block_id = None
 
+    def free(self, block: Block, keep_block_object: bool = False) -> None:
+        # Release the physical block id
+        self._free_block_id(block)
+
+        # Release the block object
+        if not keep_block_object:
+            self._block_pool.free_block(block)
+
     def fork(self, last_block: Block) -> List[Block]:
         """Creates a new sequence of blocks that shares the same underlying
         memory as the original sequence.
@@ -120,14 +174,13 @@ class NaiveBlockAllocator(BlockAllocator):
             refcount = self._refcounter.incr(block.block_id)
             assert refcount != 1, "can't fork free'd block"
 
-            forked_blocks.append(
-                self._create_block(
-                    prev_block=prev_block,
-                    token_ids=block.token_ids,
-                    block_id=block.block_id,
-                    block_size=self._block_size,
-                    allocator=self,
-                ))
+            forked_block = self._block_pool.init_block(
+                prev_block=prev_block,
+                token_ids=block.token_ids,
+                block_size=self._block_size,
+                physical_block_id=block.block_id)
+
+            forked_blocks.append(forked_block)
             prev_block = forked_blocks[-1]
 
         return forked_blocks
@@ -138,20 +191,6 @@ class NaiveBlockAllocator(BlockAllocator):
     def get_num_total_blocks(self) -> int:
         return len(self._all_block_indices)
 
-    def _allocate_new_block_id(self) -> BlockId:
-        if not self._free_block_indices:
-            raise BlockAllocator.NoFreeBlocksError()
-
-        block_id = next(iter(self._free_block_indices))
-        self._refcounter.incr(block_id)
-        self._free_block_indices.remove(block_id)
-        return block_id
-
-    def _free_block_id(self, block_id: BlockId) -> None:
-        refcount = self._refcounter.decr(block_id)
-        if refcount == 0:
-            self._free_block_indices.add(block_id)
-
     def get_physical_block_id(self, absolute_id: int) -> int:
         """Returns the zero-offset block id on certain block allocator
         given the absolute block id.
@@ -173,7 +212,7 @@ class NaiveBlockAllocator(BlockAllocator):
     def all_block_ids(self) -> FrozenSet[int]:
         return self._all_block_indices
 
-    def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
+    def cow_block_if_not_appendable(self, block: Block) -> BlockId:
         """Performs a copy-on-write operation on the given block if it is not
         appendable.
 
@@ -181,11 +220,22 @@ class NaiveBlockAllocator(BlockAllocator):
             block (Block): The block to check for copy-on-write.
 
         Returns:
-            Optional[BlockId]: The block index of the new block if a copy-on
-                -write operation was performed, or the original block index if
+            BlockId: The block index of the new block if a copy-on-write 
+                operation was performed, or the original block index if
                 no copy-on-write was necessary.
         """
-        return self._cow_tracker.cow_block_if_not_appendable(block)
+        src_block_id = block.block_id
+        assert src_block_id is not None
+
+        if self._cow_tracker.is_appendable(block):
+            return src_block_id
+
+        self._free_block_id(block)
+        trg_block_id = self._allocate_block_id()
+
+        self._cow_tracker.record_cow(src_block_id, trg_block_id)
+
+        return trg_block_id
 
     def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]:
         """Returns the copy-on-write source->destination mapping and clears it.
@@ -213,8 +263,15 @@ class NaiveBlockAllocator(BlockAllocator):
         """
         pass
 
+    def get_computed_block_ids(self, prev_computed_block_ids: List[int],
+                               block_ids: List[int],
+                               skip_last_block_id: bool) -> List[int]:
+        """No prefix caching here => return empty list
+        """
+        return []
+
     def get_common_computed_block_ids(
-            self, seq_block_ids: List[List[int]]) -> List[int]:
+            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
         """Determine blocks that can be skipped in prefill.
 
         Since the naive allocator does not support prefix caching, always return
@@ -223,7 +280,7 @@ class NaiveBlockAllocator(BlockAllocator):
         return []
 
     def promote_to_immutable_block(self, block: Block) -> BlockId:
-        raise NotImplementedError
+        raise NotImplementedError("There is no promotion for naive blocks")
 
     def get_num_blocks_touched(self,
                                blocks: List[Block],
@@ -263,17 +320,27 @@ class NaiveBlockAllocator(BlockAllocator):
 
     def swap_out(self, blocks: List[Block]) -> None:
         for block in blocks:
-            self.free(block)
+            self._free_block_id(block)
 
     def swap_in(self, blocks: List[Block]) -> None:
         for block in blocks:
+            # Here we allocate either immutable or mutable block and then
+            # extract its block_id. Note that the block object is released
+            # and the block_id is assigned to "block" to allow reusing the
+            # existing "block" object
             if block.is_full:
-                alloc = self.allocate_immutable(block.prev_block,
-                                                block.token_ids)
+                tmp_block = self.allocate_immutable_block(
+                    prev_block=block.prev_block, token_ids=block.token_ids)
             else:
-                alloc = self.allocate_mutable(block.prev_block)
-                alloc.append_token_ids(block.token_ids)
-            block.block_id = alloc.block_id
+                tmp_block = self.allocate_mutable_block(
+                    prev_block=block.prev_block)
+                tmp_block.append_token_ids(block.token_ids)
+
+            block_id = tmp_block.block_id
+            tmp_block.block_id = None
+            self._block_pool.free_block(tmp_block)
+
+            block.block_id = block_id  # Assign block_id
 
 
 class NaiveBlock(Block):
@@ -315,11 +382,12 @@ class NaiveBlock(Block):
         self._append_token_ids_no_cow(token_ids)
 
     def append_token_ids(self, token_ids: List[int]) -> None:
-        """Appends the given token IDs to the block, instructing the allocator
-        to perform a copy-on-write if necessary.
+        """Appends the given token IDs to the block and performs a 
+        copy-on-write if necessary.
 
         Args:
-            token_ids (List[int]): The token IDs to be appended to the block.
+            token_ids (Optional[List[int]]): The token IDs to be appended 
+                to the block.
         """
         self._append_token_ids_no_cow(token_ids)
 
@@ -328,7 +396,16 @@ class NaiveBlock(Block):
                 self._cow_target))
 
     def _append_token_ids_no_cow(self, token_ids: List[int]) -> None:
-        assert self.num_empty_slots >= len(token_ids)
+        """Appends the given token IDs to the block
+
+        Args:
+            token_ids (List[int]): The token IDs to be appended to the block.
+        """
+        if len(token_ids) == 0:
+            return
+
+        assert len(token_ids) <= self.num_empty_slots
+
         self._token_ids.extend(token_ids)
 
     @property
@@ -361,12 +438,17 @@ class NaiveBlock(Block):
 
     @property
     def num_empty_slots(self) -> int:
-        return self._block_size - len(self._token_ids)
+        return self._block_size - len(self.token_ids)
 
     @property
     def token_ids(self) -> List[int]:
         return self._token_ids
 
+    @property
+    def num_tokens_total(self) -> int:
+        raise NotImplementedError(
+            "num_tokens_total is not used for naive block")
+
     @property
     def block_size(self) -> int:
         return self._block_size
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index 2df7d74e4..f272e23ee 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -1,13 +1,13 @@
 """Token blocks."""
 
-from itertools import takewhile
 from os.path import commonprefix
 from typing import Dict, FrozenSet, Iterable, List, Optional, Tuple
 
 from vllm.core.block.common import (CopyOnWriteTracker,
                                     get_all_blocks_recursively)
 from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
-from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
+from vllm.core.block.naive_block import (BlockPool, NaiveBlock,
+                                         NaiveBlockAllocator)
 from vllm.core.evictor_v2 import EvictionPolicy, Evictor, make_evictor
 from vllm.utils import cdiv
 
@@ -19,6 +19,30 @@ PrefixHash = int
 _DEFAULT_LAST_ACCESSED_TIME = -1
 
 
+class BlockTracker:
+    """Used to track the status of a block inside the prefix caching allocator
+    """
+    __slots__ = ("active", "last_accessed", "computed")
+
+    def reset(self):
+        self.last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME
+        self.computed: bool = False
+
+    def __init__(self):
+        self.active: bool = False
+        self.reset()
+
+    def enable(self):
+        assert not self.active
+        self.active = True
+        self.reset()
+
+    def disable(self):
+        assert self.active
+        self.active = False
+        self.reset()
+
+
 class PrefixCachingBlockAllocator(BlockAllocator):
     """A block allocator that implements prefix caching.
 
@@ -41,12 +65,26 @@ class PrefixCachingBlockAllocator(BlockAllocator):
         block_ids: Optional[Iterable[int]] = None,
         eviction_policy: EvictionPolicy = EvictionPolicy.LRU,
     ):
+        if block_ids is None:
+            block_ids = range(num_blocks)
+
+        self._block_size = block_size
+
         # A mapping of prefix hash to block index. All blocks which have a
         # prefix hash will be in this dict, even if they have refcount 0.
         self._cached_blocks: Dict[PrefixHash, BlockId] = {}
 
-        # A mapping of blockId to Block to track those cached blocks
-        self._blocks: Dict[BlockId, Block] = {}
+        # Used to track status of each physical block id
+        self._block_tracker: Dict[BlockId, BlockTracker] = {}
+        for block_id in block_ids:
+            self._block_tracker[block_id] = BlockTracker()
+
+        # Pre-allocate "num_blocks * extra_factor" block objects.
+        # The "* extra_factor" is a buffer to allow more block objects
+        # than physical blocks
+        extra_factor = 4
+        self._block_pool = BlockPool(self._block_size, self._create_block,
+                                     self, num_blocks * extra_factor)
 
         # An allocator for blocks that do not have prefix hashes.
         self._hashless_allocator = NaiveBlockAllocator(
@@ -54,10 +92,9 @@ class PrefixCachingBlockAllocator(BlockAllocator):
             num_blocks=num_blocks,
             block_size=block_size,
             block_ids=block_ids,
+            block_pool=self._block_pool,  # Share block pool here
         )
 
-        self._block_size = block_size
-
         # Evitor used to maintain how we want to handle those computed blocks
         # if we find memory pressure is high.
         self.evictor: Evictor = make_evictor(eviction_policy)
@@ -68,9 +105,7 @@ class PrefixCachingBlockAllocator(BlockAllocator):
         self._refcounter = self._hashless_allocator.refcounter
 
         self._cow_tracker = CopyOnWriteTracker(
-            refcounter=self._refcounter.as_readonly(),
-            allocator=self,
-        )
+            refcounter=self._refcounter.as_readonly())
 
     # Implements Block.Factory.
     def _create_block(
@@ -90,14 +125,14 @@ class PrefixCachingBlockAllocator(BlockAllocator):
             token_ids=token_ids,
             block_size=block_size,
             block_id=block_id,
-            prefix_caching_allocator=allocator,
+            allocator=allocator,
             computed=computed,
         )
 
-    def allocate_immutable(self,
-                           prev_block: Optional[Block],
-                           token_ids: List[int],
-                           device: Optional[Device] = None) -> Block:
+    def allocate_immutable_block(self,
+                                 prev_block: Optional[Block],
+                                 token_ids: List[int],
+                                 device: Optional[Device] = None) -> Block:
         """Allocates an immutable block with the given token IDs, reusing cached
         blocks if possible.
 
@@ -111,29 +146,41 @@ class PrefixCachingBlockAllocator(BlockAllocator):
         assert device is None
         assert_prefix_caching_block_or_none(prev_block)
 
-        block = self._create_block(
-            prev_block=prev_block,
-            token_ids=token_ids,
-            block_size=self._block_size,
-            allocator=self,
-        )
+        # First, try to create a block that points to cached data
+        block = self._block_pool.init_block(prev_block=prev_block,
+                                            token_ids=token_ids,
+                                            block_size=self._block_size,
+                                            physical_block_id=None)
         assert block.content_hash is not None
 
         cached_block_id = self._cached_blocks.get(block.content_hash, None)
         if cached_block_id is not None:
             block.block_id = cached_block_id
-            self._incr_refcount_cached_block(block, block.block_id)
+            self._incr_refcount_cached_block(block)
             return block
+        self._block_pool.free_block(block)
 
-        block = self.allocate_mutable(prev_block)
+        # No cached block => Allocate a new block
+        block = self.allocate_mutable_block(prev_block)
         block.append_token_ids(token_ids)
-        assert block.content_hash is not None
-
         return block
 
-    def allocate_mutable(self,
-                         prev_block: Optional[Block],
-                         device: Optional[Device] = None) -> Block:
+    def allocate_immutable_blocks(
+            self,
+            prev_block: Optional[Block],
+            block_token_ids: List[List[int]],
+            device: Optional[Device] = None) -> List[Block]:
+        blocks = []
+        for token_ids in block_token_ids:
+            prev_block = self.allocate_immutable_block(prev_block=prev_block,
+                                                       token_ids=token_ids,
+                                                       device=device)
+            blocks.append(prev_block)
+        return blocks
+
+    def allocate_mutable_block(self,
+                               prev_block: Optional[Block],
+                               device: Optional[Device] = None) -> Block:
         """Allocates a mutable block. If there are no free blocks, this will
         evict unused cached blocks.
 
@@ -147,116 +194,154 @@ class PrefixCachingBlockAllocator(BlockAllocator):
         assert device is None
         assert_prefix_caching_block_or_none(prev_block)
 
-        try:
-            block = self._hashless_allocator.allocate_mutable(
-                prev_block=prev_block)
-
-            assert block.block_id not in self._blocks
-            assert block.block_id is not None
-            self._blocks[block.block_id] = block
-            return block
-        except BlockAllocator.NoFreeBlocksError:
-            # We must check the unused cached blocks before raising OOM.
-            pass
-
-        # If the evictor has blocks available for eviction, evict a block
-        # and return it.
-        if self.evictor.num_blocks > 0:
-            # here we get an evicted block, which is only added
-            # into evictor if its ref counter is 0
-            # and since its content would be changed, we need
-            # to remove it from _cached_blocks's tracking list
-            block_id, content_hash_to_evict = self.evictor.evict()
-
-            _block_id = self._cached_blocks[content_hash_to_evict]
-            assert self._refcounter.get(_block_id) == 0
-            assert _block_id == block_id
-
-            self._cached_blocks.pop(content_hash_to_evict)
-
-            self._refcounter.incr(block_id)
-
-            # Now this block is pop from evictor and ready to write
-            # with new content which most probably different with
-            # original content. So need to tell worker to recompute
-            # its kvcache
-            block = self._create_block(
-                prev_block=prev_block,
-                token_ids=[],
-                block_size=self._block_size,
-                allocator=self,
-                block_id=block_id,
-                computed=False,
-            )
-            assert block.content_hash is None
-
-            assert block.block_id not in self._blocks
-            assert block.block_id is not None
-            self._blocks[block.block_id] = block
-            return block
-
-        # No block available in hashless allocator, nor in unused cache blocks.
-        raise BlockAllocator.NoFreeBlocksError()
+        block_id = self._allocate_block_id()
+        block = self._block_pool.init_block(prev_block=prev_block,
+                                            token_ids=[],
+                                            block_size=self._block_size,
+                                            physical_block_id=block_id)
+        assert not block.computed
+        assert block.content_hash is None
+        return block
 
-    def _incr_refcount_cached_block(self, block: Block,
-                                    block_id: BlockId) -> None:
-        # now _incr_refcount_cached_block comes from two place
-        # allocate_immutable/promote_to_immutable_block where hit
-        # _cached_blocks hash key.
-        # In both cases, it means that already exists a already
-        # computed block which shared with block now
+    def _incr_refcount_cached_block(self, block: Block) -> None:
+        # Set this block to be "computed" since it is pointing to a
+        # cached block id (which was already computed)
         block.computed = True
 
+        block_id = block.block_id
+        assert block_id is not None
+
         refcount = self._refcounter.incr(block_id)
         if refcount == 1:
-            # if block get referred, then it shall not be in evictor
-            # and put it into _blocks for tracking
+            # In case a cached block was evicted, restore its tracking
             if block_id in self.evictor:
                 self.evictor.remove(block_id)
-            self._blocks[block_id] = block
 
-    def free(self, block: Block) -> None:
-        """Decrement the refcount of the block. If the decremented refcount is
-        zero, store the block in the freelist.
+            self._track_block_id(block_id, computed=True)
 
-        If the block has a content hash (meaning it is immutable), then we will
-        keep the block around in case future allocations require it.
-        """
-        assert (block.block_id
-                is not None), "freeing unallocated block is undefined"
+    def _decr_refcount_cached_block(self, block: Block) -> None:
+        # Ensure this is immutable/cached block
+        assert block.content_hash is not None
+
+        block_id = block.block_id
+        assert block_id is not None
+
+        refcount = self._refcounter.decr(block_id)
+        if refcount > 0:
+            block.block_id = None
+            return
+        else:
+            assert refcount == 0
 
-        self._free_block_id_for_block(block.block_id, block)
+        # No longer used
+        assert block.content_hash in self._cached_blocks
+
+        # Add the cached block to the evictor
+        # (This keeps the cached block around so it can be reused)
+        self.evictor.add(block_id, block.content_hash, block.num_tokens_total,
+                         self._block_tracker[block_id].last_accessed)
+
+        # Stop tracking the block
+        self._untrack_block_id(block_id)
 
         block.block_id = None
 
-    def _free_block_id_for_block(self, block_id: BlockId,
-                                 block: Block) -> None:
-        assert isinstance(block, PrefixCachingBlock)
-
-        # if we comes from promote_to_immutable_block, it means that
-        # block.content_hash is never None.
-        # However we need to release the same content block, so that
-        # physical block could get reused.
-        if block.block_id != block_id or block.content_hash is None:
-            refcount = self._refcounter.get(block_id)
-            # We have fork case where block would get more than one ref,
-            # so we cannot free it from tracking if ref cnt large than 1
-            assert block.block_id is not None
-            refcount = self._refcounter.get(block.block_id)
-            if refcount == 1:
-                del self._blocks[block.block_id]
-
-            return self._hashless_allocator.free(block)
+    def _decr_refcount_hashless_block(self, block: Block) -> None:
+        block_id = block.block_id
+        assert block_id is not None
 
-        refcount = self._refcounter.decr(block_id)
+        # We may have a fork case where block is shared,
+        # in which case, we cannot remove it from tracking
+        refcount = self._refcounter.get(block_id)
+        if refcount == 1:
+            self._untrack_block_id(block_id)
 
-        # If no longer used, add the block to the evictor.
-        if refcount == 0:
-            assert block.content_hash in self._cached_blocks
-            assert block.block_id is not None
-            del self._blocks[block.block_id]
-            self.evictor.add(block.block_id, block.content_hash,
-                             block.num_tokens_total, block.last_accessed)
+        # Decrement refcount of the block_id, but do not free the block object
+        # itself (will be handled by the caller)
+        self._hashless_allocator.free(block, keep_block_object=True)
+
+    def _allocate_block_id(self) -> BlockId:
+        """First tries to allocate a block id from the hashless allocator,
+        and if there are no blocks, then tries to evict an unused cached block.
+        """
+        hashless_block_id = self._maybe_allocate_hashless_block_id()
+        if hashless_block_id is not None:
+            return hashless_block_id
+
+        evicted_block_id = self._maybe_allocate_evicted_block_id()
+        if evicted_block_id is not None:
+            return evicted_block_id
+
+        # No block available in hashless allocator, nor in unused cache blocks.
+        raise BlockAllocator.NoFreeBlocksError()
+
+    def _maybe_allocate_hashless_block_id(self) -> Optional[BlockId]:
+        try:
+            # Allocate mutable block and extract its block_id
+            block = self._hashless_allocator.allocate_mutable_block(
+                prev_block=None)
+            block_id = block.block_id
+            self._block_pool.free_block(block)
+
+            self._track_block_id(block_id, computed=False)
+            return block_id
+        except BlockAllocator.NoFreeBlocksError:
+            return None
+
+    def _maybe_allocate_evicted_block_id(self) -> Optional[BlockId]:
+        if self.evictor.num_blocks == 0:
+            return None
+
+        # Here we get an evicted block, which is only added
+        # into evictor if its ref counter is 0
+        # and since its content would be changed, we need
+        # to remove it from _cached_blocks's tracking list
+        block_id, content_hash_to_evict = self.evictor.evict()
+
+        # Sanity checks
+        assert content_hash_to_evict in self._cached_blocks
+        _block_id = self._cached_blocks[content_hash_to_evict]
+        assert self._refcounter.get(_block_id) == 0
+        assert _block_id == block_id
+
+        self._cached_blocks.pop(content_hash_to_evict)
+
+        self._refcounter.incr(block_id)
+        self._track_block_id(block_id, computed=False)
+
+        return block_id
+
+    def _free_block_id(self, block: Block) -> None:
+        """Decrements the refcount of the block. The block may be in two 
+        possible states: (1) immutable/cached or (2) mutable/hashless. 
+        In the first case, the refcount is decremented directly and the block
+        may be possibly added to the evictor. In other case, hashless 
+        allocator free(..) with keep_block_object=True is called to only free
+        the block id (since the block object may be reused by the caller)
+        """
+        block_id = block.block_id
+        assert block_id is not None, "Freeing unallocated block is undefined"
+
+        if block.content_hash is not None:
+            # Immutable: This type of block is always cached, and we want to
+            # keep it in the evictor for future reuse
+            self._decr_refcount_cached_block(block)
+        else:
+            # Mutable: This type of block is not cached, so we release it
+            # directly to the hashless allocator
+            self._decr_refcount_hashless_block(block)
+
+        assert block.block_id is None
+
+    def free(self, block: Block, keep_block_object: bool = False) -> None:
+        """Release the block (look at free_block_id(..) docs)
+        """
+        # Release the physical block index
+        self._free_block_id(block)
+
+        # Release the block object to the pool
+        if not keep_block_object:
+            self._block_pool.free_block(block)
 
     def fork(self, last_block: Block) -> List[Block]:
         """Creates a new sequence of blocks that shares the same underlying
@@ -274,17 +359,20 @@ class PrefixCachingBlockAllocator(BlockAllocator):
         forked_blocks: List[Block] = []
         prev_block = None
         for block in source_blocks:
-            refcount = self._refcounter.incr(block.block_id)
-            assert refcount != 1, "can't fork free'd block"
-
-            forked_blocks.append(
-                self._create_block(
-                    prev_block=prev_block,
-                    token_ids=block.token_ids,
-                    block_id=block.block_id,
-                    block_size=self._block_size,
-                    allocator=self,
-                ))
+            block_id = block.block_id
+            assert block_id is not None
+
+            refcount = self._refcounter.incr(block_id)
+            assert refcount != 1, "can't fork free'd block_id = {}".format(
+                block_id)
+
+            forked_block = self._block_pool.init_block(
+                prev_block=prev_block,
+                token_ids=block.token_ids,
+                block_size=self._block_size,
+                physical_block_id=block_id)
+
+            forked_blocks.append(forked_block)
             prev_block = forked_blocks[-1]
 
         return forked_blocks
@@ -329,7 +417,7 @@ class PrefixCachingBlockAllocator(BlockAllocator):
 
         Note that if we already have a cached block with the same content, we
         will replace the newly-promoted block's mapping with the existing cached
-        block.
+        block id.
 
         Args:
             block: The mutable block to be promoted.
@@ -338,23 +426,30 @@ class PrefixCachingBlockAllocator(BlockAllocator):
             BlockId: Either the original block index, or the block index of
                 the previously cached block matching the same content.
         """
+        # Ensure block can be promoted
         assert block.content_hash is not None
         assert block.block_id is not None
         assert self._refcounter.get(block.block_id) > 0
 
-        # If the content hash does not have a corresponding cached block,
-        # set this block as the cached block.
         if block.content_hash not in self._cached_blocks:
+            # No cached content hash => Set this block as cached
+            # (Note that this block is not computed yet =>
+            #  Will be computed after free())
             self._cached_blocks[block.content_hash] = block.block_id
-        else:
-            self._free_block_id_for_block(
-                self._cached_blocks[block.content_hash], block)
-            self._incr_refcount_cached_block(
-                block, self._cached_blocks[block.content_hash])
+            return block.block_id
 
-        return self._cached_blocks[block.content_hash]
+        # Reuse the cached content hash
+        self._decr_refcount_hashless_block(block)
+        block.block_id = self._cached_blocks[block.content_hash]
 
-    def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
+        # Increment refcount of the cached block and (possibly) restore
+        # it from the evictor.
+        # Note that in this case, the block is marked as computed
+        self._incr_refcount_cached_block(block)
+
+        return block.block_id
+
+    def cow_block_if_not_appendable(self, block: Block) -> BlockId:
         """Performs a copy-on-write operation on the given block if it is not
         appendable.
 
@@ -362,11 +457,22 @@ class PrefixCachingBlockAllocator(BlockAllocator):
             block (Block): The block to check for copy-on-write.
 
         Returns:
-            Optional[BlockId]: The block index of the new block if a copy-on
-                -write operation was performed, or the original block index if
+            BlockId: The block index of the new block if a copy-on-write 
+                operation was performed, or the original block index if
                 no copy-on-write was necessary.
         """
-        return self._cow_tracker.cow_block_if_not_appendable(block)
+        src_block_id = block.block_id
+        assert src_block_id is not None
+
+        if self._cow_tracker.is_appendable(block):
+            return src_block_id
+
+        self._free_block_id(block)
+        trg_block_id = self._allocate_block_id()
+
+        self._cow_tracker.record_cow(src_block_id, trg_block_id)
+
+        return trg_block_id
 
     def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]:
         """Returns the copy-on-write source->destination mapping and clears it.
@@ -386,8 +492,8 @@ class PrefixCachingBlockAllocator(BlockAllocator):
         """
 
         for block_id in block_ids:
-            if block_id in self._blocks:
-                self._blocks[block_id].last_accessed = now
+            if self._block_tracker[block_id].active:
+                self._block_tracker[block_id].last_accessed = now
             elif block_id in self.evictor:
                 self.evictor.update(block_id, now)
             else:
@@ -395,25 +501,46 @@ class PrefixCachingBlockAllocator(BlockAllocator):
                     "Mark block as accessed which is not belonged to GPU")
 
     def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
-        """Mark blocks as computed, used in prefix caching."""
+        raise NotImplementedError("Marking as computed is incremental")
 
-        for block_id in block_ids:
-            if block_id in self._blocks:
-                # only those full block is valid for prefix caching
-                if self._blocks[block_id].is_full:
-                    self._blocks[block_id].computed = True
-            elif block_id not in self.evictor:
-                raise ValueError(f"Mark {block_id=} as computed which "
-                                 "is not belonged to GPU")
+    def _track_block_id(self, block_id: Optional[BlockId],
+                        computed: bool) -> None:
+        assert block_id is not None
+        self._block_tracker[block_id].enable()
+        self._block_tracker[block_id].computed = computed
+
+    def _untrack_block_id(self, block_id: Optional[BlockId]) -> None:
+        assert block_id is not None
+        self._block_tracker[block_id].disable()
 
     def block_is_computed(self, block_id: int) -> bool:
-        if block_id in self._blocks:
-            return self._blocks[block_id].computed
+        if self._block_tracker[block_id].active:
+            return self._block_tracker[block_id].computed
         else:
             return block_id in self.evictor
 
+    def get_computed_block_ids(self,
+                               prev_computed_block_ids: List[int],
+                               block_ids: List[int],
+                               skip_last_block_id: bool = True) -> List[int]:
+        prev_prefix_size = len(prev_computed_block_ids)
+        cur_size = len(block_ids)
+        if skip_last_block_id:
+            cur_size -= 1
+
+        # Sanity checks
+        assert cur_size >= 0
+        assert prev_prefix_size <= cur_size
+
+        ret = prev_computed_block_ids
+        for i in range(prev_prefix_size, cur_size):
+            block_id = block_ids[i]
+            if self.block_is_computed(block_id):
+                ret.append(block_id)
+        return ret
+
     def get_common_computed_block_ids(
-            self, seq_block_ids: List[List[int]]) -> List[int]:
+            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
         """Return the block ids that are common for a given sequence group.
 
         Only those blocks that are immutable and already be marked
@@ -424,14 +551,9 @@ class PrefixCachingBlockAllocator(BlockAllocator):
         # prompt is cached. This would cause erroneous behavior in model
         # runner.
 
-        ids_list = [
-            list(
-                takewhile(lambda block_id: self.block_is_computed(block_id),
-                          seq[:-1])) for seq in seq_block_ids
-        ]
         # It returns a list of int although type annotation says list of string.
         return commonprefix([
-            ids for ids in ids_list  # type: ignore
+            ids for ids in computed_seq_block_ids  # type: ignore
             if ids != []
         ])
 
@@ -473,10 +595,10 @@ class PrefixCachingBlockAllocator(BlockAllocator):
             blocks: List of blocks to be swapped out.
         """
         for block in blocks:
-            self.free(block)
+            self._free_block_id(block)
 
     def swap_in(self, blocks: List[Block]) -> None:
-        """Execute the swap int actions. Change the block id from 
+        """Execute the swap in actions. Change the block id from 
         old allocator to current allocator for each block to finish 
         the block table update. 
 
@@ -484,13 +606,22 @@ class PrefixCachingBlockAllocator(BlockAllocator):
             blocks: List of blocks to be swapped in.
         """
         for block in blocks:
+            # Here we allocate either immutable or mutable block and then
+            # extract its block_id. Note that the block object is released
+            # and the block_id is assigned to "block" to allow reusing the
+            # existing "block" object
             if block.is_full:
-                alloc = self.allocate_immutable(block.prev_block,
-                                                block.token_ids)
+                tmp_block = self.allocate_immutable_block(
+                    prev_block=block.prev_block, token_ids=block.token_ids)
             else:
-                alloc = self.allocate_mutable(block.prev_block)
-                alloc.append_token_ids(block.token_ids)
-            block.block_id = alloc.block_id
+                tmp_block = self.allocate_mutable_block(
+                    prev_block=block.prev_block)
+                tmp_block.append_token_ids(block.token_ids)
+
+            block_id = tmp_block.block_id
+            self._block_pool.free_block(tmp_block)
+
+            block.block_id = block_id  # Assign block_id
 
 
 class PrefixCachingBlock(Block):
@@ -507,7 +638,7 @@ class PrefixCachingBlock(Block):
         token_ids (List[int]): The initial token IDs to be stored in the block.
         block_size (int): The maximum number of token IDs that can be stored in
             the block.
-        prefix_caching_allocator (BlockAllocator): The prefix
+        allocator (BlockAllocator): The prefix
             caching block allocator associated with this block.
         block_id (Optional[int], optional): The physical block index
             of this block. Defaults to None.
@@ -518,31 +649,55 @@ class PrefixCachingBlock(Block):
         prev_block: Optional[Block],
         token_ids: List[int],
         block_size: int,
-        prefix_caching_allocator: BlockAllocator,
+        allocator: BlockAllocator,
         block_id: Optional[int] = None,
         computed: bool = False,
     ):
-        assert isinstance(prefix_caching_allocator,
-                          PrefixCachingBlockAllocator), (
-                              "Currently this class is only tested with "
-                              "PrefixCachingBlockAllocator.")
+        assert isinstance(allocator, PrefixCachingBlockAllocator), (
+            "Currently this class is only tested with "
+            "PrefixCachingBlockAllocator. Got instead allocator = {}".format(
+                allocator))
         assert_prefix_caching_block_or_none(prev_block)
 
         self._prev_block = prev_block
         self._cached_content_hash: Optional[int] = None
-        self._cached_num_tokens_total: Optional[int] = None
-        self._prefix_caching_allocator = prefix_caching_allocator
+        self._cached_num_tokens_total: int = 0
+        self._allocator = allocator
         self._last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME
         self._computed = computed
 
-        self._block = NaiveBlock(
-            prev_block=prev_block,
-            token_ids=token_ids,
-            block_size=block_size,
-            block_id=block_id,
-            allocator=prefix_caching_allocator,
-            _cow_target=self,
-        )
+        # On the first time, we create the block object, and next we only
+        # reinitialize it
+        if hasattr(self, "_block"):
+            self._block.__init__(  # type: ignore[has-type]
+                prev_block=prev_block,
+                token_ids=token_ids,
+                block_size=block_size,
+                block_id=block_id,
+                allocator=self._allocator)
+        else:
+            self._block = NaiveBlock(prev_block=prev_block,
+                                     token_ids=token_ids,
+                                     block_size=block_size,
+                                     block_id=block_id,
+                                     allocator=self._allocator)
+
+        self._update_num_tokens_total()
+
+    def _update_num_tokens_total(self):
+        """Incrementally computes the number of tokens that there is
+        till the current block (included)
+        """
+        res = 0
+
+        # Add all previous blocks
+        if self._prev_block is not None:
+            res += self._prev_block.num_tokens_total
+
+        # Add current block
+        res += len(self.token_ids)
+
+        self._cached_num_tokens_total = res
 
     @property
     def computed(self) -> bool:
@@ -564,22 +719,28 @@ class PrefixCachingBlock(Block):
         """Appends the given token IDs to the block and registers the block as
         immutable if the block becomes full.
 
-        Internally, the naive block handles CoW.
-
         Args:
             token_ids (List[int]): The token IDs to be appended to the block.
         """
-        assert token_ids
+        # Ensure this is mutable block (not promoted)
+        assert self.content_hash is None
+        assert not self.computed
+
+        if len(token_ids) == 0:
+            return
 
-        # naive block handles CoW.
+        # Ensure there are input tokens
+        assert token_ids, "Got token_ids = {}".format(token_ids)
+
+        # Naive block handles CoW.
         self._block.append_token_ids(token_ids)
+        self._update_num_tokens_total()
 
         # If the content hash is present, then the block can be made immutable.
         # Register ourselves with the allocator, potentially replacing the
         # physical block index.
         if self.content_hash is not None:
-            self.block_id = (self._prefix_caching_allocator.
-                             promote_to_immutable_block(self))
+            self.block_id = self._allocator.promote_to_immutable_block(self)
 
     @property
     def block_id(self) -> Optional[int]:
@@ -599,23 +760,6 @@ class PrefixCachingBlock(Block):
 
     @property
     def num_tokens_total(self) -> int:
-        """return the total tokens so far.
-
-        Here we iterate the block chain till to the first block, while
-        cache the result in local to prevent repeated computations.
-        """
-        if self._cached_num_tokens_total is not None:
-            return self._cached_num_tokens_total
-
-        _block: Optional[Block] = self
-        self._cached_num_tokens_total = 0
-
-        # TODO: current implement here take O(N^2), we expect future
-        # we have O(1) here
-        while _block is not None:
-            self._cached_num_tokens_total += len(_block.token_ids)
-            _block = _block.prev_block
-
         return self._cached_num_tokens_total
 
     @property
@@ -638,7 +782,6 @@ class PrefixCachingBlock(Block):
         For the content-based hash to be defined, the current block must be
         full.
         """
-
         # If the hash is already computed, return it.
         if self._cached_content_hash is not None:
             return self._cached_content_hash
@@ -688,7 +831,129 @@ class PrefixCachingBlock(Block):
         return hash((is_first_block, prev_block_hash, *cur_block_token_ids))
 
 
+class ComputedBlocksTracker:
+    """Handles caching of per-sequence computed block ids. 
+        When a sequence appears for the first time, it traverses all of the 
+        blocks and detects the prefix of blocks that is computed. On the
+        subsequent times, it only traverses the new blocks that were added 
+        and updates the already recorded prefix of blocks with the newly 
+        computed blocks.
+
+        To avoid redundant traversals, the algorithm also detects when there
+        is a "gap" in the computed prefix. For example, if we have blocks =
+        [1,2,3,4,5], and we have detected [1,2,3] as the computed prefix, then
+        we won't try to add more computed blocks to [1,2,3] in this sequence
+        iteration, and will add more computed blocks only after the sequence is
+        freed and reused again.
+
+        Note that currently, for a given sequence, we also skip the last 
+        block id for caching purposes, to avoid caching of a full sequence
+    """
+
+    def __init__(self, allocator):
+        self._allocator = allocator
+        self._cached_computed_seq_blocks: Dict[int, Tuple[List[int],
+                                                          bool]] = {}
+
+    def add_seq(self, seq_id: int) -> None:
+        """Start tracking seq_id
+        """
+        assert seq_id not in self._cached_computed_seq_blocks
+        self._cached_computed_seq_blocks[seq_id] = ([], False)
+
+    def remove_seq(self, seq_id: int) -> None:
+        """Stop tracking seq_id
+        """
+        assert seq_id in self._cached_computed_seq_blocks
+        del self._cached_computed_seq_blocks[seq_id]
+
+    def get_cached_computed_blocks_and_update(
+            self, seq_id: int, block_ids: List[int]) -> List[int]:
+        """ Look at the class documentation for details
+        """
+        # Ensure seq_id is already tracked
+        assert seq_id in self._cached_computed_seq_blocks
+
+        # Get cached data (may be empty on the first time)
+        prev_computed_block_ids, has_gap = self._cached_computed_seq_blocks[
+            seq_id]
+
+        if has_gap:
+            # When gap is detected, we do not add more computed blocks at this
+            # sequence iteration
+            return prev_computed_block_ids
+
+        # We do not consider the last block id for caching purposes.
+        num_cur_blocks = len(block_ids) - 1
+        assert num_cur_blocks >= 0
+
+        if len(prev_computed_block_ids) >= num_cur_blocks:
+            # Cache HIT
+            assert len(prev_computed_block_ids) == num_cur_blocks
+            return prev_computed_block_ids
+
+        # If here, then we may possibly add more computed blocks. As a result,
+        # traverse the additional blocks after prev_computed_block_ids to
+        # detect more computed blocks and add them.
+
+        # Incremental init for seq_id => Look only at the new blocks
+        computed_block_ids = self._allocator.get_computed_block_ids(  # noqa: E501
+            prev_computed_block_ids,
+            block_ids,
+            skip_last_block_id=
+            True,  # We skip last block id to avoid caching of full seq
+        )
+
+        # Detect if there is a "gap"
+        has_gap = len(computed_block_ids) < num_cur_blocks
+
+        # Record
+        self._cached_computed_seq_blocks[seq_id] = (computed_block_ids,
+                                                    has_gap)
+
+        return computed_block_ids
+
+
+class LastAccessBlocksTracker:
+    """Manages the last access time of the tracked sequences, in order to allow
+    an efficient update of allocator's block last access times
+    """
+
+    def __init__(self, allocator):
+        self._allocator = allocator
+        self._seq_last_access: Dict[int, Optional[float]] = {}
+
+    def add_seq(self, seq_id: int) -> None:
+        """Start tracking seq_id
+        """
+        assert seq_id not in self._seq_last_access
+        self._seq_last_access[seq_id] = None
+
+    def remove_seq(self, seq_id: int) -> None:
+        """Stop tracking seq_id
+        """
+        assert seq_id in self._seq_last_access
+        del self._seq_last_access[seq_id]
+
+    def update_last_access(self, seq_id: int, time: float) -> None:
+        assert seq_id in self._seq_last_access
+        self._seq_last_access[seq_id] = time
+
+    def update_seq_blocks_last_access(self, seq_id: int,
+                                      block_ids: List[int]) -> None:
+        assert seq_id in self._seq_last_access
+
+        ts = self._seq_last_access[seq_id]
+
+        if ts is None:
+            # No last access was recorded, no need to update.
+            return
+
+        self._allocator.mark_blocks_as_accessed(block_ids, ts)
+
+
 def assert_prefix_caching_block_or_none(block: Optional[Block]):
     if block is None:
         return
-    assert isinstance(block, PrefixCachingBlock)
+    assert isinstance(block,
+                      PrefixCachingBlock), "Got block = {}".format(block)
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py
index 309775237..6a6eebc39 100644
--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
@@ -7,6 +7,8 @@ from typing import Tuple
 from vllm.core.block.block_table import BlockTable
 from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
 from vllm.core.block.interfaces import Block
+from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker,
+                                                  LastAccessBlocksTracker)
 from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec
 from vllm.core.interfaces import AllocStatus, BlockSpaceManager
 from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
@@ -100,6 +102,11 @@ class BlockSpaceManagerV2(BlockSpaceManager):
         self.block_tables: Dict[SeqId, BlockTable] = {}
         self.cross_block_tables: Dict[EncoderSeqId, BlockTable] = {}
 
+        self._computed_blocks_tracker = ComputedBlocksTracker(
+            self.block_allocator)
+        self._last_access_blocks_tracker = LastAccessBlocksTracker(
+            self.block_allocator)
+
     def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
         # FIXME(woosuk): Here we assume that all sequences in the group share
         # the same prompt. This may not be true for preempted sequences.
@@ -157,10 +164,18 @@ class BlockSpaceManagerV2(BlockSpaceManager):
         block_table: BlockTable = self._allocate_sequence(seq)
         self.block_tables[seq.seq_id] = block_table
 
+        # Track seq
+        self._computed_blocks_tracker.add_seq(seq.seq_id)
+        self._last_access_blocks_tracker.add_seq(seq.seq_id)
+
         # Assign the block table for each sequence.
         for seq in waiting_seqs[1:]:
             self.block_tables[seq.seq_id] = block_table.fork()
 
+            # Track seq
+            self._computed_blocks_tracker.add_seq(seq.seq_id)
+            self._last_access_blocks_tracker.add_seq(seq.seq_id)
+
         # Allocate cross-attention block table for encoder sequence
         #
         # NOTE: Here we assume that all sequences in the group have the same
@@ -224,11 +239,23 @@ class BlockSpaceManagerV2(BlockSpaceManager):
         return new_cows
 
     def free(self, seq: Sequence) -> None:
-        if seq.seq_id not in self.block_tables:
+        seq_id = seq.seq_id
+
+        if seq_id not in self.block_tables:
             # Already freed or haven't been scheduled yet.
             return
-        self.block_tables[seq.seq_id].free()
-        del self.block_tables[seq.seq_id]
+
+        # Update seq block ids with the latest access time
+        self._last_access_blocks_tracker.update_seq_blocks_last_access(
+            seq_id, self.block_tables[seq.seq_id].physical_block_ids)
+
+        # Untrack seq
+        self._last_access_blocks_tracker.remove_seq(seq_id)
+        self._computed_blocks_tracker.remove_seq(seq_id)
+
+        # Free table/blocks
+        self.block_tables[seq_id].free()
+        del self.block_tables[seq_id]
 
     def free_cross(self, seq_group: SequenceGroup) -> None:
         request_id = seq_group.request_id
@@ -239,9 +266,7 @@ class BlockSpaceManagerV2(BlockSpaceManager):
         del self.cross_block_tables[request_id]
 
     def get_block_table(self, seq: Sequence) -> List[int]:
-        assert seq.seq_id in self.block_tables
         block_ids = self.block_tables[seq.seq_id].physical_block_ids
-        assert all(b is not None for b in block_ids)
         return block_ids  # type: ignore
 
     def get_cross_block_table(self, seq_group: SequenceGroup) -> List[int]:
@@ -252,20 +277,14 @@ class BlockSpaceManagerV2(BlockSpaceManager):
         return block_ids  # type: ignore
 
     def access_all_blocks_in_seq(self, seq: Sequence, now: float):
-        # Update the last accessed time of all the blocks accessed
-        # in this step.
-        # And the accessed time is only useful for prefix caching now,
-        # as it support internal evictor policy for which cached
-        # block could be refilled, to keep cached content could be reused
-        # at max extend.
         if self.enable_caching:
-            block_table = self.block_tables[seq.seq_id]
-            block_ids: List[Optional[int]] = []
-            for block_id in block_table.physical_block_ids:
-                block_ids.append(block_id)
-            self.block_allocator.mark_blocks_as_accessed(
-                block_ids,  # type: ignore
-                now)
+            # Record the latest access time for the sequence. The actual update
+            # of the block ids is deferred to the sequence free(..) call, since
+            # only during freeing of block ids, the blocks are actually added to
+            # the evictor (which is when the most updated time is required)
+            # (This avoids expensive calls to mark_blocks_as_accessed(..))
+            self._last_access_blocks_tracker.update_last_access(
+                seq.seq_id, now)
 
     def mark_blocks_as_computed(self, seq_group: SequenceGroup):
         # The only need for mark block as computed is for prefix caching,
@@ -285,17 +304,26 @@ class BlockSpaceManagerV2(BlockSpaceManager):
         This method determines which blocks can be safely skipped for all
         sequences in the sequence group.
         """
-        seq_block_ids = [
-            self.block_tables[seq.seq_id].physical_block_ids for seq in seqs
-        ]
+        computed_seq_block_ids = []
+        for seq in seqs:
+            computed_seq_block_ids.append(
+                self._computed_blocks_tracker.
+                get_cached_computed_blocks_and_update(
+                    seq.seq_id,
+                    self.block_tables[seq.seq_id].physical_block_ids))
+
         # NOTE(sang): This assumes seq_block_ids doesn't contain any None.
         return self.block_allocator.get_common_computed_block_ids(
-            seq_block_ids)  # type: ignore
+            computed_seq_block_ids)  # type: ignore
 
     def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
         src_block_table = self.block_tables[parent_seq.seq_id]
         self.block_tables[child_seq.seq_id] = src_block_table.fork()
 
+        # Track child seq
+        self._computed_blocks_tracker.add_seq(child_seq.seq_id)
+        self._last_access_blocks_tracker.add_seq(child_seq.seq_id)
+
     def can_swap_in(self, seq_group: SequenceGroup,
                     num_lookahead_slots: int) -> AllocStatus:
         """Returns the AllocStatus for the given sequence_group 
@@ -323,19 +351,31 @@ class BlockSpaceManagerV2(BlockSpaceManager):
             List[Tuple[int, int]]: The mapping of swapping block from CPU 
                 to GPU.
         """
-        blocks = self._get_blocks_for_swap(seq_group, SequenceStatus.SWAPPED)
-        current_swap_mapping = self.block_allocator.swap(
-            blocks=blocks, source_device=Device.CPU, dest_device=Device.GPU)
-
-        block_number_mapping = {
-            self.block_allocator.get_physical_block_id(Device.CPU,
-                                                       cpu_block_id):
-            self.block_allocator.get_physical_block_id(Device.GPU,
-                                                       gpu_block_id)
-            for cpu_block_id, gpu_block_id in current_swap_mapping.items()
-        }
-        # convert to list of tuples once here
-        return list(block_number_mapping.items())
+        physical_block_id_mapping = []
+        for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
+            blocks = self.block_tables[seq.seq_id].blocks
+            if len(blocks) == 0:
+                continue
+
+            seq_swap_mapping = self.block_allocator.swap(blocks=blocks,
+                                                         src_device=Device.CPU,
+                                                         dst_device=Device.GPU)
+
+            # Refresh the block ids of the table (post-swap)
+            self.block_tables[seq.seq_id].update(blocks)
+
+            seq_physical_block_id_mapping = {
+                self.block_allocator.get_physical_block_id(
+                    Device.CPU, cpu_block_id):
+                self.block_allocator.get_physical_block_id(
+                    Device.GPU, gpu_block_id)
+                for cpu_block_id, gpu_block_id in seq_swap_mapping.items()
+            }
+
+            physical_block_id_mapping.extend(
+                list(seq_physical_block_id_mapping.items()))
+
+        return physical_block_id_mapping
 
     def can_swap_out(self, seq_group: SequenceGroup) -> bool:
         """Returns whether we can swap out the given sequence_group 
@@ -355,7 +395,7 @@ class BlockSpaceManagerV2(BlockSpaceManager):
             return True
         return False
 
-    def swap_out(self, sequence_group: SequenceGroup) -> List[Tuple[int, int]]:
+    def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
         """Returns the block id mapping (from GPU to CPU) generated by
         swapping out the given sequence_group with num_lookahead_slots.
 
@@ -366,19 +406,31 @@ class BlockSpaceManagerV2(BlockSpaceManager):
             List[Tuple[int, int]]: The mapping of swapping block from 
                 GPU to CPU.
         """
-        blocks = self._get_blocks_for_swap(sequence_group,
-                                           SequenceStatus.RUNNING)
-        current_swap_mapping = self.block_allocator.swap(
-            blocks=blocks, source_device=Device.GPU, dest_device=Device.CPU)
-        block_number_mapping = {
-            self.block_allocator.get_physical_block_id(Device.GPU,
-                                                       gpu_block_id):
-            self.block_allocator.get_physical_block_id(Device.CPU,
-                                                       cpu_block_id)
-            for gpu_block_id, cpu_block_id in current_swap_mapping.items()
-        }
-        # convert to list of tuples once here
-        return list(block_number_mapping.items())
+        physical_block_id_mapping = []
+        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
+            blocks = self.block_tables[seq.seq_id].blocks
+            if len(blocks) == 0:
+                continue
+
+            seq_swap_mapping = self.block_allocator.swap(blocks=blocks,
+                                                         src_device=Device.GPU,
+                                                         dst_device=Device.CPU)
+
+            # Refresh the block ids of the table (post-swap)
+            self.block_tables[seq.seq_id].update(blocks)
+
+            seq_physical_block_id_mapping = {
+                self.block_allocator.get_physical_block_id(
+                    Device.GPU, gpu_block_id):
+                self.block_allocator.get_physical_block_id(
+                    Device.CPU, cpu_block_id)
+                for gpu_block_id, cpu_block_id in seq_swap_mapping.items()
+            }
+
+            physical_block_id_mapping.extend(
+                list(seq_physical_block_id_mapping.items()))
+
+        return physical_block_id_mapping
 
     def get_num_free_gpu_blocks(self) -> int:
         return self.block_allocator.get_num_free_blocks(Device.GPU)
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 5886ebc24..c13b17471 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -177,7 +177,8 @@ class LLMEngine:
             "enforce_eager=%s, kv_cache_dtype=%s, "
             "quantization_param_path=%s, device_config=%s, "
             "decoding_config=%r, observability_config=%r, "
-            "seed=%d, served_model_name=%s)",
+            "seed=%d, served_model_name=%s, use_v2_block_manager=%s, "
+            "enable_prefix_caching=%s)",
             VLLM_VERSION,
             model_config.model,
             speculative_config,
@@ -204,6 +205,8 @@ class LLMEngine:
             observability_config,
             model_config.seed,
             model_config.served_model_name,
+            scheduler_config.use_v2_block_manager,
+            cache_config.enable_prefix_caching,
         )
         # TODO(woosuk): Print more configs in debug mode.
 
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 8741893c9..1bd095655 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -345,7 +345,7 @@ class OpenAIServingCompletion(OpenAIServing):
                     out_logprobs = prompt_logprobs
                     output_text = prompt_text
                 elif request.echo and request.max_tokens > 0:
-                    token_ids = prompt_token_ids + output.token_ids
+                    token_ids = prompt_token_ids + list(output.token_ids)
                     out_logprobs = (prompt_logprobs + output.logprobs
                                     if request.logprobs is not None else None)
                     output_text = prompt_text + output.text
diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py
index f95de56f3..ad5fb1317 100644
--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@@ -427,8 +427,8 @@ class SamplingTensors:
                 if seq_group.do_sample:
                     for seq_id in seq_ids:
                         seq_data = seq_group.seq_data[seq_id]
-                        prompt_tokens.append(seq_data.prompt_token_ids)
-                        output_tokens.append(seq_data.output_token_ids)
+                        prompt_tokens.append(list(seq_data.prompt_token_ids))
+                        output_tokens.append(list(seq_data.output_token_ids))
 
         sampling_tensors = SamplingTensors.from_lists(
             temperatures, top_ps, top_ks, min_ps, presence_penalties,
diff --git a/vllm/outputs.py b/vllm/outputs.py
index 49f526b5f..4cb7f06bd 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -1,6 +1,6 @@
 import time
 from dataclasses import dataclass
-from typing import List, Optional, Union
+from typing import List, Optional, Tuple, Union
 
 from vllm.lora.request import LoRARequest
 from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs,
@@ -28,7 +28,7 @@ class CompletionOutput:
 
     index: int
     text: str
-    token_ids: List[int]
+    token_ids: Tuple[int, ...]
     cumulative_logprob: float
     logprobs: Optional[SampleLogprobs]
     finish_reason: Optional[str] = None
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 22cb26dc0..21c558d44 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -116,41 +116,66 @@ class SequenceData:
         prompt_token_ids: List[int],
         output_token_ids: Optional[List[int]] = None,
     ) -> None:
-        if output_token_ids is None:
-            output_token_ids = []
+        self._prompt_token_ids: List[int] = list(prompt_token_ids)
+        self._prompt_token_ids_tuple: Tuple[int, ...] = tuple(prompt_token_ids)
+        self._output_token_ids: List[int] = (
+            list(output_token_ids) if output_token_ids is not None else [])
 
-        self.prompt_token_ids = prompt_token_ids
-        self._prompt_token_ids_tuple = tuple(prompt_token_ids)
-        self.output_token_ids = output_token_ids
         self.cumulative_logprob = 0.0
         # The number of tokens that are computed (that run against the model).
         self._num_computed_tokens = 0
         self._stage: SequenceStage = SequenceStage.PREFILL
 
+        self._update_cached_all_tokens()
+
+    def _update_cached_all_tokens(self):
+        self._cached_all_token_ids: List[int] = (self._prompt_token_ids +
+                                                 self._output_token_ids)
+
+    @property
+    def prompt_token_ids(self) -> Tuple[int, ...]:
+        return self._prompt_token_ids_tuple
+
+    @prompt_token_ids.setter
+    def prompt_token_ids(self, new_prompt_token_ids) -> None:
+        self._prompt_token_ids = list(new_prompt_token_ids)
+        self._prompt_token_ids_tuple = tuple(new_prompt_token_ids)
+        self._update_cached_all_tokens()
+
+    @property
+    def output_token_ids(self) -> Tuple[int, ...]:
+        return tuple(self._output_token_ids)
+
+    @output_token_ids.setter
+    def output_token_ids(self, new_output_token_ids) -> None:
+        self._output_token_ids = list(new_output_token_ids)
+        self._update_cached_all_tokens()
+
     def append_token_id(self, token_id: int, logprob: float) -> None:
-        self.output_token_ids.append(token_id)
+        self._output_token_ids.append(token_id)
+        self._cached_all_token_ids.append(token_id)
         self.cumulative_logprob += logprob
 
     def get_len(self) -> int:
-        return len(self.output_token_ids) + len(self.prompt_token_ids)
+        return len(self._output_token_ids) + len(self._prompt_token_ids)
 
     def get_prompt_len(self) -> int:
-        return len(self.prompt_token_ids)
+        return len(self._prompt_token_ids)
 
     def get_output_len(self) -> int:
-        return len(self.output_token_ids)
+        return len(self._output_token_ids)
 
     def get_token_ids(self) -> List[int]:
-        return self.prompt_token_ids + self.output_token_ids
+        return self._cached_all_token_ids
 
     def get_prefix_token_ids(
             self, num_tokens: int
     ) -> Tuple[Tuple[int, ...], Optional[Tuple[int, ...]]]:
         """Get prefix tokens, and make the return value hashable"""
-        prompt_length = len(self.prompt_token_ids)
+        prompt_length = self.get_prompt_len()
         if num_tokens > prompt_length:
             return (self._prompt_token_ids_tuple,
-                    tuple(self.output_token_ids[:num_tokens - prompt_length]))
+                    tuple(self._output_token_ids[:num_tokens - prompt_length]))
         else:
             return (self._prompt_token_ids_tuple[:num_tokens], None)
 
@@ -183,14 +208,14 @@ class SequenceData:
         return self.get_len() - self.get_num_computed_tokens()
 
     def get_last_token_id(self) -> int:
-        if not self.output_token_ids:
-            return self.prompt_token_ids[-1]
-        return self.output_token_ids[-1]
+        if not self._output_token_ids:
+            return self._prompt_token_ids[-1]
+        return self._output_token_ids[-1]
 
-    def get_prompt_token_ids(self) -> List[int]:
+    def get_prompt_token_ids(self) -> Tuple[int, ...]:
         return self.prompt_token_ids
 
-    def get_output_token_ids(self) -> List[int]:
+    def get_output_token_ids(self) -> Tuple[int, ...]:
         return self.output_token_ids
 
     @property
@@ -199,8 +224,8 @@ class SequenceData:
 
     def __repr__(self) -> str:
         return (f"SequenceData("
-                f"prompt_token_ids={self.prompt_token_ids}, "
-                f"output_token_ids={self.output_token_ids}, "
+                f"prompt_token_ids={self._prompt_token_ids}, "
+                f"output_token_ids={self._output_token_ids}, "
                 f"cumulative_logprob={self.cumulative_logprob})")
 
 
@@ -306,14 +331,14 @@ class Sequence:
     def get_token_ids(self) -> List[int]:
         return self.data.get_token_ids()
 
-    def get_prompt_token_ids(self) -> List[int]:
+    def get_prompt_token_ids(self) -> Tuple[int, ...]:
         return self.data.get_prompt_token_ids()
 
     def get_last_token_id(self) -> int:
         return self.data.get_last_token_id()
 
-    def get_output_token_ids(self) -> List[int]:
-        return self.data.output_token_ids
+    def get_output_token_ids(self) -> Tuple[int, ...]:
+        return self.data.get_output_token_ids()
 
     def get_cumulative_logprob(self) -> float:
         return self.data.cumulative_logprob
-- 
GitLab


From 2c37540aa6af89f0ece874d831dff3bf62edf486 Mon Sep 17 00:00:00 2001
From: danieljannai21 <100521221+danieljannai21@users.noreply.github.com>
Date: Tue, 2 Jul 2024 09:01:57 +0300
Subject: [PATCH 235/376] [Frontend] Add template related params to request
 (#5709)

---
 requirements-common.txt                 |  2 +-
 vllm/entrypoints/openai/protocol.py     | 21 +++++++++++++++++++++
 vllm/entrypoints/openai/serving_chat.py |  8 ++++++++
 3 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index 636f85343..765568b03 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -6,7 +6,7 @@ numpy < 2.0.0
 requests
 tqdm
 py-cpuinfo
-transformers >= 4.42.0  # Required for Gemma 2.
+transformers >= 4.42.0  # Required for Gemma 2 and for additional chat template parameters.
 tokenizers >= 0.19.1  # Required for Llama 3.
 fastapi
 aiohttp
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index d1568cb3a..7f97e534e 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -190,6 +190,27 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "special tokens so this should be set to False (as is the "
             "default)."),
     )
+    documents: Optional[List[Dict[str, str]]] = Field(
+        default=None,
+        description=
+        ("A list of dicts representing documents that will be accessible to "
+         "the model if it is performing RAG (retrieval-augmented generation)."
+         " If the template does not support RAG, this argument will have no "
+         "effect. We recommend that each document should be a dict containing "
+         "\"title\" and \"text\" keys."),
+    )
+    chat_template: Optional[str] = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "If this is not passed, the model's default chat template will be "
+            "used instead."),
+    )
+    chat_template_kwargs: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description=("Additional kwargs to pass to the template renderer. "
+                     "Will be accessible by the chat template."),
+    )
     include_stop_str_in_output: Optional[bool] = Field(
         default=False,
         description=(
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 744e1d945..4a960fd7e 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -218,10 +218,18 @@ class OpenAIServingChat(OpenAIServing):
                 conversation.extend(chat_parsed_result.messages)
                 image_futures.extend(chat_parsed_result.image_futures)
 
+            tool_dicts = None if request.tools is None else [
+                tool.model_dump() for tool in request.tools
+            ]
+
             prompt = self.tokenizer.apply_chat_template(
                 conversation=conversation,
                 tokenize=False,
                 add_generation_prompt=request.add_generation_prompt,
+                tools=tool_dicts,
+                documents=request.documents,
+                chat_template=request.chat_template,
+                **(request.chat_template_kwargs or {}),
             )
         except Exception as e:
             logger.error("Error in applying chat template from request: %s", e)
-- 
GitLab


From 98d6682cd1f27fa48bf915d3fd3e1eb1ee3014c4 Mon Sep 17 00:00:00 2001
From: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com>
Date: Tue, 2 Jul 2024 00:57:09 -0700
Subject: [PATCH 236/376] [VLM] Remove `image_input_type` from VLM config
 (#5852)

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
---
 .buildkite/download-images.sh                 |   4 -
 docs/requirements-docs.txt                    |  16 +--
 .../dev/multimodal/multimodal_index.rst       |   8 +-
 docs/source/models/vlm.rst                    |  11 +-
 examples/llava_example.py                     |  56 ++------
 examples/llava_next_example.py                |  61 +++++----
 examples/openai_vision_api_client.py          |   1 -
 examples/phi3v_example.py                     |   6 +-
 tests/conftest.py                             |  38 ++----
 tests/entrypoints/openai/test_vision.py       |   2 -
 tests/models/test_llava.py                    |  22 ++-
 tests/models/test_llava_next.py               |  23 ++--
 tests/models/test_phi3v.py                    |  21 ++-
 tests/multimodal/test_mapper.py               |  40 +-----
 tests/spec_decode/e2e/conftest.py             |   4 +-
 tests/tokenization/test_image_processor.py    |  20 ---
 vllm/config.py                                |  34 +----
 vllm/engine/arg_utils.py                      |  56 +-------
 vllm/entrypoints/openai/api_server.py         |   9 --
 vllm/entrypoints/openai/serving_chat.py       |  65 ++++-----
 vllm/inputs/data.py                           |  11 +-
 vllm/inputs/registry.py                       |   7 +-
 vllm/model_executor/model_loader/loader.py    |   5 +-
 vllm/model_executor/models/clip.py            |  20 +--
 vllm/model_executor/models/llava.py           | 102 +++-----------
 vllm/model_executor/models/llava_next.py      | 126 ++++++------------
 vllm/model_executor/models/phi3v.py           |  25 ++--
 vllm/multimodal/__init__.py                   |   8 +-
 vllm/multimodal/base.py                       |  53 ++++----
 vllm/multimodal/image.py                      |  93 ++-----------
 vllm/multimodal/registry.py                   |  96 +++++++------
 vllm/multimodal/utils.py                      |  13 +-
 vllm/sequence.py                              |  10 +-
 vllm/transformers_utils/image_processor.py    |   4 -
 vllm/worker/model_runner.py                   |   2 +-
 35 files changed, 325 insertions(+), 747 deletions(-)
 delete mode 100644 tests/tokenization/test_image_processor.py

diff --git a/.buildkite/download-images.sh b/.buildkite/download-images.sh
index 389a12956..360a7584b 100644
--- a/.buildkite/download-images.sh
+++ b/.buildkite/download-images.sh
@@ -8,10 +8,6 @@ set -o pipefail
 # aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
 mkdir -p images
 cd images
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
 wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
 wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
 
diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index ed5698162..db076b2d8 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -1,13 +1,5 @@
-sphinx == 6.2.1
-sphinx-book-theme == 1.0.1
-sphinx-copybutton == 0.5.2
-myst-parser == 2.0.0
+sphinx==6.2.1
+sphinx-book-theme==1.0.1
+sphinx-copybutton==0.5.2
+myst-parser==2.0.0
 sphinx-argparse
-
-# packages to install to build the documentation
-pydantic
--f https://download.pytorch.org/whl/cpu
-torch
-py-cpuinfo
-transformers
-openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst
index f6fdfc1de..4d5fb3246 100644
--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
@@ -9,8 +9,10 @@ vLLM provides experimental support for multi-modal models through the :mod:`vllm
 which allows you to pass in multi-modal input alongside text and token prompts.
 
 By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model,
-you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data <MultiModalRegistry.register_dummy_data>`,
-as well as :meth:`MULTIMODAL_REGISTRY.register_input <MultiModalRegistry.register_input>` for each modality type to support.
+you must decorate the model class with :meth:`InputRegistry.register_dummy_data <vllm.inputs.registry.InputRegistry.register_dummy_data>`,
+as well as :meth:`MULTIMODAL_REGISTRY.register_input_mapper <MultiModalRegistry.register_input_mapper>` for each modality type to support.
+
+# TODO: Add more instructions on how to do that once embeddings is in.
 
 Module Contents
 +++++++++++++++
@@ -29,7 +31,7 @@ Registry
 Base Classes
 ------------
 
-.. autoclass:: vllm.multimodal.MultiModalData
+.. autoclass:: vllm.multimodal.MultiModalDataDict
     :members:
     :show-inheritance:
 
diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 1837dd2aa..053f5b860 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -36,7 +36,6 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
 
     llm = LLM(
         model="llava-hf/llava-1.5-7b-hf",
-        image_input_type="pixel_values",
         image_token_id=32000,
         image_input_shape="1,3,336,336",
         image_feature_size=576,
@@ -49,7 +48,12 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
 To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
 
 * ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
-* ``multi_modal_data``: This should be an instance of :class:`~vllm.multimodal.image.ImagePixelData` or :class:`~vllm.multimodal.image.ImageFeatureData`.
+* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`. 
+
+.. note::
+
+   ``multi_modal_data`` can accept keys and values beyond the builtin ones, as long as a customized plugin is registered through
+    :class:`vllm.multimodal.MULTIMODAL_REGISTRY`.
 
 .. code-block:: python
 
@@ -61,7 +65,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptS
 
     outputs = llm.generate({
         "prompt": prompt,
-        "multi_modal_data": ImagePixelData(image),
+        "multi_modal_data": {"image": image},
     })
 
     for o in outputs:
@@ -93,7 +97,6 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with
 
     python -m vllm.entrypoints.openai.api_server \
         --model llava-hf/llava-1.5-7b-hf \
-        --image-input-type pixel_values \
         --image-token-id 32000 \
         --image-input-shape 1,3,336,336 \
         --image-feature-size 576 \
diff --git a/examples/llava_example.py b/examples/llava_example.py
index 980d7bf9f..7f3d84f99 100644
--- a/examples/llava_example.py
+++ b/examples/llava_example.py
@@ -1,38 +1,32 @@
-import argparse
 import os
 import subprocess
 
-import torch
 from PIL import Image
 
 from vllm import LLM
-from vllm.multimodal.image import ImageFeatureData, ImagePixelData
 
 # The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
 # You can use `.buildkite/download-images.sh` to download them
 
 
-def run_llava_pixel_values(*, disable_image_processor: bool = False):
+def run_llava():
     llm = LLM(
         model="llava-hf/llava-1.5-7b-hf",
-        image_input_type="pixel_values",
         image_token_id=32000,
         image_input_shape="1,3,336,336",
         image_feature_size=576,
-        disable_image_processor=disable_image_processor,
     )
 
     prompt = "<image>" * 576 + (
         "\nUSER: What is the content of this image?\nASSISTANT:")
 
-    if disable_image_processor:
-        image = torch.load("images/stop_sign_pixel_values.pt")
-    else:
-        image = Image.open("images/stop_sign.jpg")
+    image = Image.open("images/stop_sign.jpg")
 
     outputs = llm.generate({
         "prompt": prompt,
-        "multi_modal_data": ImagePixelData(image),
+        "multi_modal_data": {
+            "image": image
+        },
     })
 
     for o in outputs:
@@ -40,45 +34,11 @@ def run_llava_pixel_values(*, disable_image_processor: bool = False):
         print(generated_text)
 
 
-def run_llava_image_features():
-    llm = LLM(
-        model="llava-hf/llava-1.5-7b-hf",
-        image_input_type="image_features",
-        image_token_id=32000,
-        image_input_shape="1,576,1024",
-        image_feature_size=576,
-    )
-
-    prompt = "<image>" * 576 + (
-        "\nUSER: What is the content of this image?\nASSISTANT:")
-
-    image: torch.Tensor = torch.load("images/stop_sign_image_features.pt")
-
-    outputs = llm.generate({
-        "prompt": prompt,
-        "multi_modal_data": ImageFeatureData(image),
-    })
-
-    for o in outputs:
-        generated_text = o.outputs[0].text
-        print(generated_text)
-
-
-def main(args):
-    if args.type == "pixel_values":
-        run_llava_pixel_values()
-    else:
-        run_llava_image_features()
+def main():
+    run_llava()
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Demo on Llava")
-    parser.add_argument("--type",
-                        type=str,
-                        choices=["pixel_values", "image_features"],
-                        default="pixel_values",
-                        help="image input type")
-    args = parser.parse_args()
     # Download from s3
     s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
     local_directory = "images"
@@ -95,4 +55,4 @@ if __name__ == "__main__":
         local_directory,
         "--no-sign-request",
     ])
-    main(args)
+    main()
diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py
index e90a86abe..3c39590e7 100644
--- a/examples/llava_next_example.py
+++ b/examples/llava_next_example.py
@@ -4,35 +4,44 @@ import requests
 from PIL import Image
 
 from vllm import LLM, SamplingParams
-from vllm.multimodal.image import ImagePixelData
 
 # Dynamic image input is currently not supported and therefore
 # a fixed image input shape and its corresponding feature size is required.
 # See https://github.com/vllm-project/vllm/pull/4199 for the complete
 # configuration matrix.
 
-llm = LLM(
-    model="llava-hf/llava-v1.6-mistral-7b-hf",
-    image_input_type="pixel_values",
-    image_token_id=32000,
-    image_input_shape="1,3,336,336",
-    image_feature_size=1176,
-)
-
-prompt = "[INST] " + "<image>" * 1176 + "\nWhat is shown in this image? [/INST]"
-url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
-image = Image.open(BytesIO(requests.get(url).content))
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=100)
-
-outputs = llm.generate(
-    {
-        "prompt": prompt,
-        "multi_modal_data": ImagePixelData(image),
-    },
-    sampling_params=sampling_params)
-
-generated_text = ""
-for o in outputs:
-    generated_text += o.outputs[0].text
-
-print(f"LLM output:{generated_text}")
+
+def run_llava_next():
+    llm = LLM(
+        model="llava-hf/llava-v1.6-mistral-7b-hf",
+        image_token_id=32000,
+        image_input_shape="1,3,336,336",
+        image_feature_size=1176,
+    )
+
+    prompt = "[INST] " + "<image>" * 1176 + (
+        "\nWhat is shown in this image? [/INST]")
+    url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
+    image = Image.open(BytesIO(requests.get(url).content))
+    sampling_params = SamplingParams(temperature=0.8,
+                                     top_p=0.95,
+                                     max_tokens=100)
+
+    outputs = llm.generate(
+        {
+            "prompt": prompt,
+            "multi_modal_data": {
+                "image": image
+            }
+        },
+        sampling_params=sampling_params)
+
+    generated_text = ""
+    for o in outputs:
+        generated_text += o.outputs[0].text
+
+    print(f"LLM output:{generated_text}")
+
+
+if __name__ == "__main__":
+    run_llava_next()
diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py
index 26f2aa651..fcda1345f 100644
--- a/examples/openai_vision_api_client.py
+++ b/examples/openai_vision_api_client.py
@@ -3,7 +3,6 @@
 Launch the vLLM server with the following command:
 python -m vllm.entrypoints.openai.api_server \
     --model llava-hf/llava-1.5-7b-hf \
-    --image-input-type pixel_values \
     --image-token-id 32000 \
     --image-input-shape 1,3,336,336 \
     --image-feature-size 576 \
diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py
index f0b9b0e4f..7d6c58d7f 100644
--- a/examples/phi3v_example.py
+++ b/examples/phi3v_example.py
@@ -4,7 +4,6 @@ import subprocess
 from PIL import Image
 
 from vllm import LLM, SamplingParams
-from vllm.multimodal.image import ImagePixelData
 
 
 def run_phi3v():
@@ -17,7 +16,6 @@ def run_phi3v():
     llm = LLM(
         model=model_path,
         trust_remote_code=True,
-        image_input_type="pixel_values",
         image_token_id=32044,
         image_input_shape="1,3,1008,1344",
         image_feature_size=1921,
@@ -35,7 +33,9 @@ def run_phi3v():
     outputs = llm.generate(
         {
             "prompt": prompt,
-            "multi_modal_data": ImagePixelData(image),
+            "multi_modal_data": {
+                "image": image
+            },
         },
         sampling_params=sampling_params)
     for o in outputs:
diff --git a/tests/conftest.py b/tests/conftest.py
index ac802d03b..c3bd78263 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -17,19 +17,17 @@ from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
                           AutoTokenizer, BatchEncoding)
 
 from vllm import LLM, SamplingParams
-from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
+from vllm.config import TokenizerPoolConfig
 from vllm.distributed import (destroy_distributed_environment,
                               destroy_model_parallel)
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
+from vllm.sequence import SampleLogprobs
+from vllm.utils import cuda_device_count_stateless, is_cpu
 
 if TYPE_CHECKING:
-    from vllm.multimodal import MultiModalData
-else:
     # it will call torch.cuda.device_count()
-    MultiModalData = None
-from vllm.sequence import SampleLogprobs
-from vllm.utils import cuda_device_count_stateless, is_cpu
+    from vllm.multimodal import MultiModalDataDict
 
 logger = init_logger(__name__)
 
@@ -51,14 +49,6 @@ def _read_prompts(filename: str) -> List[str]:
 class ImageAsset:
     name: Literal["stop_sign", "cherry_blossom"]
 
-    @cached_property
-    def pixel_values(self) -> torch.Tensor:
-        return torch.load(_IMAGE_DIR / f"{self.name}_pixel_values.pt")
-
-    @cached_property
-    def image_features(self) -> torch.Tensor:
-        return torch.load(_IMAGE_DIR / f"{self.name}_image_features.pt")
-
     @cached_property
     def pil_image(self) -> Image.Image:
         return Image.open(_IMAGE_DIR / f"{self.name}.jpg")
@@ -66,20 +56,8 @@ class ImageAsset:
     def for_hf(self) -> Image.Image:
         return self.pil_image
 
-    def for_vllm(self, vision_config: VisionLanguageConfig) -> MultiModalData:
-        # don't put this import at the top level
-        # it will call torch.cuda.device_count()
-        from vllm.multimodal.image import ImageFeatureData  # noqa: F401
-        from vllm.multimodal.image import ImagePixelData
-        image_input_type = vision_config.image_input_type
-        ImageInputType = VisionLanguageConfig.ImageInputType
-
-        if image_input_type == ImageInputType.IMAGE_FEATURES:
-            return ImageFeatureData(self.image_features)
-        if image_input_type == ImageInputType.PIXEL_VALUES:
-            return ImagePixelData(self.pil_image)
-
-        raise NotImplementedError
+    def for_vllm(self) -> Dict[str, Any]:
+        return {"image": self.pil_image}
 
 
 class _ImageAssetPrompts(TypedDict):
@@ -453,7 +431,7 @@ class VllmRunner:
         self,
         prompts: List[str],
         sampling_params: SamplingParams,
-        images: Optional[List[MultiModalData]] = None,
+        images: Optional[List["MultiModalDataDict"]] = None,
     ) -> List[Tuple[List[List[int]], List[str]]]:
         if images is not None:
             assert len(prompts) == len(images)
@@ -502,7 +480,7 @@ class VllmRunner:
         self,
         prompts: List[str],
         max_tokens: int,
-        images: Optional[List[MultiModalData]] = None,
+        images: Optional[List["MultiModalDataDict"]] = None,
     ) -> List[Tuple[List[int], str]]:
         greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
         outputs = self.generate(prompts, greedy_params, images=images)
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index dbaaa349a..a7f7fdae8 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -39,8 +39,6 @@ def server():
         "--max-model-len",
         "4096",
         "--enforce-eager",
-        "--image-input-type",
-        "pixel_values",
         "--image-token-id",
         "32000",
         "--image-input-shape",
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index b4220dc59..c6313c52e 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -25,17 +25,11 @@ def iter_llava_configs(model_name: str):
     }
 
     for (h, w), f in image_hw_to_feature_size.items():
-        for input_type, input_shape in [
-            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
-            (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)),
-        ]:
-            yield (model_name,
-                   VisionLanguageConfig(image_input_type=input_type,
-                                        image_feature_size=f,
-                                        image_token_id=32000,
-                                        image_input_shape=input_shape,
-                                        image_processor=model_name,
-                                        image_processor_revision=None))
+        input_shape = (1, 3, h, w)
+        yield (model_name,
+               VisionLanguageConfig(image_feature_size=f,
+                                    image_token_id=32000,
+                                    image_input_shape=input_shape))
 
 
 model_and_vl_config = [
@@ -81,8 +75,8 @@ def run_test(
 
     All the image fixtures for the test is under tests/images.
     For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalData objects and corresponding
-    vision language config as input.
+    For vllm runner, we provide MultiModalDataDict objects 
+    and corresponding vision language config as input.
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
@@ -104,7 +98,7 @@ def run_test(
         # NOTE: `asset.for_vllm` will call `torch.cuda.device_count()`
         # we must put it inside the vllm_runner context manager
         # i.e. after creating vLLM instance.
-        vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
+        vllm_images = [asset.for_vllm() for asset in image_assets]
 
         vllm_image_prompts = [
             p.replace("<image>", "<image>" * vlm_config.image_feature_size)
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 940d5035e..e9babba13 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -33,16 +33,13 @@ def iter_llava_next_configs(model_name: str):
     }
 
     for (h, w), f in image_hw_to_feature_size.items():
-        for input_type, input_shape in [
-            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
-        ]:
-            yield (model_name,
-                   VisionLanguageConfig(image_input_type=input_type,
-                                        image_feature_size=f,
-                                        image_token_id=32000,
-                                        image_input_shape=input_shape,
-                                        image_processor=model_name,
-                                        image_processor_revision=None))
+        input_shape = (1, 3, h, w)
+        yield (model_name,
+               VisionLanguageConfig(
+                   image_feature_size=f,
+                   image_token_id=32000,
+                   image_input_shape=input_shape,
+               ))
 
 
 model_and_vl_config = [
@@ -85,14 +82,14 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
 
     All the image fixtures for the test is under tests/images.
     For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalData objects and corresponding
-    vision language config as input.
+    For vllm runner, we provide MultiModalDataDict objects 
+    and corresponding vision language config as input.
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
     model_id, vlm_config = model_and_config
     hf_images = [asset.for_hf() for asset in image_assets]
-    vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
+    vllm_images = [asset.for_vllm() for asset in image_assets]
 
     with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
         hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index ba71763f9..917bdbf94 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -27,16 +27,11 @@ def iter_phi3v_configs(model_name: str):
     }
 
     for (h, w), f in image_hw_to_feature_size.items():
-        for input_type, input_shape in [
-            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
-        ]:
-            yield (model_name,
-                   VisionLanguageConfig(image_input_type=input_type,
-                                        image_feature_size=f,
-                                        image_token_id=32044,
-                                        image_input_shape=input_shape,
-                                        image_processor=model_name,
-                                        image_processor_revision=None))
+        input_shape = (1, 3, h, w)
+        yield (model_name,
+               VisionLanguageConfig(image_feature_size=f,
+                                    image_token_id=32044,
+                                    image_input_shape=input_shape))
 
 
 model_and_vl_config = [
@@ -89,8 +84,8 @@ def run_test(
 
     All the image fixtures for the test is under tests/images.
     For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalData objects and corresponding
-    vision language config as input.
+    For vllm runner, we provide MultiModalDataDict objects 
+    and corresponding vision language config as input.
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
@@ -113,7 +108,7 @@ def run_test(
         # we must put it inside the vllm_runner context manager
         # i.e. after creating vLLM instance.
 
-        vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
+        vllm_images = [asset.for_vllm() for asset in image_assets]
 
         vllm_image_prompts = [
             p.replace("<|image_1|>",
diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py
index 2c05b0edb..bdbbd9abf 100644
--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
@@ -2,9 +2,8 @@ import numpy as np
 import pytest
 from transformers import CLIPImageProcessor, LlavaNextImageProcessor
 
-from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.config import ModelConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import ImagePixelData
 
 from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE
 
@@ -12,7 +11,6 @@ from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE
 @pytest.mark.parametrize("dtype", ["half", "float"])
 def test_clip_image_processor(image_assets, dtype):
     MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
-    IMAGE_HEIGHT = IMAGE_WIDTH = 560
 
     hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
     assert isinstance(hf_processor, CLIPImageProcessor)
@@ -25,14 +23,6 @@ def test_clip_image_processor(image_assets, dtype):
         seed=0,
         dtype=dtype,
         revision=None,
-        multimodal_config=VisionLanguageConfig(
-            image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-            image_token_id=32000,
-            image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
-            image_feature_size=576,
-            image_processor=MODEL_NAME,
-            image_processor_revision=None,
-        ),
     )
 
     for asset in image_assets:
@@ -42,7 +32,7 @@ def test_clip_image_processor(image_assets, dtype):
         ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
         vllm_result = MULTIMODAL_REGISTRY.map_input(
             model_config,
-            ImagePixelData(asset.pil_image),
+            {"image": asset.pil_image},
         )
 
         assert hf_result.keys() == vllm_result.keys()
@@ -60,7 +50,6 @@ def test_clip_image_processor(image_assets, dtype):
 @pytest.mark.parametrize("dtype", ["half", "float"])
 def test_llava_next_image_processor(image_assets, dtype):
     MODEL_NAME = "llava-hf/llava-v1.6-34b-hf"
-    IMAGE_HEIGHT = IMAGE_WIDTH = 560
 
     hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)
     assert isinstance(hf_processor, LlavaNextImageProcessor)
@@ -73,14 +62,6 @@ def test_llava_next_image_processor(image_assets, dtype):
         seed=0,
         dtype=dtype,
         revision=None,
-        multimodal_config=VisionLanguageConfig(
-            image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-            image_token_id=64000,
-            image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
-            image_feature_size=2928,
-            image_processor=MODEL_NAME,
-            image_processor_revision=None,
-        ),
     )
 
     for asset in image_assets:
@@ -90,7 +71,7 @@ def test_llava_next_image_processor(image_assets, dtype):
         ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
         vllm_result = MULTIMODAL_REGISTRY.map_input(
             model_config,
-            ImagePixelData(asset.pil_image),
+            {"image": asset.pil_image},
         )
 
         assert hf_result.keys() == vllm_result.keys()
@@ -107,7 +88,6 @@ def test_llava_next_image_processor(image_assets, dtype):
 @pytest.mark.parametrize("dtype", ["float"])
 def test_image_pixel_types(image_assets, dtype):
     MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
-    IMAGE_HEIGHT = IMAGE_WIDTH = 560
 
     model_config = ModelConfig(
         model=MODEL_NAME,
@@ -117,23 +97,15 @@ def test_image_pixel_types(image_assets, dtype):
         seed=0,
         dtype=dtype,
         revision=None,
-        multimodal_config=VisionLanguageConfig(
-            image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-            image_token_id=32000,
-            image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
-            image_feature_size=576,
-            image_processor=MODEL_NAME,
-            image_processor_revision=None,
-        ))
-
+    )
     for asset in image_assets:
         image_result = MULTIMODAL_REGISTRY.map_input(
             model_config,
-            ImagePixelData(asset.pil_image),
+            {"image": asset.pil_image},
         )
         tensor_result = MULTIMODAL_REGISTRY.map_input(
             model_config,
-            ImagePixelData(asset.pixel_values),
+            {"image": asset.pil_image},
         )
 
         assert image_result.keys() == tensor_result.keys()
diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index 60dfe33f2..8ad8e9cb8 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -11,7 +11,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.utils import set_random_seed
-from vllm.multimodal import MultiModalData
+from vllm.multimodal import MultiModalDataDict
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import Logprob
@@ -91,7 +91,7 @@ class AsyncLLM:
         prompt_token_ids: Optional[List[List[int]]] = None,
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
+        multi_modal_data: Optional[MultiModalDataDict] = None,
     ) -> List[RequestOutput]:
 
         if prompts is None:
diff --git a/tests/tokenization/test_image_processor.py b/tests/tokenization/test_image_processor.py
deleted file mode 100644
index 5ba232336..000000000
--- a/tests/tokenization/test_image_processor.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import pytest
-from transformers.image_processing_utils import BaseImageProcessor
-
-from vllm.transformers_utils.image_processor import get_image_processor
-
-IMAGE_PROCESSOR_NAMES = [
-    "llava-hf/llava-1.5-7b-hf",
-    "llava-hf/llava-v1.6-34b-hf",
-]
-
-
-@pytest.mark.parametrize("processor_name", IMAGE_PROCESSOR_NAMES)
-def test_image_processor_revision(processor_name: str):
-    # Assume that "main" branch always exists
-    image_processor = get_image_processor(processor_name, revision="main")
-    assert isinstance(image_processor, BaseImageProcessor)
-
-    # Assume that "never" branch always does not exist
-    with pytest.raises(OSError, match='not a valid git identifier'):
-        get_image_processor(processor_name, revision="never")
diff --git a/vllm/config.py b/vllm/config.py
index 9854f1750..b919b212d 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1250,28 +1250,11 @@ class LoRAConfig:
             raise ValueError("LoRA is not supported with chunked prefill yet.")
 
 
+# TODO: To be replaced by MultiModalConfig.
 @dataclass
 class VisionLanguageConfig:
     """Configs the input data format and how models should run for
     vision language models."""
-
-    class ImageInputType(enum.Enum):
-        """Image input type into the vision language model.
-
-        An image roughly goes through the following transformation:
-        Raw image --> pixel values --> image features --> image embeddings.
-
-        The difference between different image input types is where the
-        image encoder (pixel values --> image features) is run.
-        Different image input types also correspond to different tensor shapes.
-
-        For example, for Llava, PIXEL_VALUES: (1, 3, 336, 336).
-        IMAGE_FEATURES: (1, 576, 1024).
-        """
-        PIXEL_VALUES = enum.auto()
-        IMAGE_FEATURES = enum.auto()
-
-    image_input_type: ImageInputType
     # The input id corresponding to image token.
     image_token_id: int
     # Used for running `run_prefill_max_token`.
@@ -1279,19 +1262,6 @@ class VisionLanguageConfig:
     # worst case scenario (biggest supported resolution).
     image_input_shape: tuple
     image_feature_size: int
-    # The image processor to load from HuggingFace
-    image_processor: Optional[str]
-    image_processor_revision: Optional[str]
-
-    @classmethod
-    def get_image_input_enum_type(cls, value: str) -> ImageInputType:
-        """Get the image input type from a string."""
-        try:
-            return cls.ImageInputType[value.upper()]
-        except KeyError as e:
-            raise ValueError(f"{value} is not a valid choice. "
-                             f"Expecting to choose from "
-                             f"{[x.name for x in cls.ImageInputType]}.") from e
 
     #TODO(ywang96): make this a cached property once we refactor the
     # VisionLanguageConfig class.
@@ -1318,8 +1288,6 @@ class VisionLanguageConfig:
             else:
                 result[f.name] = value
 
-        result["disable_image_processor"] = self.image_processor is None
-
         return result
 
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index d4044adfc..565b9e779 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1,7 +1,6 @@
 import argparse
 import dataclasses
 import json
-import warnings
 from dataclasses import dataclass
 from typing import List, Optional, Tuple, Union
 
@@ -80,13 +79,9 @@ class EngineArgs:
     preemption_mode: Optional[str] = None
 
     # Related to Vision-language models such as llava
-    image_input_type: Optional[str] = None
     image_token_id: Optional[int] = None
     image_input_shape: Optional[str] = None
     image_feature_size: Optional[int] = None
-    image_processor: Optional[str] = None
-    image_processor_revision: Optional[str] = None
-    disable_image_processor: bool = False
 
     scheduler_delay_factor: float = 0.0
     enable_chunked_prefill: bool = False
@@ -114,14 +109,6 @@ class EngineArgs:
     @staticmethod
     def add_cli_args_for_vlm(
             parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
-        parser.add_argument('--image-input-type',
-                            type=nullable_str,
-                            default=None,
-                            choices=[
-                                t.name.lower()
-                                for t in VisionLanguageConfig.ImageInputType
-                            ],
-                            help=('The image input type passed into vLLM.'))
         parser.add_argument('--image-token-id',
                             type=int,
                             default=None,
@@ -137,24 +124,6 @@ class EngineArgs:
             type=int,
             default=None,
             help=('The image feature size along the context dimension.'))
-        parser.add_argument(
-            '--image-processor',
-            type=str,
-            default=EngineArgs.image_processor,
-            help='Name or path of the huggingface image processor to use. '
-            'If unspecified, model name or path will be used.')
-        parser.add_argument(
-            '--image-processor-revision',
-            type=str,
-            default=None,
-            help='Revision of the huggingface image processor version to use. '
-            'It can be a branch name, a tag name, or a commit id. '
-            'If unspecified, will use the default version.')
-        parser.add_argument(
-            '--disable-image-processor',
-            action='store_true',
-            help='Disables the use of image processor, even if one is defined '
-            'for the model on huggingface.')
 
         return parser
 
@@ -679,33 +648,16 @@ class EngineArgs:
             raise ValueError(
                 "BitsAndBytes load format and QLoRA adapter only support "
                 f"'bitsandbytes' quantization, but got {self.quantization}")
-        if self.image_input_type:
-            if (not self.image_token_id or not self.image_input_shape
-                    or not self.image_feature_size):
+        if self.image_token_id is not None:
+            if (not self.image_input_shape or not self.image_feature_size):
                 raise ValueError(
-                    'Specify `image_token_id`, `image_input_shape` and '
-                    '`image_feature_size` together with `image_input_type`.')
-
-            if self.image_processor is None:
-                self.image_processor = self.model
-            if self.disable_image_processor:
-                if self.image_processor != self.model:
-                    warnings.warn(
-                        "You've specified an image processor "
-                        f"({self.image_processor}) but also disabled "
-                        "it via `--disable-image-processor`.",
-                        stacklevel=2)
-
-                self.image_processor = None
+                    'Specify `image_input_shape` and '
+                    '`image_feature_size` together with `image_token_id`.')
 
             vision_language_config = VisionLanguageConfig(
-                image_input_type=VisionLanguageConfig.
-                get_image_input_enum_type(self.image_input_type),
                 image_token_id=self.image_token_id,
                 image_input_shape=str_to_int_tuple(self.image_input_shape),
                 image_feature_size=self.image_feature_size,
-                image_processor=self.image_processor,
-                image_processor_revision=self.image_processor_revision,
             )
         else:
             vision_language_config = None
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index a708176c2..76879c96c 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -213,15 +213,6 @@ if __name__ == "__main__":
 
     engine_args = AsyncEngineArgs.from_cli_args(args)
 
-    # Enforce pixel values as image input type for vision language models
-    # when serving with API server
-    if engine_args.image_input_type is not None and \
-        engine_args.image_input_type.upper() != "PIXEL_VALUES":
-        raise ValueError(
-            f"Invalid image_input_type: {engine_args.image_input_type}. "
-            "Only --image-input-type 'pixel_values' is supported for serving "
-            "vision language models with the vLLM API server.")
-
     engine = AsyncLLMEngine.from_engine_args(
         engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
 
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 4a960fd7e..e5b6b7f57 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -26,7 +26,7 @@ from vllm.inputs import PromptInputs
 from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding import (
     get_guided_decoding_logits_processor)
-from vllm.multimodal.image import ImagePixelData
+from vllm.multimodal import MultiModalDataDict
 from vllm.multimodal.utils import (async_get_and_parse_image,
                                    get_full_image_text_prompt)
 from vllm.outputs import RequestOutput
@@ -47,7 +47,7 @@ class ConversationMessage(TypedDict):
 @dataclass(frozen=True)
 class ChatMessageParseResult:
     messages: List[ConversationMessage]
-    image_futures: List[Awaitable[ImagePixelData]] = field(
+    mm_futures: List[Awaitable[MultiModalDataDict]] = field(
         default_factory=list)
 
 
@@ -103,7 +103,7 @@ class OpenAIServingChat(OpenAIServing):
         parts: Iterable[ChatCompletionContentPartParam],
     ) -> ChatMessageParseResult:
         texts: List[str] = []
-        image_futures: List[Awaitable[ImagePixelData]] = []
+        mm_futures: List[Awaitable[MultiModalDataDict]] = []
 
         vlm_config: Optional[VisionLanguageConfig] = getattr(
             self.engine.engine, "vision_language_config", None)
@@ -113,39 +113,34 @@ class OpenAIServingChat(OpenAIServing):
             part_type = part["type"]
             if part_type == "text":
                 text = cast(ChatCompletionContentPartTextParam, part)["text"]
-
                 texts.append(text)
             elif part_type == "image_url":
                 if vlm_config is None:
                     raise ValueError(
                         "'image_url' input is not supported as the loaded "
                         "model is not multimodal.")
+                assert self.tokenizer is not None
+                image_url = cast(ChatCompletionContentPartImageParam,
+                                 part)["image_url"]
 
-                elif len(image_futures) == 0:
-                    assert self.tokenizer is not None
-                    image_url = cast(ChatCompletionContentPartImageParam,
-                                     part)["image_url"]
-
-                    if image_url.get("detail", "auto") != "auto":
-                        logger.warning(
-                            "'image_url.detail' is currently not supported and "
-                            "will be ignored.")
-
-                    image_future = async_get_and_parse_image(image_url["url"])
-                    image_futures.append(image_future)
+                if image_url.get("detail", "auto") != "auto":
+                    logger.warning(
+                        "'image_url.detail' is currently not supported and "
+                        "will be ignored.")
 
-                else:
-                    raise NotImplementedError(
-                        "Multiple 'image_url' input is currently not supported."
-                    )
+                mm_future = async_get_and_parse_image(image_url["url"])
+                mm_futures.append(mm_future)
 
             else:
                 raise NotImplementedError(f"Unknown part type: {part_type}")
 
         text_prompt = "\n".join(texts)
 
-        if vlm_config is not None and len(image_futures):
+        if vlm_config is not None and len(mm_futures):
 
+            assert len(
+                mm_futures
+            ) == 1, "Multiple 'image_url' input is currently not supported."
             (image_token_prompt,
              image_token_str) = vlm_config.get_image_token_text(self.tokenizer)
 
@@ -171,8 +166,7 @@ class OpenAIServingChat(OpenAIServing):
         else:
             messages = [ConversationMessage(role=role, content=text_prompt)]
 
-        return ChatMessageParseResult(messages=messages,
-                                      image_futures=image_futures)
+        return ChatMessageParseResult(messages=messages, mm_futures=mm_futures)
 
     def _parse_chat_message_content(
         self,
@@ -182,10 +176,10 @@ class OpenAIServingChat(OpenAIServing):
         content = message.get("content")
 
         if content is None:
-            return ChatMessageParseResult(messages=[], image_futures=[])
+            return ChatMessageParseResult(messages=[], mm_futures=[])
         if isinstance(content, str):
             messages = [ConversationMessage(role=role, content=content)]
-            return ChatMessageParseResult(messages=messages, image_futures=[])
+            return ChatMessageParseResult(messages=messages, mm_futures=[])
 
         return self._parse_chat_message_content_parts(role, content)
 
@@ -210,13 +204,13 @@ class OpenAIServingChat(OpenAIServing):
 
         try:
             conversation: List[ConversationMessage] = []
-            image_futures: List[Awaitable[ImagePixelData]] = []
+            mm_futures: List[Awaitable[MultiModalDataDict]] = []
 
             for msg in request.messages:
                 chat_parsed_result = self._parse_chat_message_content(msg)
 
                 conversation.extend(chat_parsed_result.messages)
-                image_futures.extend(chat_parsed_result.image_futures)
+                mm_futures.extend(chat_parsed_result.mm_futures)
 
             tool_dicts = None if request.tools is None else [
                 tool.model_dump() for tool in request.tools
@@ -235,15 +229,14 @@ class OpenAIServingChat(OpenAIServing):
             logger.error("Error in applying chat template from request: %s", e)
             return self.create_error_response(str(e))
 
-        # Fetch image data
-        image_data: Optional[ImagePixelData] = None
+        mm_data: Optional[MultiModalDataDict] = None
         try:
-            if len(image_futures):
-                # since we support only single image currently
-                assert len(image_futures) == 1
-                image_data = await image_futures[0]
+            if len(mm_futures):
+                # since we support only single mm data currently
+                assert len(mm_futures) == 1
+                mm_data = await mm_futures[0]
         except Exception as e:
-            logger.error("Error in loading image data: %s", e)
+            logger.error("Error in loading multi-modal data: %s", e)
             return self.create_error_response(str(e))
 
         request_id = f"cmpl-{random_uuid()}"
@@ -274,8 +267,8 @@ class OpenAIServingChat(OpenAIServing):
             "prompt": prompt_text,
             "prompt_token_ids": prompt_ids,
         }
-        if image_data is not None:
-            inputs["multi_modal_data"] = image_data
+        if mm_data is not None:
+            inputs["multi_modal_data"] = mm_data
 
         is_tracing_enabled = await self.engine.is_tracing_enabled()
         trace_headers = None
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index 9b163b9cf..c6381fcc0 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -4,7 +4,7 @@ from typing import (TYPE_CHECKING, List, Literal, Optional, Sequence,
 from typing_extensions import NotRequired
 
 if TYPE_CHECKING:
-    from vllm.multimodal import MultiModalData
+    from vllm.multimodal import MultiModalDataDict
 
 
 class ParsedText(TypedDict):
@@ -72,7 +72,7 @@ class TextPrompt(TypedDict):
     prompt: str
     """The input text to be tokenized before passing to the model."""
 
-    multi_modal_data: NotRequired["MultiModalData"]
+    multi_modal_data: NotRequired["MultiModalDataDict"]
     """
     Optional multi-modal data to pass to the model,
     if the model supports it.
@@ -85,7 +85,7 @@ class TokensPrompt(TypedDict):
     prompt_token_ids: List[int]
     """A list of token IDs to pass to the model."""
 
-    multi_modal_data: NotRequired["MultiModalData"]
+    multi_modal_data: NotRequired["MultiModalDataDict"]
     """
     Optional multi-modal data to pass to the model,
     if the model supports it.
@@ -103,7 +103,7 @@ class TextTokensPrompt(TypedDict):
     prompt_token_ids: List[int]
     """The token IDs of the prompt."""
 
-    multi_modal_data: NotRequired["MultiModalData"]
+    multi_modal_data: NotRequired["MultiModalDataDict"]
     """
     Optional multi-modal data to pass to the model,
     if the model supports it.
@@ -128,7 +128,6 @@ class LLMInputs(TypedDict):
     The inputs in :class:`~vllm.LLMEngine` before they are
     passed to the model executor.
     """
-
     prompt_token_ids: List[int]
     """The token IDs of the prompt."""
 
@@ -137,7 +136,7 @@ class LLMInputs(TypedDict):
     The original prompt text corresponding to the token IDs, if available.
     """
 
-    multi_modal_data: NotRequired[Optional["MultiModalData"]]
+    multi_modal_data: NotRequired[Optional["MultiModalDataDict"]]
     """
     Optional multi-modal data to pass to the model,
     if the model supports it.
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 8f4e108b8..3e2873338 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -12,7 +12,7 @@ from .data import LLMInputs
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VisionLanguageConfig
-    from vllm.multimodal import MultiModalData
+    from vllm.multimodal import MultiModalDataDict
     from vllm.sequence import SequenceData
 
 logger = init_logger(__name__)
@@ -66,7 +66,8 @@ class InputContext:
 N = TypeVar("N", bound=Type[nn.Module])
 
 DummyDataFactory = Callable[[InputContext, int],
-                            Tuple["SequenceData", Optional["MultiModalData"]]]
+                            Tuple["SequenceData",
+                                  Optional["MultiModalDataDict"]]]
 """
 Create dummy data to be inputted into the model.
 
@@ -94,7 +95,7 @@ class InputRegistry:
         self,
         ctx: InputContext,
         seq_len: int,
-    ) -> Tuple["SequenceData", Optional["MultiModalData"]]:
+    ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]:
         """
         The default dummy data factory represents the longest possible text
         that can be inputted to the model.
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 361dc7322..b61ac7490 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -84,9 +84,8 @@ def _get_model_initialization_kwargs(
 
     if supports_vision(model_class):
         if vlm_config is None:
-            raise ValueError("Provide `image_input_type` and other vision "
-                             "related configurations through LLM entrypoint "
-                             "or engine arguments.")
+            raise ValueError("Provide vision related configurations "
+                             "through LLM entrypoint or engine arguments.")
 
         extra_kwargs["vlm_config"] = vlm_config
 
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index 77fbade05..5212e2808 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -12,7 +12,6 @@ from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.multimodal.image import ImageFeatureData, ImagePixelData
 from vllm.sequence import SequenceData
 
 
@@ -49,7 +48,7 @@ def dummy_seq_data_for_clip(
     return SequenceData(token_ids)
 
 
-def dummy_pixel_data_for_clip(
+def dummy_image_for_clip(
     hf_config: CLIPVisionConfig,
     *,
     image_width_override: Optional[int] = None,
@@ -62,22 +61,7 @@ def dummy_pixel_data_for_clip(
         height = image_height_override
 
     image = Image.new("RGB", (width, height), color=0)
-    return ImagePixelData(image)
-
-
-def dummy_feature_data_for_clip(
-    hf_config: CLIPVisionConfig,
-    *,
-    image_feature_size_override: Optional[int] = None,
-):
-    if image_feature_size_override is None:
-        image_feature_size = get_clip_image_feature_size(hf_config)
-    else:
-        image_feature_size = image_feature_size_override
-
-    values = torch.zeros((1, image_feature_size, hf_config.hidden_size),
-                         dtype=torch.float16)
-    return ImageFeatureData(values)
+    return {"image": image}
 
 
 # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index ba4496f9c..e0134c5c4 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -1,4 +1,4 @@
-from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union
+from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
 
 import torch
 import torch.nn as nn
@@ -17,11 +17,10 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import SamplerOutput
 
-from .clip import (dummy_feature_data_for_clip, dummy_pixel_data_for_clip,
-                   dummy_seq_data_for_clip)
+from .clip import dummy_image_for_clip, dummy_seq_data_for_clip
 from .interfaces import SupportsVision
 
 _KEYS_TO_MODIFY_MAPPING = {
@@ -76,17 +75,10 @@ class LlavaImagePixelInputs(TypedDict):
     """Shape: (batch_size, num_channels, height, width)"""
 
 
-class LlavaImageFeatureInputs(TypedDict):
-    type: Literal["image_features"]
-    data: torch.Tensor
-    """Shape: (batch_size, image_feature_size, hidden_size)"""
-
-
-LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageFeatureInputs]
+LlavaImageInputs = LlavaImagePixelInputs
 
 
 def dummy_data_for_llava(ctx: InputContext, seq_len: int):
-    multimodal_config = ctx.get_multimodal_config()
     hf_config = ctx.get_hf_config(LlavaConfig)
     vision_config = hf_config.vision_config
 
@@ -97,22 +89,14 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int):
             image_token_id=hf_config.image_token_index,
         )
 
-        image_input_type = multimodal_config.image_input_type
-        ImageInputType = VisionLanguageConfig.ImageInputType
-        mm_data: MultiModalData
-        if image_input_type == ImageInputType.PIXEL_VALUES:
-            mm_data = dummy_pixel_data_for_clip(vision_config)
-        elif image_input_type == ImageInputType.IMAGE_FEATURES:
-            mm_data = dummy_feature_data_for_clip(vision_config)
-
+        mm_data = dummy_image_for_clip(vision_config)
         return seq_data, mm_data
 
     msg = f"Unsupported vision config: {type(vision_config)}"
     raise NotImplementedError(msg)
 
 
-@MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
-@MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
+@MULTIMODAL_REGISTRY.register_image_input_mapper()
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava)
 class LlavaForConditionalGeneration(nn.Module, SupportsVision):
 
@@ -126,11 +110,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
         self.config = config
         self.vlm_config = vlm_config
 
-        if self.vlm_config.image_input_type == (
-                VisionLanguageConfig.ImageInputType.PIXEL_VALUES):
-            self.vision_tower = CLIPVisionModel(config.vision_config)
-        else:
-            self.vision_tower = None
+        # TODO: Optionally initializes this for supporting embeddings.
+        self.vision_tower = CLIPVisionModel(config.vision_config)
 
         self.multi_modal_projector = LlavaMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
@@ -165,44 +146,18 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[LlavaImageInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
-        image_features = kwargs.pop("image_features", None)
-
-        expected_input_type = self.vlm_config.image_input_type
-        ImageInputType = VisionLanguageConfig.ImageInputType
-
-        if expected_input_type == ImageInputType.PIXEL_VALUES:
-            if image_features is not None:
-                raise ValueError(
-                    "Expected pixel values but got image features")
-            if pixel_values is None:
-                return None
-
-            if not isinstance(pixel_values, torch.Tensor):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
 
-            return LlavaImagePixelInputs(
-                type="pixel_values",
-                data=self._validate_image_data(pixel_values),
-            )
+        if pixel_values is None:
+            return None
 
-        if expected_input_type == ImageInputType.IMAGE_FEATURES:
-            if pixel_values is not None:
-                raise ValueError(
-                    "Expected image features but got pixel values")
-            if image_features is None:
-                return None
+        if not isinstance(pixel_values, torch.Tensor):
+            raise ValueError("Incorrect type of pixel values. "
+                             f"Got type: {type(pixel_values)}")
 
-            if not isinstance(image_features, torch.Tensor):
-                raise ValueError("Incorrect type of image features. "
-                                 f"Got type: {type(image_features)}")
-
-            return LlavaImageFeatureInputs(
-                type="image_features",
-                data=self._validate_image_data(image_features),
-            )
-
-        return None
+        return LlavaImagePixelInputs(
+            type="pixel_values",
+            data=self._validate_image_data(pixel_values),
+        )
 
     def _select_image_features(self, image_features: torch.Tensor, *,
                                strategy: str) -> torch.Tensor:
@@ -237,12 +192,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
 
     def _process_image_input(self,
                              image_input: LlavaImageInputs) -> torch.Tensor:
-        if image_input["type"] == "pixel_values":
-            assert self.vision_tower is not None
-            image_features = self._process_image_pixels(image_input)
-        else:
-            image_features = image_input["data"]
-
+        assert self.vision_tower is not None
+        image_features = self._process_image_pixels(image_input)
         return self.multi_modal_projector(image_features)
 
     def forward(
@@ -273,25 +224,10 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
         This way, the `positions` and `attn_metadata` are consistent
         with the `input_ids`.
 
-        This model has two modes of image inputs:
-        `PIXEL_VALUES` and `IMAGE_FEATURES`.
-
         Args:
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
             pixel_values: The pixels in each input image.
-                Expects a batch with shape `[1, 3, 336, 336]`.
-                (Only applicable to `PIXEL_VALUES` mode)
-            image_features: The image features for each input image outputted by
-                the vision tower before passing to the multi-modal projector.
-                Expects a batch with shape `[1, 576, 1024]`.
-                (Only applicable to `IMAGE_FEATURES` mode)
-
-        See also:
-            Each input maps to huggingface implementation, as follows:
-
-            - `pixel_values`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava/modeling_llava.py#L360
-            - `image_features`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava/modeling_llava.py#L437
         """
         image_input = self._parse_and_validate_image_input(**kwargs)
 
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 281431074..3c0988137 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -1,8 +1,8 @@
-from typing import (Dict, Iterable, List, Literal, Optional, Tuple, TypedDict,
-                    Union)
+from typing import Dict, Iterable, List, Literal, Optional, Tuple, TypedDict
 
 import torch
 import torch.nn as nn
+from PIL import Image
 from transformers import CLIPVisionConfig, LlavaNextConfig
 from transformers.models.llava_next.modeling_llava_next import (
     get_anyres_image_grid_shape, unpad_image)
@@ -21,12 +21,11 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
-from vllm.multimodal.image import ImagePixelData
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import SamplerOutput
 
-from .clip import (dummy_feature_data_for_clip, dummy_pixel_data_for_clip,
-                   dummy_seq_data_for_clip, get_clip_patch_grid_length)
+from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip,
+                   get_clip_patch_grid_length)
 from .interfaces import SupportsVision
 from .llava import LlavaMultiModalProjector, merge_vision_embeddings
 
@@ -47,17 +46,7 @@ class LlavaNextImagePixelInputs(TypedDict):
     """Shape: (batch_size, 2)"""
 
 
-class LlavaNextImageFeatureInputs(TypedDict):
-    type: Literal["image_features"]
-    data: torch.Tensor
-    """Shape: (batch_size, 1 + num_patches, image_feature_size, hidden_size)"""
-
-    image_sizes: NotRequired[torch.Tensor]
-    """Shape: (batch_size, 2)"""
-
-
-LlavaNextImageInputs = Union[LlavaNextImagePixelInputs,
-                             LlavaNextImageFeatureInputs]
+LlavaNextImageInputs = LlavaNextImagePixelInputs
 
 
 def _get_llava_next_num_unpadded_features(
@@ -138,20 +127,11 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int):
             image_feature_size_override=image_feature_size,
         )
 
-        image_input_type = multimodal_config.image_input_type
-        ImageInputType = VisionLanguageConfig.ImageInputType
-        mm_data: MultiModalData
-        if image_input_type == ImageInputType.PIXEL_VALUES:
-            mm_data = dummy_pixel_data_for_clip(
-                vision_config,
-                image_width_override=dummy_width,
-                image_height_override=dummy_height,
-            )
-        elif image_input_type == ImageInputType.IMAGE_FEATURES:
-            mm_data = dummy_feature_data_for_clip(
-                vision_config,
-                image_feature_size_override=image_feature_size,
-            )
+        mm_data = dummy_image_for_clip(
+            vision_config,
+            image_width_override=dummy_width,
+            image_height_override=dummy_height,
+        )
 
         return seq_data, mm_data
 
@@ -159,32 +139,26 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int):
     raise NotImplementedError(msg)
 
 
-def _pixel_mapper(ctx: InputContext,
-                  data: ImagePixelData) -> Dict[str, torch.Tensor]:
-    image = data.image
+def _pixel_mapper(ctx: InputContext, image: object) -> Dict[str, torch.Tensor]:
 
-    if isinstance(image, torch.Tensor):
-        pixel_values = image.to(ctx.model_config.dtype)
-        batch_size, _, _, h, w = pixel_values.shape
-        image_sizes = torch.tensor([(w, h) for _ in range(batch_size)])
+    if isinstance(image, Image.Image):
 
-        return {"pixel_values": pixel_values, "image_sizes": image_sizes}
+        # Temporary patch before dynamic number of image tokens is supported
+        _, _, h, w = ctx.get_multimodal_config().image_input_shape
+        if (w, h) != (image.width, image.height):
+            logger.warning(
+                "Dynamic image shape is currently not supported. "
+                "Resizing input image to (%d, %d).", w, h)
 
-    # Temporary patch before dynamic number of image tokens is supported
-    _, _, h, w = ctx.get_multimodal_config().image_input_shape
-    if (w, h) != (image.width, image.height):
-        logger.warning(
-            "Dynamic image shape is currently not supported. "
-            "Resizing input image to (%d, %d).", w, h)
+            image = image.resize((w, h))
 
-        data.image = image.resize((w, h))
+        return MULTIMODAL_REGISTRY._get_plugin("image") \
+            ._default_input_mapper(ctx, image)
 
-    return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \
-        ._default_input_mapper(ctx, data)
+    raise TypeError(f"Invalid type for 'image': {type(image)}")
 
 
-@MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
-@MULTIMODAL_REGISTRY.register_image_pixel_input_mapper(_pixel_mapper)
+@MULTIMODAL_REGISTRY.register_image_input_mapper(_pixel_mapper)
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next)
 class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
 
@@ -198,11 +172,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
         self.config = config
         self.vlm_config = vlm_config
 
-        if self.vlm_config.image_input_type == (
-                VisionLanguageConfig.ImageInputType.PIXEL_VALUES):
-            self.vision_tower = CLIPVisionModel(config=config.vision_config)
-        else:
-            raise TypeError("Image features are not supported by LLaVA-NeXT")
+        self.vision_tower = CLIPVisionModel(config=config.vision_config)
 
         self.multi_modal_projector = LlavaMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
@@ -255,36 +225,23 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
             self, **kwargs: object) -> Optional[LlavaNextImageInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
-        image_features = kwargs.pop("image_features", None)
-
-        expected_input_type = self.vlm_config.image_input_type
-        ImageInputType = VisionLanguageConfig.ImageInputType
-
-        if expected_input_type == ImageInputType.PIXEL_VALUES:
-            if image_features is not None:
-                raise ValueError(
-                    "Expected pixel values but got image features")
-            if pixel_values is None:
-                return None
 
-            if not isinstance(pixel_values, torch.Tensor):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
+        if pixel_values is None or image_sizes is None:
+            return None
 
-            if not isinstance(image_sizes, torch.Tensor):
-                raise ValueError("Incorrect type of image sizes. "
-                                 f"Got type: {type(image_sizes)}")
+        if not isinstance(pixel_values, torch.Tensor):
+            raise ValueError("Incorrect type of pixel values. "
+                             f"Got type: {type(pixel_values)}")
 
-            return LlavaNextImagePixelInputs(
-                type="pixel_values",
-                data=self._validate_image_pixels(pixel_values),
-                image_sizes=self._validate_image_sizes(image_sizes),
-            )
+        if not isinstance(image_sizes, torch.Tensor):
+            raise ValueError("Incorrect type of image sizes. "
+                             f"Got type: {type(image_sizes)}")
 
-        assert expected_input_type != ImageInputType.IMAGE_FEATURES, (
-            "Failed to validate this at initialization time")
-
-        return None
+        return LlavaNextImagePixelInputs(
+            type="pixel_values",
+            data=self._validate_image_pixels(pixel_values),
+            image_sizes=self._validate_image_sizes(image_sizes),
+        )
 
     def _select_image_features(self, image_features: torch.Tensor, *,
                                strategy: str) -> torch.Tensor:
@@ -391,11 +348,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
 
     def _process_image_input(
             self, image_input: LlavaNextImageInputs) -> torch.Tensor:
-        if image_input["type"] == "pixel_values":
-            assert self.vision_tower is not None
-            image_features = self._process_image_pixels(image_input)
-        else:
-            image_features = image_input["data"]
+        assert self.vision_tower is not None
+        image_features = self._process_image_pixels(image_input)
 
         patch_embeddings = self.multi_modal_projector(image_features)
 
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index bc3d3f0fb..a16f7f0ea 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -35,10 +35,9 @@ from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import ImagePixelData
 from vllm.sequence import SamplerOutput
 
-from .clip import dummy_pixel_data_for_clip, dummy_seq_data_for_clip
+from .clip import dummy_image_for_clip, dummy_seq_data_for_clip
 from .interfaces import SupportsVision
 
 logger = init_logger(__name__)
@@ -286,7 +285,7 @@ def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
         image_token_id=32044,
         image_feature_size_override=image_feature_size,
     )
-    mm_data = dummy_pixel_data_for_clip(
+    mm_data = dummy_image_for_clip(
         CLIP_VIT_LARGE_PATCH14_336_CONFIG,
         image_width_override=dummy_width,
         image_height_override=dummy_height,
@@ -331,8 +330,7 @@ def _calc_hd_transform_size(*, width: int, height: int, hd_num: int = 16):
 
 
 def _image_processor(ctx: InputContext,
-                     data: ImagePixelData) -> Dict[str, torch.Tensor]:
-    image = data.image
+                     image: object) -> Dict[str, torch.Tensor]:
 
     if isinstance(image, Image.Image):
         # Temporary patch before dynamic number of image tokens is supported
@@ -343,13 +341,14 @@ def _image_processor(ctx: InputContext,
                 "Dynamic image shape is currently not supported. "
                 "Resizing input image to (%d, %d).", w, h)
 
-            data.image = image.resize((w, h))
+            image = image.resize((w, h))
 
-    return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \
-            ._default_input_mapper(ctx, data)
+        return MULTIMODAL_REGISTRY._get_plugin("image") \
+                ._default_input_mapper(ctx, image)
+    raise TypeError(f"Invalid type for 'image': {type(image)}")
 
 
-@MULTIMODAL_REGISTRY.register_image_pixel_input_mapper(_image_processor)
+@MULTIMODAL_REGISTRY.register_image_input_mapper(_image_processor)
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi3v)
 class Phi3VForCausalLM(nn.Module, SupportsVision):
 
@@ -375,14 +374,6 @@ class Phi3VForCausalLM(nn.Module, SupportsVision):
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
 
-        expected_input_type = self.vlm_config.image_input_type
-        ImageInputType = VisionLanguageConfig.ImageInputType
-
-        if expected_input_type != ImageInputType.PIXEL_VALUES:
-            raise ValueError(
-                f"Unexpected image input type: {expected_input_type}."
-                "Phi3v only support pixel_values input currently.")
-
         if pixel_values is not None and image_sizes is not None:
             return Phi3VImagePixelInputs(type="pixel_values",
                                          data=pixel_values,
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 20bd87b8c..256eadd2d 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,4 +1,4 @@
-from .base import MultiModalData, MultiModalPlugin
+from .base import MultiModalDataDict, MultiModalPlugin
 from .registry import MultiModalRegistry
 
 MULTIMODAL_REGISTRY = MultiModalRegistry()
@@ -11,6 +11,8 @@ See also:
 """
 
 __all__ = [
-    "MultiModalData", "MultiModalPlugin", "MULTIMODAL_REGISTRY",
-    "MultiModalRegistry"
+    "MultiModalPlugin",
+    "MULTIMODAL_REGISTRY",
+    "MultiModalRegistry",
+    "MultiModalDataDict",
 ]
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index d47cdd559..558cd1175 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
-from typing import (TYPE_CHECKING, Callable, Dict, Generic, Optional, Type,
-                    TypeVar)
+from typing import (TYPE_CHECKING, Any, Callable, Dict, Optional, Type,
+                    TypedDict, TypeVar, Union)
 
 from vllm.config import ModelConfig
 from vllm.inputs import InputContext
@@ -8,38 +8,35 @@ from vllm.logger import init_logger
 
 if TYPE_CHECKING:
     import torch
+    from PIL import Image
     from torch import nn
 
 logger = init_logger(__name__)
 
+N = TypeVar("N", bound=Type["nn.Module"])
 
-class MultiModalData:
-    """
-    Base class that contains multi-modal data.
-
-    To add a new modality, add a new file under ``multimodal`` directory.
 
-    In this new file, subclass :class:`~MultiModalData` and
-    :class:`~MultiModalPlugin`.
+class MultiModalDataBuiltins(TypedDict, total=False):
+    image: "Image.Image"
 
-    Finally, register the new plugin to
-    :const:`vllm.multimodal.MULTIMODAL_REGISTRY`.
-    This enables models to call :meth:`MultiModalRegistry.map_input` for
-    the new modality.
-    """
-    pass
 
+MultiModalDataDict = Union[MultiModalDataBuiltins, Dict[str, Any]]
+"""
+A dictionary containing an item for each modality type to input.
 
-D = TypeVar("D", bound=MultiModalData)
-N = TypeVar("N", bound=Type["nn.Module"])
+The data belonging to each modality is converted into keyword arguments 
+to the model by the corresponding mapper. By default, the mapper of 
+the corresponding plugin with the same modality key is applied.
+"""
 
-MultiModalInputMapper = Callable[[InputContext, D], Dict[str, "torch.Tensor"]]
+MultiModalInputMapper = Callable[[InputContext, object], Dict[str,
+                                                              "torch.Tensor"]]
 """Return a dictionary to be passed as keyword arguments to
 :meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers
 and processors in HuggingFace Transformers."""
 
 
-class MultiModalPlugin(ABC, Generic[D]):
+class MultiModalPlugin(ABC):
     """
     Base class that defines data processing logic for a specific modality.
 
@@ -52,19 +49,18 @@ class MultiModalPlugin(ABC, Generic[D]):
 
     def __init__(self) -> None:
         self._input_mappers: Dict[Type["nn.Module"],
-                                  MultiModalInputMapper[D]] = {}
+                                  MultiModalInputMapper] = {}
 
     @abstractmethod
-    def get_data_type(self) -> Type[D]:
+    def get_data_key(self) -> str:
         """
-        Get the modality (subclass of :class:`~MultiModalData`) served by
-        this plugin.
+        Get the data key corresponding to the modality.
         """
         raise NotImplementedError
 
     @abstractmethod
     def _default_input_mapper(self, ctx: InputContext,
-                              data: D) -> Dict[str, "torch.Tensor"]:
+                              data: object) -> Dict[str, "torch.Tensor"]:
         """Return a dictionary to be passed as keyword arguments to
         :meth:`~torch.nn.Module.forward`. This is similar in concept to
         tokenizers and processors in HuggingFace Transformers.
@@ -73,11 +69,10 @@ class MultiModalPlugin(ABC, Generic[D]):
 
     def register_input_mapper(
         self,
-        mapper: Optional[MultiModalInputMapper[D]] = None,
+        mapper: Optional[MultiModalInputMapper] = None,
     ):
         """
         Register an input mapper to a model class.
-        
         When the model receives input data that matches the modality served by
         this plugin (see :meth:`get_data_type`), the provided function is
         invoked to transform the data into a dictionary of model inputs.
@@ -102,11 +97,13 @@ class MultiModalPlugin(ABC, Generic[D]):
         return wrapper
 
     def map_input(self, model_config: ModelConfig,
-                  data: D) -> Dict[str, "torch.Tensor"]:
+                  data: object) -> Dict[str, "torch.Tensor"]:
         """
-        Apply an input mapper to a :class:`~MultiModalData` instance passed
+        Apply an input mapper to a data passed
         to the model, transforming the data into a dictionary of model inputs.
 
+        If the data is not something that the mapper expects, throws TypeError.
+
         The model is identified by ``model_config``.
 
         TODO: Add guide [ref: PR #5276]
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index a9691575c..a0b4206bf 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -1,5 +1,5 @@
 from functools import lru_cache
-from typing import Dict, Type, Union
+from typing import Dict
 
 import torch
 from PIL import Image
@@ -9,105 +9,36 @@ from vllm.inputs.registry import InputContext
 from vllm.logger import init_logger
 from vllm.transformers_utils.image_processor import get_image_processor
 
-from .base import MultiModalData, MultiModalPlugin
+from .base import MultiModalPlugin
 
 logger = init_logger(__name__)
 
 cached_get_image_processor = lru_cache(get_image_processor)
 
 
-class ImagePixelData(MultiModalData):
-    """
-    The pixel data of an image. Can be one of:
+class ImagePlugin(MultiModalPlugin):
 
-    - :class:`PIL.Image.Image`: An image object. Requires that a HuggingFace
-      processor is available to the model.
-    - :class:`torch.Tensor`: The raw pixel data which is passed to the model
-      without additional pre-processing.
-    """
-
-    def __init__(self, image: Union[Image.Image, torch.Tensor]) -> None:
-        if isinstance(image, Image.Image):
-            # So that this class can be created inside the Image context manager
-            image.load()
-
-        self.image = image
-
-    def __repr__(self) -> str:
-        image = self.image
-        if isinstance(image, Image.Image):
-            return f"{type(self).__name__}(image={image})"
-
-        return (f"{type(self).__name__}(image=torch.Tensor(shape="
-                f"{image.shape}, dtype={image.dtype}))")
-
-
-class ImagePixelPlugin(MultiModalPlugin[ImagePixelData]):
-
-    def get_data_type(self) -> Type[ImagePixelData]:
-        return ImagePixelData
+    def get_data_key(self) -> str:
+        return "image"
 
     def _get_hf_image_processor(self, model_config: ModelConfig):
-        vlm_config = model_config.multimodal_config
-        if vlm_config is None or vlm_config.image_processor is None:
-            return None
-
         return cached_get_image_processor(
-            vlm_config.image_processor,
-            trust_remote_code=model_config.trust_remote_code,
-            revision=vlm_config.image_processor_revision,
-        )
+            model_config.model,
+            trust_remote_code=model_config.trust_remote_code)
 
     def _default_input_mapper(self, ctx: InputContext,
-                              data: ImagePixelData) -> Dict[str, torch.Tensor]:
+                              data: object) -> Dict[str, torch.Tensor]:
         model_config = ctx.model_config
-        image = data.image
-
-        if isinstance(image, Image.Image):
+        if isinstance(data, Image.Image):
             image_processor = self._get_hf_image_processor(model_config)
             if image_processor is None:
                 raise RuntimeError("No HuggingFace processor is available"
                                    "to process the image object")
             try:
-                return image_processor.preprocess(image, return_tensors="pt") \
+                return image_processor.preprocess(data, return_tensors="pt") \
                     .to(model_config.dtype).data
             except Exception:
-                logger.error("Failed to process image (%s)", image)
+                logger.error("Failed to process image (%s)", data)
                 raise
-        elif isinstance(image, torch.Tensor):
-            pixel_values = image.to(model_config.dtype)
-
-            return {"pixel_values": pixel_values}
-
-        raise TypeError(f"Invalid image type: {type(image)}")
-
-
-class ImageFeatureData(MultiModalData):
-    """
-    The feature vector of an image, passed directly to the model.
-
-    This should be the output of the vision tower.
-    """
-
-    def __init__(self, image_features: torch.Tensor) -> None:
-        self.image_features = image_features
-
-    def __repr__(self) -> str:
-        image_features = self.image_features
-
-        return (f"{type(self).__name__}(image_features=torch.Tensor(shape="
-                f"{image_features.shape}, dtype={image_features.dtype}))")
-
-
-class ImageFeaturePlugin(MultiModalPlugin[ImageFeatureData]):
-
-    def get_data_type(self) -> Type[ImageFeatureData]:
-        return ImageFeatureData
-
-    def _default_input_mapper(
-            self, ctx: InputContext,
-            data: ImageFeatureData) -> Dict[str, torch.Tensor]:
-        model_config = ctx.model_config
-        image_features = data.image_features.to(model_config.dtype)
 
-        return {"image_features": image_features}
+        raise TypeError(f"Invalid type for 'image': {type(data)}")
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index abc88e4f9..a09a80f89 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,18 +1,16 @@
 import functools
-from typing import Any, Optional, Sequence, Type, TypeVar
+from typing import Optional, Sequence, Type, TypeVar
 
 from torch import nn
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
 
-from .base import MultiModalData, MultiModalInputMapper, MultiModalPlugin
-from .image import (ImageFeatureData, ImageFeaturePlugin, ImagePixelData,
-                    ImagePixelPlugin)
+from .base import MultiModalDataDict, MultiModalInputMapper, MultiModalPlugin
+from .image import ImagePlugin
 
 logger = init_logger(__name__)
 
-D = TypeVar("D", bound=MultiModalData)
 N = TypeVar("N", bound=Type[nn.Module])
 
 
@@ -20,81 +18,91 @@ class MultiModalRegistry:
     """
     A registry to dispatch data processing
     according to its modality and the target model.
+
+    The registry handles both external and internal data input.
     """
 
-    DEFAULT_PLUGINS = (ImageFeaturePlugin(), ImagePixelPlugin())
+    DEFAULT_PLUGINS = (ImagePlugin(), )
 
     def __init__(
-        self,
-        *,
-        plugins: Sequence[MultiModalPlugin[Any]] = DEFAULT_PLUGINS,
-    ) -> None:
-        self._plugins_by_data_type = {p.get_data_type(): p for p in plugins}
+            self,
+            *,
+            plugins: Sequence[MultiModalPlugin] = DEFAULT_PLUGINS) -> None:
+        self._plugins = {p.get_data_key(): p for p in plugins}
 
-    def register_plugin(self, plugin: MultiModalPlugin[Any]) -> None:
-        data_type = plugin.get_data_type()
+    def register_plugin(self, plugin: MultiModalPlugin) -> None:
+        data_type_key = plugin.get_data_key()
 
-        if data_type in self._plugins_by_data_type:
+        if data_type_key in self._plugins:
             logger.warning(
                 "A plugin is already registered for data type %s, "
-                "and will be overwritten by the new plugin %s.", data_type,
+                "and will be overwritten by the new plugin %s.", data_type_key,
                 plugin)
 
-        self._plugins_by_data_type[data_type] = plugin
+        self._plugins[data_type_key] = plugin
 
-    def _get_plugin_for_data_type(self, data_type: Type[MultiModalData]):
-        for typ in data_type.mro():
-            plugin = self._plugins_by_data_type.get(typ)
-            if plugin is not None:
-                return plugin
+    def _get_plugin(self, data_type_key: str):
+        plugin = self._plugins.get(data_type_key)
+        if plugin is not None:
+            return plugin
 
-        msg = f"Unknown multi-modal data type: {data_type}"
+        msg = f"Unknown multi-modal data type: {data_type_key}"
         raise NotImplementedError(msg)
 
-    def register_input_mapper(
+    def register_image_input_mapper(
         self,
-        data_type: Type[D],
-        mapper: Optional[MultiModalInputMapper[D]] = None,
+        mapper: Optional[MultiModalInputMapper] = None,
     ):
         """
-        Register an input mapper for a specific modality to a model class.
+        Register an input mapper for image data to a model class.
 
         See :meth:`MultiModalPlugin.register_input_mapper` for more details.
         """
-        return self._get_plugin_for_data_type(data_type) \
-            .register_input_mapper(mapper)
+        return self.register_input_mapper("image", mapper)
+
+    def _process_input(self, key: str, value: object,
+                       model_config: ModelConfig):
+        plugin = self._plugins.get(key)
+        if plugin:
+            return plugin.map_input(model_config, value)
+        msg = f"Unknown multi-modal data type: {key}"
+        raise NotImplementedError(msg)
 
-    def register_image_pixel_input_mapper(
+    def register_input_mapper(
         self,
-        mapper: Optional[MultiModalInputMapper[ImagePixelData]] = None,
+        data_type: str,
+        mapper: Optional[MultiModalInputMapper] = None,
     ):
         """
-        Register an input mapper for image pixel data to a model class.
+        Register an input mapper for a specific modality to a model class.
 
         See :meth:`MultiModalPlugin.register_input_mapper` for more details.
         """
-        return self.register_input_mapper(ImagePixelData, mapper)
-
-    def register_image_feature_input_mapper(
-        self,
-        mapper: Optional[MultiModalInputMapper[ImageFeatureData]] = None,
-    ):
+        plugin = self._plugins.get(data_type)
+        if not plugin:
+            msg = f"Unknown multi-modal data type: {data_type}"
+            raise NotImplementedError(msg)
+        return plugin.register_input_mapper(mapper)
+
+    def register_image_input(self,
+                             mapper: Optional[MultiModalInputMapper] = None):
         """
-        Register an input mapper for image feature data to a model class.
+        Register an input mapper for image pixel data to a model class.
 
         See :meth:`MultiModalPlugin.register_input_mapper` for more details.
         """
-        return self.register_input_mapper(ImageFeatureData, mapper)
+        return self.register_input_mapper("image", mapper)
 
-    def map_input(self, model_config: ModelConfig, data: MultiModalData):
+    def map_input(self, model_config: ModelConfig, data: MultiModalDataDict):
         """
-        Apply an input mapper to a :class:`~MultiModalData` instance passed
-        to the model.
+        Apply an input mapper to the data passed to the model.
         
         See :meth:`MultiModalPlugin.map_input` for more details.
         """
-        return self._get_plugin_for_data_type(type(data)) \
-            .map_input(model_config, data)
+        result_list = [
+            self._process_input(k, v, model_config) for k, v in data.items()
+        ]
+        return {k: v for d in result_list for k, v in d.items()}
 
     def create_input_mapper(self, model_config: ModelConfig):
         """
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 0cf2c057f..321b51e5a 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -8,7 +8,7 @@ from PIL import Image
 
 from vllm.config import ModelConfig
 from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT
-from vllm.multimodal.image import ImagePixelData
+from vllm.multimodal.base import MultiModalDataDict
 
 
 class ImageFetchAiohttp:
@@ -53,14 +53,10 @@ class ImageFetchAiohttp:
                 "Invalid 'image_url': A valid 'image_url' must start "
                 "with either 'data:image' or 'http'.")
 
+        image.load()
         return image
 
 
-async def async_get_and_parse_image(image_url: str) -> ImagePixelData:
-    with await ImageFetchAiohttp.fetch_image(image_url) as image:
-        return ImagePixelData(image)
-
-
 def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str:
     """Encode a pillow image to base64 format."""
 
@@ -91,3 +87,8 @@ def get_full_image_text_prompt(image_prompt: str, text_prompt: str,
         raise ValueError(
             f"Unsupported model type: {config.hf_config.model_type}")
     return full_prompt
+
+
+async def async_get_and_parse_image(image_url: str) -> MultiModalDataDict:
+    image = await ImageFetchAiohttp.fetch_image(image_url)
+    return {"image": image}
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 21c558d44..3e7c31b8c 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -14,7 +14,7 @@ from vllm.sampling_params import SamplingParams
 
 if TYPE_CHECKING:
     from vllm.inputs import LLMInputs
-    from vllm.multimodal import MultiModalData
+    from vllm.multimodal import MultiModalDataDict
     from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
 
 
@@ -280,8 +280,8 @@ class Sequence:
         return self.inputs["prompt_token_ids"]
 
     @property
-    def multi_modal_data(self) -> Optional["MultiModalData"]:
-        return self.inputs.get("multi_modal_data")
+    def multi_modal_data(self) -> "MultiModalDataDict":
+        return self.inputs.get("multi_modal_data") or {}
 
     @property
     def lora_int_id(self) -> int:
@@ -457,7 +457,7 @@ class SequenceGroup:
         return next(iter(self.seqs_dict.values())).prompt_token_ids
 
     @property
-    def multi_modal_data(self) -> Optional["MultiModalData"]:
+    def multi_modal_data(self) -> Optional["MultiModalDataDict"]:
         # All sequences in the group should have the same multi-modal data.
         # We use the multi-modal data of an arbitrary sequence.
         return next(iter(self.seqs_dict.values())).multi_modal_data
@@ -639,7 +639,7 @@ class SequenceGroupMetadata:
         lora_request: Optional[LoRARequest] = None,
         computed_block_nums: Optional[List[int]] = None,
         state: Optional[SequenceGroupState] = None,
-        multi_modal_data: Optional["MultiModalData"] = None,
+        multi_modal_data: Optional["MultiModalDataDict"] = None,
         encoder_seq_data: Optional[SequenceData] = None,
         cross_block_table: Optional[List[int]] = None,
     ) -> None:
diff --git a/vllm/transformers_utils/image_processor.py b/vllm/transformers_utils/image_processor.py
index 2bb5215d4..354dcb526 100644
--- a/vllm/transformers_utils/image_processor.py
+++ b/vllm/transformers_utils/image_processor.py
@@ -1,5 +1,3 @@
-from typing import Optional
-
 from transformers import AutoImageProcessor
 from transformers.image_processing_utils import BaseImageProcessor
 
@@ -12,7 +10,6 @@ def get_image_processor(
     processor_name: str,
     *args,
     trust_remote_code: bool = False,
-    revision: Optional[str] = None,
     **kwargs,
 ) -> BaseImageProcessor:
     """Gets an image processor for the given model name via HuggingFace."""
@@ -21,7 +18,6 @@ def get_image_processor(
             processor_name,
             *args,
             trust_remote_code=trust_remote_code,
-            revision=revision,
             **kwargs)
     except ValueError as e:
         # If the error pertains to the processor class not existing or not
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 942063677..0b20d5010 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -504,7 +504,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                      is not None else 1))
 
                 mm_data = seq_group_metadata.multi_modal_data
-                if mm_data is not None:
+                if mm_data:
                     # Process multi-modal data
                     mm_kwargs = self.multi_modal_input_mapper(mm_data)
                     for k, v in mm_kwargs.items():
-- 
GitLab


From 31354e563f077888ff1efb8ba4dad530cbdadd32 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 18:53:16 +0800
Subject: [PATCH 237/376] [Doc] Reinstate doc dependencies (#6061)

---
 docs/requirements-docs.txt | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index db076b2d8..b35aa9ba4 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -3,3 +3,11 @@ sphinx-book-theme==1.0.1
 sphinx-copybutton==0.5.2
 myst-parser==2.0.0
 sphinx-argparse
+
+# packages to install to build the documentation
+pydantic
+-f https://download.pytorch.org/whl/cpu
+torch
+py-cpuinfo
+transformers
+openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
-- 
GitLab


From 15aba081f33e6d048422df6dcdb94301d08d13e6 Mon Sep 17 00:00:00 2001
From: Sirej Dua <sirejdua@gmail.com>
Date: Tue, 2 Jul 2024 07:20:29 -0700
Subject: [PATCH 238/376] [Speculative Decoding] MLPSpeculator Tensor Parallel
 support (1/2) (#6050)

Co-authored-by: Sirej Dua <sirej.dua@databricks.com>
Co-authored-by: Sirej Dua <Sirej Dua>
---
 .../e2e/test_integration_dist_tp2.py          | 36 ++++++++++++-------
 vllm/config.py                                |  6 ----
 vllm/spec_decode/spec_decode_worker.py        | 18 ++++++----
 3 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py
index 5534b80c0..859d4234c 100644
--- a/tests/spec_decode/e2e/test_integration_dist_tp2.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py
@@ -70,10 +70,6 @@ def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator,
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
-        # Use a small model for a fast test.
-        # Note this is repeated in the test body; to initialize a tokenizer.
-        "model": "JackFram/llama-68m",
-
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
 
@@ -88,15 +84,31 @@ def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator,
         # second run of the test to fail with internal NCCL error.
         "use_async": True,
     }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_model": "JackFram/llama-68m",
-        "num_speculative_tokens": 5,
-        "speculative_draft_tensor_parallel_size": 1,
-    },
-])
+@pytest.mark.parametrize(
+    "per_test_common_llm_kwargs, test_llm_kwargs",
+    [
+        (
+            {
+                # Use a small model for a fast test.
+                # Note this is repeated in the test body; to initialize a
+                # tokenizer.
+                "model": "JackFram/llama-68m",
+            },
+            {
+                "speculative_model": "JackFram/llama-68m",
+                "num_speculative_tokens": 5,
+                "speculative_draft_tensor_parallel_size": 1,
+            }),
+        ({
+            "model": "ibm-granite/granite-3b-code-instruct",
+        }, {
+            "speculative_model":
+            "ibm-granite/granite-3b-code-instruct-accelerator",
+            "num_speculative_tokens": 5,
+            "speculative_draft_tensor_parallel_size": 1,
+        })
+    ])
 @pytest.mark.parametrize("batch_size", [2])
 @pytest.mark.parametrize("seed", [1])
 def test_draft_model_tp_lt_target_model_tp2(test_llm_generator,
diff --git a/vllm/config.py b/vllm/config.py
index b919b212d..66338cb0d 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -957,12 +957,6 @@ class SpeculativeConfig:
             )
 
             draft_hf_config = draft_model_config.hf_config
-            if (draft_hf_config.model_type == "mlp_speculator"
-                    and target_parallel_config.world_size != 1):
-                # MLPSpeculator TP support will be added very soon
-                raise ValueError(
-                    "Speculative decoding with mlp_speculator models does not "
-                    "yet support distributed inferencing (TP > 1).")
 
             if (num_speculative_tokens is not None
                     and hasattr(draft_hf_config, "num_lookahead_tokens")):
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index ca470bee2..43ce987de 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -113,24 +113,28 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
             draft_worker_kwargs.pop("ngram_prompt_lookup_min"))
 
         disable_bonus_tokens = True
+
         if ngram_prompt_lookup_max > 0:
             disable_bonus_tokens = False
             proposer_worker = NGramWorker(**draft_worker_kwargs)
             proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min,
                                                   ngram_prompt_lookup_max)
-        elif draft_worker_kwargs[
-                "model_config"].hf_config.model_type == "mlp_speculator":
-            proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs)
-            disable_bonus_tokens = False
         else:
             draft_parallel_config: ParallelConfig = draft_worker_kwargs[
                 'parallel_config']
             draft_tp = draft_parallel_config.tensor_parallel_size
             target_tp = scorer_worker.parallel_config.tensor_parallel_size
 
-            if draft_tp == 1:
-                draft_worker_kwargs["model_runner_cls"] = TP1DraftModelRunner
-            proposer_worker = MultiStepWorker(**draft_worker_kwargs)
+            if draft_worker_kwargs[
+                    "model_config"].hf_config.model_type == "mlp_speculator":
+                disable_bonus_tokens = False
+                proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs)
+            else:
+                if draft_tp == 1:
+                    draft_worker_kwargs[
+                        "model_runner_cls"] = TP1DraftModelRunner
+                proposer_worker = MultiStepWorker(**draft_worker_kwargs)
+
             proposer_worker = SmallerTpProposerWorker.maybe_wrap_worker(
                 proposer_worker, draft_tp, target_tp)
 
-- 
GitLab


From c5832d2ae9431a1672d547c232ec46b1a9051ff0 Mon Sep 17 00:00:00 2001
From: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com>
Date: Tue, 2 Jul 2024 10:58:08 -0700
Subject: [PATCH 239/376] [Core] Pipeline Parallel Support (#4412)

Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
---
 .buildkite/test-pipeline.yaml                 |  10 +
 tests/async_engine/test_async_llm_engine.py   |  14 +-
 tests/async_engine/test_openapi_server_ray.py |   4 +-
 tests/basic_correctness/test_preemption.py    |  24 +-
 tests/distributed/test_comm_ops.py            |  20 +-
 tests/distributed/test_pipeline_parallel.py   | 149 ++++++++
 .../output_processor/test_multi_step.py       |   8 +-
 tests/entrypoints/openai/test_chat.py         |   4 +-
 tests/entrypoints/openai/test_completion.py   |   4 +-
 tests/entrypoints/openai/test_embedding.py    |   4 +-
 tests/entrypoints/openai/test_models.py       |   4 +-
 tests/entrypoints/openai/test_vision.py       |   4 +-
 tests/spec_decode/utils.py                    |   6 +-
 tests/tensorizer_loader/test_tensorizer.py    |   4 +-
 tests/utils.py                                |  16 +-
 tests/worker/test_swap.py                     |   4 +-
 vllm/config.py                                |  25 +-
 vllm/core/block_manager_v1.py                 |   3 +
 vllm/core/block_manager_v2.py                 |   3 +
 vllm/core/scheduler.py                        |  13 +-
 vllm/distributed/parallel_state.py            |  50 +--
 vllm/distributed/utils.py                     |  11 +-
 vllm/engine/async_llm_engine.py               |  79 ++++-
 vllm/engine/llm_engine.py                     |  65 +++-
 vllm/engine/output_processor/interfaces.py    |   2 +-
 vllm/engine/output_processor/multi_step.py    |   5 +-
 vllm/engine/output_processor/single_step.py   |  20 +-
 vllm/executor/distributed_gpu_executor.py     |  12 +-
 vllm/executor/executor_base.py                |  25 ++
 vllm/executor/gpu_executor.py                 |   3 +-
 vllm/executor/multiproc_gpu_executor.py       |  12 +-
 vllm/executor/ray_gpu_executor.py             |  71 +++-
 vllm/model_executor/models/arctic.py          |   3 +-
 vllm/model_executor/models/baichuan.py        |   3 +-
 vllm/model_executor/models/bloom.py           |   3 +-
 vllm/model_executor/models/chatglm.py         |   3 +-
 vllm/model_executor/models/commandr.py        |   3 +-
 vllm/model_executor/models/dbrx.py            |   3 +-
 vllm/model_executor/models/deepseek.py        |   3 +-
 vllm/model_executor/models/deepseek_v2.py     |   3 +-
 vllm/model_executor/models/falcon.py          |   3 +-
 vllm/model_executor/models/gemma.py           |   3 +-
 vllm/model_executor/models/gemma2.py          |   3 +-
 vllm/model_executor/models/gpt2.py            |  88 +++--
 vllm/model_executor/models/gpt_bigcode.py     |   3 +-
 vllm/model_executor/models/gpt_j.py           |   3 +-
 vllm/model_executor/models/gpt_neox.py        |   3 +-
 vllm/model_executor/models/internlm2.py       |   3 +-
 vllm/model_executor/models/jais.py            |   3 +-
 vllm/model_executor/models/llama.py           | 101 ++++--
 vllm/model_executor/models/llava.py           |   4 +-
 vllm/model_executor/models/llava_next.py      |   4 +-
 vllm/model_executor/models/minicpm.py         |   3 +-
 vllm/model_executor/models/mixtral.py         |   3 +-
 vllm/model_executor/models/mixtral_quant.py   |   3 +-
 vllm/model_executor/models/mpt.py             |   3 +-
 vllm/model_executor/models/olmo.py            |   3 +-
 vllm/model_executor/models/opt.py             |   3 +-
 vllm/model_executor/models/orion.py           |   3 +-
 vllm/model_executor/models/phi.py             |   3 +-
 vllm/model_executor/models/phi3_small.py      |   3 +-
 vllm/model_executor/models/phi3v.py           |  11 +-
 vllm/model_executor/models/qwen.py            |   3 +-
 vllm/model_executor/models/qwen2.py           |   3 +-
 vllm/model_executor/models/qwen2_moe.py       |   3 +-
 vllm/model_executor/models/stablelm.py        |   3 +-
 vllm/model_executor/models/starcoder2.py      |   3 +-
 vllm/model_executor/models/xverse.py          |   3 +-
 vllm/sequence.py                              |  31 ++
 vllm/spec_decode/draft_model_runner.py        |  15 +-
 vllm/worker/cache_engine.py                   |   4 +
 vllm/worker/cpu_model_runner.py               |   5 +-
 vllm/worker/cpu_worker.py                     |  38 +-
 vllm/worker/embedding_model_runner.py         |   9 +-
 vllm/worker/model_runner.py                   | 326 +++++++++++-------
 vllm/worker/model_runner_base.py              |   5 +-
 vllm/worker/neuron_model_runner.py            |   5 +-
 vllm/worker/neuron_worker.py                  |   2 +-
 vllm/worker/worker.py                         |  36 +-
 vllm/worker/worker_base.py                    |  40 ++-
 vllm/worker/xpu_model_runner.py               |   5 +-
 vllm/worker/xpu_worker.py                     |   4 +-
 82 files changed, 1100 insertions(+), 404 deletions(-)
 create mode 100644 tests/distributed/test_pipeline_parallel.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d96e3c6d1..d127278aa 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -74,6 +74,16 @@ steps:
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
 
+- label: Pipeline Parallelism Test
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  commands:
+  - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
+  - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
+  - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
+  - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
+
+
 - label: Engine Test
   mirror_hardwares: [amd]
   command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py
index 52d3394a9..aa2b6e222 100644
--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -5,6 +5,7 @@ import pytest
 import torch
 
 from vllm import SamplingParams
+from vllm.config import ParallelConfig
 from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
 
 from ..utils import wait_for_gpu_memory_to_clear
@@ -23,8 +24,11 @@ class MockEngine:
         self.add_request_calls = 0
         self.abort_request_calls = 0
         self.request_id = None
+        # Ugly, remove dependency when possible
+        self.parallel_config = ParallelConfig(1, 1, False)
 
-    async def step_async(self):
+    async def step_async(self, virtual_engine):
+        # PP size is 1, ignore virtual engine
         self.step_calls += 1
         return [RequestOutput(
             request_id=self.request_id)] if self.request_id else []
@@ -32,6 +36,9 @@ class MockEngine:
     async def process_model_inputs_async(self, *args, **kwargs):
         pass
 
+    async def stop_remote_worker_execution_loop_async(self):
+        pass
+
     def generate(self, request_id):
         self.request_id = request_id
 
@@ -41,6 +48,7 @@ class MockEngine:
     def add_request(self, **kwargs):
         del kwargs  # Unused
         self.add_request_calls += 1
+        print(f'Request calls: {self.add_request_calls}')
 
     async def add_request_async(self, **kwargs):
         self.add_request_calls += 1
@@ -53,6 +61,9 @@ class MockEngine:
     def has_unfinished_requests(self):
         return self.request_id is not None
 
+    def has_unfinished_requests_for_virtual_engine(self, virtual_engine):
+        return self.request_id is not None
+
 
 class MockAsyncLLMEngine(AsyncLLMEngine):
 
@@ -76,6 +87,7 @@ async def test_new_requests_event():
     engine.engine.generate("2")
     await asyncio.sleep(0)
     await asyncio.sleep(0)
+    await asyncio.sleep(0)
     assert engine.engine.add_request_calls == 2
     assert engine.engine.step_calls >= 2
     await asyncio.sleep(0.001)
diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py
index 332937b87..cc05d79e5 100644
--- a/tests/async_engine/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server_ray.py
@@ -4,7 +4,7 @@ import pytest
 # and debugging.
 import ray
 
-from ..utils import RemoteOpenAIServer
+from ..utils import VLLM_PATH, RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "facebook/opt-125m"
@@ -12,7 +12,7 @@ MODEL_NAME = "facebook/opt-125m"
 
 @pytest.fixture(scope="module")
 def ray_ctx():
-    ray.init()
+    ray.init(runtime_env={"working_dir": VLLM_PATH})
     yield
     ray.shutdown()
 
diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py
index d60cc95d7..7aed0d5e1 100644
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -56,8 +56,8 @@ def test_chunked_prefill_recompute(
             max_num_seqs=max_num_seqs,
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
 
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]
@@ -91,10 +91,10 @@ def test_preemption(
             disable_log_stats=False,
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
         total_preemption = (
-            vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
+            vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
 
     check_outputs_equal(
         outputs_0_lst=hf_outputs,
@@ -147,10 +147,10 @@ def test_swap(
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_beam_search(example_prompts,
                                                        beam_width, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
         total_preemption = (
-            vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
+            vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
 
     for i in range(len(example_prompts)):
         hf_output_ids, _ = hf_outputs[i]
@@ -214,8 +214,8 @@ def test_swap_infeasible(
             example_prompts,
             sampling_params=sampling_params,
         )
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
 
     # Verify the request is ignored and not hang.
     assert req_outputs[0].outputs[0].finish_reason == "length"
@@ -252,8 +252,8 @@ def test_preemption_infeasible(
             sampling_params=sampling_params,
         )
 
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
 
     # Verify the request is ignored and not hang.
     for req_output in req_outputs:
diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py
index bf0f31df0..7302d4849 100644
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -32,7 +32,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
         (r + 1) for r in range(tp_size)
     ]
     expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
-    t = all_tensors[rank]
+    t = all_tensors[rank % tp_size]
     t = tensor_model_parallel_all_reduce(t)
     assert torch.allclose(t, expected)
 
@@ -60,7 +60,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
             for r in range(tp_size)
         ]
         expected = torch.cat(all_tensors, dim=all_gather_dimension)
-        t = all_tensors[rank]
+        t = all_tensors[rank % tp_size]
         t = tensor_model_parallel_all_gather(t, all_gather_dimension)
         assert torch.allclose(t, expected)
 
@@ -91,7 +91,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
         "f": torch.tensor([], dtype=torch.float32, device="cuda"),
     }
 
-    if rank == 0:
+    if (rank % tp_size) == 0:
         broadcast_tensor_dict(test_dict, src=0)
     else:
         recv_dict = broadcast_tensor_dict(src=0)
@@ -184,3 +184,17 @@ def test_multi_process_tensor_parallel(tp_size, test_target):
     "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
 def test_multi_process_pipeline_parallel(pp_size, test_target):
     multi_process_parallel(1, pp_size, test_target)
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 4,
+                    reason="Need at least 4 GPUs to run the test.")
+@pytest.mark.parametrize("tp_size", [2])
+@pytest.mark.parametrize("pp_size", [2])
+@pytest.mark.parametrize("test_target", [
+    send_recv_test_worker, send_recv_tensor_dict_test_worker,
+    all_reduce_test_worker, all_gather_test_worker,
+    broadcast_tensor_dict_test_worker
+])
+def test_multi_process_tensor_parallel_pipeline_parallel(
+        tp_size, pp_size, test_target):
+    multi_process_parallel(tp_size, pp_size, test_target)
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
new file mode 100644
index 000000000..6072a2dd7
--- /dev/null
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -0,0 +1,149 @@
+import os
+
+import openai  # use the official client for correctness check
+import pytest
+# using Ray for overall ease of process management, parallel requests,
+# and debugging.
+import ray
+
+from ..utils import VLLM_PATH, RemoteOpenAIServer
+
+# downloading lora to test lora requests
+
+# any model with a chat template should work here
+MODEL_NAME = "meta-llama/Meta-Llama-3-8B"
+EAGER_MODE = bool(int(os.getenv("EAGER_MODE", 0)))
+CHUNKED_PREFILL = bool(int(os.getenv("CHUNKED_PREFILL", 0)))
+TP_SIZE = int(os.getenv("TP_SIZE", 1))
+PP_SIZE = int(os.getenv("PP_SIZE", 1))
+
+pytestmark = pytest.mark.asyncio
+
+
+@pytest.fixture(scope="module")
+def ray_ctx():
+    ray.init(runtime_env={"working_dir": VLLM_PATH})
+    yield
+    ray.shutdown()
+
+
+@pytest.fixture(scope="module")
+def server(ray_ctx):
+    args = [
+        "--model",
+        MODEL_NAME,
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--pipeline-parallel-size",
+        str(PP_SIZE),
+        "--tensor-parallel-size",
+        str(TP_SIZE),
+        "--distributed-executor-backend",
+        "ray",
+    ]
+    if CHUNKED_PREFILL:
+        args += [
+            "--enable-chunked-prefill",
+        ]
+    if EAGER_MODE:
+        args += [
+            "--enforce-eager",
+        ]
+    return RemoteOpenAIServer(args, num_gpus=PP_SIZE * TP_SIZE)
+
+
+@pytest.fixture(scope="module")
+def client(server):
+    return server.get_async_client()
+
+
+async def test_check_models(server, client: openai.AsyncOpenAI):
+    models = await client.models.list()
+    models = models.data
+    served_model = models[0]
+    assert served_model.id == MODEL_NAME
+    assert all(model.root == MODEL_NAME for model in models)
+
+
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_single_completion(server, client: openai.AsyncOpenAI,
+                                 model_name: str):
+    completion = await client.completions.create(model=model_name,
+                                                 prompt="Hello, my name is",
+                                                 max_tokens=5,
+                                                 temperature=0.0)
+
+    assert completion.id is not None
+    assert completion.choices is not None and len(completion.choices) == 1
+    assert completion.choices[0].text is not None and len(
+        completion.choices[0].text) >= 5
+    assert completion.choices[0].finish_reason == "length"
+    assert completion.usage == openai.types.CompletionUsage(
+        completion_tokens=5, prompt_tokens=6, total_tokens=11)
+
+    # test using token IDs
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+    )
+    assert completion.choices[0].text is not None and len(
+        completion.choices[0].text) >= 5
+
+
+@pytest.mark.parametrize(
+    # just test 1 lora hereafter
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_batch_completions(server, client: openai.AsyncOpenAI,
+                                 model_name: str):
+    # test simple list
+    batch = await client.completions.create(
+        model=model_name,
+        prompt=["Hello, my name is", "Hello, my name is"],
+        max_tokens=5,
+        temperature=0.0,
+    )
+    assert len(batch.choices) == 2
+    assert batch.choices[0].text == batch.choices[1].text
+
+    # test n = 2
+    batch = await client.completions.create(
+        model=model_name,
+        prompt=["Hello, my name is", "Hello, my name is"],
+        n=2,
+        max_tokens=5,
+        temperature=0.0,
+        extra_body=dict(
+            # NOTE: this has to be true for n > 1 in vLLM, but not necessary
+            # for official client.
+            use_beam_search=True),
+    )
+    assert len(batch.choices) == 4
+    assert batch.choices[0].text != batch.choices[
+        1].text, "beam search should be different"
+    assert batch.choices[0].text == batch.choices[
+        2].text, "two copies of the same prompt should be the same"
+    assert batch.choices[1].text == batch.choices[
+        3].text, "two copies of the same prompt should be the same"
+
+    # test streaming
+    batch = await client.completions.create(
+        model=model_name,
+        prompt=["Hello, my name is", "Hello, my name is"],
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+    )
+    texts = [""] * 2
+    async for chunk in batch:
+        assert len(chunk.choices) == 1
+        choice = chunk.choices[0]
+        texts[choice.index] += choice.text
+    assert texts[0] == texts[1]
diff --git a/tests/engine/output_processor/test_multi_step.py b/tests/engine/output_processor/test_multi_step.py
index 4f32a6225..88f3fad4c 100644
--- a/tests/engine/output_processor/test_multi_step.py
+++ b/tests/engine/output_processor/test_multi_step.py
@@ -32,7 +32,7 @@ def test_appends_token_ids(num_new_tokens: int, seq_output_len: int):
 
     output_processor = MultiStepOutputProcessor(
         detokenizer=detokenizer,
-        scheduler=scheduler,
+        scheduler=[scheduler],
         seq_counter=seq_counter,
         get_tokenizer_for_seq=lambda _: mock_tokenizer(),
         stop_checker=stop_checker,
@@ -86,7 +86,7 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int,
 
     output_processor = MultiStepOutputProcessor(
         detokenizer=detokenizer,
-        scheduler=scheduler,
+        scheduler=[scheduler],
         seq_counter=seq_counter,
         get_tokenizer_for_seq=lambda _: mock_tokenizer(),
         stop_checker=stop_checker,
@@ -148,7 +148,7 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
 
     output_processor = MultiStepOutputProcessor(
         detokenizer=detokenizer,
-        scheduler=scheduler,
+        scheduler=[scheduler],
         seq_counter=seq_counter,
         get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
         stop_checker=stop_checker,
@@ -215,7 +215,7 @@ def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
 
     output_processor = MultiStepOutputProcessor(
         detokenizer=detokenizer,
-        scheduler=scheduler,
+        scheduler=[scheduler],
         seq_counter=seq_counter,
         get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
         stop_checker=stop_checker,
diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index f4c0af1ad..3e80214f2 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -14,7 +14,7 @@ import torch
 from huggingface_hub import snapshot_download
 from openai import BadRequestError
 
-from ...utils import RemoteOpenAIServer
+from ...utils import VLLM_PATH, RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -77,7 +77,7 @@ def zephyr_lora_files():
 
 @pytest.fixture(scope="module")
 def ray_ctx():
-    ray.init()
+    ray.init(runtime_env={"working_dir": VLLM_PATH})
     yield
     ray.shutdown()
 
diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index b05035713..4fe925495 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -16,7 +16,7 @@ from openai import BadRequestError
 
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from ...utils import RemoteOpenAIServer
+from ...utils import VLLM_PATH, RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -79,7 +79,7 @@ def zephyr_lora_files():
 
 @pytest.fixture(scope="module")
 def ray_ctx():
-    ray.init()
+    ray.init(runtime_env={"working_dir": VLLM_PATH})
     yield
     ray.shutdown()
 
diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py
index 7c7232dbc..f8aa1c914 100644
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -5,14 +5,14 @@ import openai
 import pytest
 import ray
 
-from ...utils import RemoteOpenAIServer
+from ...utils import VLLM_PATH, RemoteOpenAIServer
 
 EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
 
 
 @pytest.fixture(scope="module")
 def ray_ctx():
-    ray.init()
+    ray.init(runtime_env={"working_dir": VLLM_PATH})
     yield
     ray.shutdown()
 
diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py
index fddfd7550..914ef6e19 100644
--- a/tests/entrypoints/openai/test_models.py
+++ b/tests/entrypoints/openai/test_models.py
@@ -6,7 +6,7 @@ import ray
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
 
-from ...utils import RemoteOpenAIServer
+from ...utils import VLLM_PATH, RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -22,7 +22,7 @@ def zephyr_lora_files():
 
 @pytest.fixture(scope="module")
 def ray_ctx():
-    ray.init()
+    ray.init(runtime_env={"working_dir": VLLM_PATH})
     yield
     ray.shutdown()
 
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index a7f7fdae8..7200b94f8 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -24,13 +24,13 @@ TEST_IMAGE_URLS = [
 
 @pytest.fixture(scope="module")
 def ray_ctx():
-    ray.init()
+    ray.init(runtime_env={"working_dir": VLLM_PATH})
     yield
     ray.shutdown()
 
 
 @pytest.fixture(scope="module")
-def server():
+def server(ray_ctx):
     return RemoteOpenAIServer([
         "--model",
         MODEL_NAME,
diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py
index 68802f0b8..86148291a 100644
--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
@@ -54,9 +54,9 @@ def patch_execute_model_with_seeds(worker: Worker, rand_seeds: List[int]):
     return new_execute_model
 
 
-def zero_kv_cache(cache_engine: CacheEngine):
-    assert cache_engine.gpu_cache
-    for key_blocks, value_blocks in cache_engine.gpu_cache:
+def zero_kv_cache(cache_engine: List[CacheEngine]):
+    assert cache_engine[0].gpu_cache
+    for key_blocks, value_blocks in cache_engine[0].gpu_cache:
         key_blocks.zero_()
         value_blocks.zero_()
 
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index c8f86133f..b2ebcc15c 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -22,7 +22,7 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
                                                          tensorize_vllm_model)
 
 from ..conftest import VllmRunner, cleanup
-from ..utils import RemoteOpenAIServer
+from ..utils import VLLM_PATH, RemoteOpenAIServer
 
 # yapf conflicts with isort for this docstring
 
@@ -220,6 +220,8 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
         json.dumps(model_loader_extra_config),
     ]
 
+    ray.init(runtime_env={"working_dir": VLLM_PATH})
+
     server = RemoteOpenAIServer(openai_args)
     print("Server ready.")
 
diff --git a/tests/utils.py b/tests/utils.py
index 09107b5e7..ad4d097b0 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -49,7 +49,6 @@ class RemoteOpenAIServer:
     DUMMY_API_KEY = "token-abc123"  # vLLM's OpenAI server does not need API key
     MAX_SERVER_START_WAIT_S = 600  # wait for server to start for 60 seconds
 
-    @ray.remote(num_gpus=1)
     class _RemoteRunner:
 
         def __init__(self, cli_args: List[str], *, wait_url: str,
@@ -92,7 +91,11 @@ class RemoteOpenAIServer:
             if hasattr(self, "proc"):
                 self.proc.terminate()
 
-    def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None:
+    def __init__(self,
+                 cli_args: List[str],
+                 *,
+                 auto_port: bool = True,
+                 num_gpus: int = 1) -> None:
         if auto_port:
             if "-p" in cli_args or "--port" in cli_args:
                 raise ValueError("You have manually specified the port"
@@ -105,10 +108,11 @@ class RemoteOpenAIServer:
         self.host = str(args.host or 'localhost')
         self.port = int(args.port)
 
-        self._runner = self._RemoteRunner.remote(  # type: ignore
-            cli_args,
-            wait_url=self.url_for("health"),
-            wait_timeout=self.MAX_SERVER_START_WAIT_S)
+        self._runner = ray.remote(num_gpus=num_gpus)(
+            self._RemoteRunner).remote(
+                cli_args,
+                wait_url=self.url_for("health"),
+                wait_timeout=self.MAX_SERVER_START_WAIT_S)
 
         self._wait_until_ready()
 
diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py
index d941ffdb5..7aa439ba0 100644
--- a/tests/worker/test_swap.py
+++ b/tests/worker/test_swap.py
@@ -39,8 +39,8 @@ def test_swap() -> None:
         num_cpu_blocks=engine_config.cache_config.num_cpu_blocks)
 
     # Randomly initialize the cache.
-    gpu_cache = worker.cache_engine.gpu_cache
-    cpu_cache = worker.cache_engine.cpu_cache
+    gpu_cache = worker.cache_engine[0].gpu_cache
+    cpu_cache = worker.cache_engine[0].cpu_cache
     num_layers = len(gpu_cache)
     for i in range(num_layers):
         gpu_key_cache, gpu_value_cache = gpu_cache[i]
diff --git a/vllm/config.py b/vllm/config.py
index 66338cb0d..9a7e0ea7a 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -27,6 +27,17 @@ logger = init_logger(__name__)
 _GB = 1 << 30
 _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
 
+_PP_SUPPORTED_MODELS = [
+    "AquilaModel",
+    "AquilaForCausalLM",
+    "InternLMForCausalLM",
+    "LlamaForCausalLM",
+    "LLaMAForCausalLM",
+    "MistralForCausalLM",
+    "Phi3ForCausalLM",
+    "GPT2LMHeadModel",
+]
+
 
 class ModelConfig:
     """Configuration for the model.
@@ -258,6 +269,13 @@ class ModelConfig:
         total_num_hidden_layers = getattr(self.hf_text_config,
                                           "num_hidden_layers", 0)
         pipeline_parallel_size = parallel_config.pipeline_parallel_size
+        architectures = getattr(self.hf_config, "architectures", [])
+        if not all(arch in _PP_SUPPORTED_MODELS
+                   for arch in architectures) and pipeline_parallel_size > 1:
+            raise NotImplementedError(
+                "Pipeline parallelism is only supported for the following "
+                f" architectures: {_PP_SUPPORTED_MODELS}.")
+
         if total_num_hidden_layers % pipeline_parallel_size != 0:
             raise ValueError(
                 f"Total number of hidden layers ({total_num_hidden_layers}) "
@@ -665,9 +683,10 @@ class ParallelConfig:
         self._verify_args()
 
     def _verify_args(self) -> None:
-        if self.pipeline_parallel_size > 1:
-            raise NotImplementedError(
-                "Pipeline parallelism is not supported yet.")
+        if (self.pipeline_parallel_size > 1
+                and self.distributed_executor_backend == "mp"):
+            raise NotImplementedError("Pipeline parallelism is not supported "
+                                      "yet with multiprocessing.")
         if self.distributed_executor_backend not in ("ray", "mp", None):
             raise ValueError(
                 "Unrecognized distributed executor backend. Supported values "
diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index 995ea04a5..e29eba375 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -471,6 +471,9 @@ class BlockSpaceManagerV1(BlockSpaceManager):
     def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
         # NOTE: fork does not allocate a new physical block.
         # Thus, it is always safe from OOM.
+        if parent_seq.seq_id not in self.block_tables:
+            # Parent sequence has either been freed or never existed.
+            return
         src_block_table = self.block_tables[parent_seq.seq_id]
         self.block_tables[child_seq.seq_id] = src_block_table.copy()
         # When using a sliding window, blocks will be eventually reused.
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py
index 6a6eebc39..b48ea1b19 100644
--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
@@ -317,6 +317,9 @@ class BlockSpaceManagerV2(BlockSpaceManager):
             computed_seq_block_ids)  # type: ignore
 
     def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
+        if parent_seq.seq_id not in self.block_tables:
+            # Parent sequence has either been freed or never existed.
+            return
         src_block_table = self.block_tables[parent_seq.seq_id]
         self.block_tables[child_seq.seq_id] = src_block_table.fork()
 
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 48c34625c..5fb3b7814 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -256,6 +256,7 @@ class Scheduler:
         scheduler_config: SchedulerConfig,
         cache_config: CacheConfig,
         lora_config: Optional[LoRAConfig],
+        pipeline_parallel_size: int = 1,
     ) -> None:
         self.scheduler_config = scheduler_config
         self.cache_config = cache_config
@@ -273,11 +274,19 @@ class Scheduler:
         BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class(
             version)
 
+        num_gpu_blocks = cache_config.num_gpu_blocks
+        if num_gpu_blocks:
+            num_gpu_blocks //= pipeline_parallel_size
+
+        num_cpu_blocks = cache_config.num_cpu_blocks
+        if num_cpu_blocks:
+            num_cpu_blocks //= pipeline_parallel_size
+
         # Create the block space manager.
         self.block_manager = BlockSpaceManagerImpl(
             block_size=self.cache_config.block_size,
-            num_gpu_blocks=self.cache_config.num_gpu_blocks,
-            num_cpu_blocks=self.cache_config.num_cpu_blocks,
+            num_gpu_blocks=num_gpu_blocks,
+            num_cpu_blocks=num_cpu_blocks,
             sliding_window=self.cache_config.sliding_window,
             enable_caching=self.cache_config.enable_prefix_caching)
 
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 4ebb8703e..faf9177ad 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -416,7 +416,7 @@ class GroupCoordinator:
 
         assert dst < self.world_size, f"Invalid dst rank ({dst})"
 
-        assert dst != self.rank, (
+        assert dst != self.rank_in_group, (
             "Invalid destination rank. Destination rank is the same "
             "as the current rank.")
 
@@ -446,7 +446,7 @@ class GroupCoordinator:
 
         assert src < self.world_size, f"Invalid src rank ({src})"
 
-        assert src != self.rank, (
+        assert src != self.rank_in_group, (
             "Invalid source rank. Source rank is the same as the current rank."
         )
 
@@ -454,7 +454,7 @@ class GroupCoordinator:
 
         # Receive object size
         rank_size = torch.distributed.recv(size_tensor,
-                                           src=src,
+                                           src=self.ranks[src],
                                            group=self.cpu_group)
 
         # Tensor to receive serialized objects into.
@@ -464,7 +464,7 @@ class GroupCoordinator:
             device="cpu")
 
         rank_object = torch.distributed.recv(object_tensor,
-                                             src=src,
+                                             src=self.ranks[src],
                                              group=self.cpu_group)
 
         assert rank_object == rank_size, (
@@ -491,10 +491,9 @@ class GroupCoordinator:
         group = self.device_group
         metadata_group = self.cpu_group
         assert src < self.world_size, f"Invalid src rank ({src})"
-        src = self.ranks[src]
 
-        rank = self.rank
-        if rank == src:
+        rank_in_group = self.rank_in_group
+        if rank_in_group == src:
             metadata_list: List[Tuple[Any, Any]] = []
             assert isinstance(
                 tensor_dict,
@@ -512,13 +511,13 @@ class GroupCoordinator:
                 if tensor.is_cpu:
                     # use metadata_group for CPU tensors
                     handle = torch.distributed.broadcast(tensor,
-                                                         src=src,
+                                                         src=self.ranks[src],
                                                          group=metadata_group,
                                                          async_op=True)
                 else:
                     # use group for GPU tensors
                     handle = torch.distributed.broadcast(tensor,
-                                                         src=src,
+                                                         src=self.ranks[src],
                                                          group=group,
                                                          async_op=True)
                 async_handles.append(handle)
@@ -542,15 +541,16 @@ class GroupCoordinator:
                         # use metadata_group for CPU tensors
                         handle = torch.distributed.broadcast(
                             tensor,
-                            src=src,
+                            src=self.ranks[src],
                             group=metadata_group,
                             async_op=True)
                     else:
                         # use group for GPU tensors
-                        handle = torch.distributed.broadcast(tensor,
-                                                             src=src,
-                                                             group=group,
-                                                             async_op=True)
+                        handle = torch.distributed.broadcast(
+                            tensor,
+                            src=self.ranks[src],
+                            group=group,
+                            async_op=True)
                     async_handles.append(handle)
                     _update_nested_dict(tensor_dict, key, tensor)
                 else:
@@ -575,7 +575,7 @@ class GroupCoordinator:
         metadata_group = self.cpu_group
 
         if dst is None:
-            dst = self.next_rank
+            dst = (self.rank_in_group + 1) % self.world_size
         assert dst < self.world_size, f"Invalid dst rank ({dst})"
 
         metadata_list: List[Tuple[Any, Any]] = []
@@ -593,10 +593,14 @@ class GroupCoordinator:
                 continue
             if tensor.is_cpu:
                 # use metadata_group for CPU tensors
-                torch.distributed.send(tensor, dst=dst, group=metadata_group)
+                torch.distributed.send(tensor,
+                                       dst=self.ranks[dst],
+                                       group=metadata_group)
             else:
                 # use group for GPU tensors
-                torch.distributed.send(tensor, dst=dst, group=group)
+                torch.distributed.send(tensor,
+                                       dst=self.ranks[dst],
+                                       group=group)
         return None
 
     def recv_tensor_dict(
@@ -614,7 +618,7 @@ class GroupCoordinator:
         metadata_group = self.cpu_group
 
         if src is None:
-            src = self.prev_rank
+            src = (self.rank_in_group - 1) % self.world_size
         assert src < self.world_size, f"Invalid src rank ({src})"
 
         recv_metadata_list = self.recv_object(src=src)
@@ -631,11 +635,13 @@ class GroupCoordinator:
                 if tensor.is_cpu:
                     # use metadata_group for CPU tensors
                     torch.distributed.recv(tensor,
-                                           src=src,
+                                           src=self.ranks[src],
                                            group=metadata_group)
                 else:
                     # use group for GPU tensors
-                    torch.distributed.recv(tensor, src=src, group=group)
+                    torch.distributed.recv(tensor,
+                                           src=self.ranks[src],
+                                           group=group)
                 _update_nested_dict(tensor_dict, key, tensor)
             else:
                 _update_nested_dict(tensor_dict, key, value)
@@ -654,7 +660,7 @@ class GroupCoordinator:
         """Sends a tensor to the destination rank in a non-blocking way"""
         """NOTE: `dst` is the local rank of the destination rank."""
         if dst is None:
-            dst = self.next_rank
+            dst = (self.rank_in_group + 1) % self.world_size
 
         pynccl_comm = self.pynccl_comm
         if pynccl_comm is not None and not pynccl_comm.disabled:
@@ -669,7 +675,7 @@ class GroupCoordinator:
         """Receives a tensor from the src rank."""
         """NOTE: `src` is the local rank of the destination rank."""
         if src is None:
-            src = self.prev_rank
+            src = (self.rank_in_group - 1) % self.world_size
 
         tensor = torch.empty(size, dtype=dtype, device=self.device)
         pynccl_comm = self.pynccl_comm
diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index 0cd420c8e..4e4206e58 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -2,7 +2,7 @@
 # Adapted from
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-from typing import Sequence
+from typing import Sequence, Tuple
 
 import torch
 
@@ -46,3 +46,12 @@ def split_tensor_along_last_dim(
         return tuple(chunk.contiguous() for chunk in tensor_list)
 
     return tensor_list
+
+
+def get_pp_indices(num_hidden_layers: int, pp_rank: int,
+                   pp_size: int) -> Tuple[int, int]:
+    layers_per_partition = divide(num_hidden_layers, pp_size)
+    start_layer = pp_rank * layers_per_partition
+    end_layer = start_layer + layers_per_partition
+
+    return (start_layer, end_layer)
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 7db3bb28c..0ce511ce4 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -211,7 +211,8 @@ class _AsyncLLMEngine(LLMEngine):
     """Extension of LLMEngine to add async methods."""
 
     async def step_async(
-            self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
+        self, virtual_engine: int
+    ) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
         """Performs one decoding iteration and returns newly generated results.
         The workers are ran asynchronously if possible.
 
@@ -221,7 +222,8 @@ class _AsyncLLMEngine(LLMEngine):
         and updates the scheduler with the model outputs. Finally, it decodes
         the sequences and returns the newly generated results.
         """
-        seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
+        seq_group_metadata_list, scheduler_outputs = self.scheduler[
+            virtual_engine].schedule()
 
         if not scheduler_outputs.is_empty():
             # Execute the model.
@@ -230,6 +232,7 @@ class _AsyncLLMEngine(LLMEngine):
                 blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
                 blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
                 blocks_to_copy=scheduler_outputs.blocks_to_copy,
+                virtual_engine=virtual_engine,
                 num_lookahead_slots=scheduler_outputs.num_lookahead_slots,
                 running_queue_size=scheduler_outputs.running_queue_size,
             )
@@ -248,16 +251,12 @@ class _AsyncLLMEngine(LLMEngine):
         # Tracing
         self.do_tracing(scheduler_outputs)
 
-        if not request_outputs:
-            # Stop the execute model loop in parallel workers until there are
-            # more requests to process. This avoids waiting indefinitely in
-            # torch.distributed ops which may otherwise timeout, and unblocks
-            # the RPC thread in the workers so that they can process any other
-            # queued control plane messages, such as add/remove lora adapters.
-            await self.model_executor.stop_remote_worker_execution_loop_async()
-
         return request_outputs
 
+    async def stop_remote_worker_execution_loop_async(self) -> None:
+        """Stop the remote worker execution loop."""
+        await self.model_executor.stop_remote_worker_execution_loop_async()
+
     async def process_model_inputs_async(
         self,
         request_id: str,
@@ -491,7 +490,8 @@ class AsyncLLMEngine:
             # order of the arguments.
             cache_config = kwargs["cache_config"]
             parallel_config = kwargs["parallel_config"]
-            if parallel_config.tensor_parallel_size == 1:
+            if (parallel_config.tensor_parallel_size == 1
+                    and parallel_config.pipeline_parallel_size == 1):
                 num_gpus = cache_config.gpu_memory_utilization
             else:
                 num_gpus = 1
@@ -499,7 +499,7 @@ class AsyncLLMEngine:
                 self._engine_class).remote
         return engine_class(*args, **kwargs)
 
-    async def engine_step(self) -> bool:
+    async def engine_step(self, virtual_engine: int) -> bool:
         """Kick the engine to process the waiting requests.
 
         Returns True if there are in-progress requests."""
@@ -530,7 +530,7 @@ class AsyncLLMEngine:
         if self.engine_use_ray:
             request_outputs = await self.engine.step.remote()  # type: ignore
         else:
-            request_outputs = await self.engine.step_async()
+            request_outputs = await self.engine.step_async(virtual_engine)
 
         # Put the outputs into the corresponding streams.
         for request_output in request_outputs:
@@ -546,18 +546,65 @@ class AsyncLLMEngine:
             self.engine.abort_request(request_ids)
 
     async def run_engine_loop(self):
-        has_requests_in_progress = False
+        if self.engine_use_ray:
+            pipeline_parallel_size = 1  # type: ignore
+        else:
+            pipeline_parallel_size = \
+                self.engine.parallel_config.pipeline_parallel_size
+        has_requests_in_progress = [False] * pipeline_parallel_size
         while True:
-            if not has_requests_in_progress:
+            if not any(has_requests_in_progress):
                 logger.debug("Waiting for new requests...")
+                # Stop the execute model loop in parallel workers until there
+                # are more requests to process. This avoids waiting
+                # indefinitely in torch.distributed ops which may otherwise
+                # timeout, and unblocks the RPC thread in the workers so that
+                # they can process any other queued control plane messages,
+                # such as add/remove lora adapters.
+                if self.engine_use_ray:
+                    await (self.engine.stop_remote_worker_execution_loop.
+                           remote()  # type: ignore
+                           )
+                else:
+                    await self.engine.stop_remote_worker_execution_loop_async()
                 await self._request_tracker.wait_for_new_requests()
                 logger.debug("Got new requests!")
+                requests_in_progress = [
+                    asyncio.create_task(self.engine_step(ve))
+                    for ve in range(pipeline_parallel_size)
+                ]
+                has_requests_in_progress = [True] * pipeline_parallel_size
 
             # Abort if iteration takes too long due to unrecoverable errors
             # (eg. NCCL timeouts).
             try:
                 async with asyncio_timeout(ENGINE_ITERATION_TIMEOUT_S):
-                    has_requests_in_progress = await self.engine_step()
+                    done, _ = await asyncio.wait(
+                        requests_in_progress,
+                        return_when=asyncio.FIRST_COMPLETED)
+                    for _ in range(pipeline_parallel_size):
+                        await asyncio.sleep(0)
+                for task in done:
+                    result = task.result()
+                    virtual_engine = requests_in_progress.index(task)
+                    if self.engine_use_ray:
+                        has_unfinished_requests = (
+                            await (self.engine.
+                                   has_unfinished_requests_for_virtual_engine.
+                                   remote(  # type: ignore
+                                       virtual_engine)))
+                    else:
+                        has_unfinished_requests = (
+                            self.engine.
+                            has_unfinished_requests_for_virtual_engine(
+                                virtual_engine))
+                    if result or has_unfinished_requests:
+                        requests_in_progress[virtual_engine] = (
+                            asyncio.create_task(
+                                self.engine_step(virtual_engine)))
+                        has_requests_in_progress[virtual_engine] = True
+                    else:
+                        has_requests_in_progress[virtual_engine] = False
             except asyncio.TimeoutError as exc:
                 logger.error(
                     "Engine iteration timed out. This should never happen!")
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index c13b17471..a79057005 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -173,6 +173,7 @@ class LLMEngine:
             "rope_scaling=%r, rope_theta=%r, tokenizer_revision=%s, "
             "trust_remote_code=%s, dtype=%s, max_seq_len=%d, "
             "download_dir=%r, load_format=%s, tensor_parallel_size=%d, "
+            "pipeline_parallel_size=%d, "
             "disable_custom_all_reduce=%s, quantization=%s, "
             "enforce_eager=%s, kv_cache_dtype=%s, "
             "quantization_param_path=%s, device_config=%s, "
@@ -195,6 +196,7 @@ class LLMEngine:
             load_config.download_dir,
             load_config.load_format,
             parallel_config.tensor_parallel_size,
+            parallel_config.pipeline_parallel_size,
             parallel_config.disable_custom_all_reduce,
             model_config.quantization,
             model_config.enforce_eager,
@@ -296,7 +298,11 @@ class LLMEngine:
         # Create the scheduler.
         # NOTE: the cache_config here have been updated with the numbers of
         # GPU and CPU blocks, which are profiled in the distributed executor.
-        self.scheduler = Scheduler(scheduler_config, cache_config, lora_config)
+        self.scheduler = [
+            Scheduler(scheduler_config, cache_config, lora_config,
+                      parallel_config.pipeline_parallel_size)
+            for _ in range(parallel_config.pipeline_parallel_size)
+        ]
 
         # Metric Logging.
         if self.log_stats:
@@ -513,8 +519,16 @@ class LLMEngine:
             raise ValueError(
                 "Either SamplingParams or PoolingParams must be provided.")
 
-        # Add the sequence group to the scheduler.
-        self.scheduler.add_seq_group(seq_group)
+        # Add the sequence group to the scheduler with least unfinished seqs.
+        costs = [
+            scheduler.get_num_unfinished_seq_groups()
+            for scheduler in self.scheduler
+        ]
+        min_cost_scheduler = self.scheduler[costs.index(min(costs))]
+        min_cost_scheduler.add_seq_group(seq_group)
+
+    def stop_remote_worker_execution_loop(self) -> None:
+        self.model_executor.stop_remote_worker_execution_loop()
 
     def process_model_inputs(
         self,
@@ -684,7 +698,8 @@ class LLMEngine:
             >>> # abort the request
             >>> engine.abort_request(request_id)
         """
-        self.scheduler.abort_seq_group(request_id)
+        for scheduler in self.scheduler:
+            scheduler.abort_seq_group(request_id)
 
     def get_model_config(self) -> ModelConfig:
         """Gets the model configuration."""
@@ -696,11 +711,20 @@ class LLMEngine:
 
     def get_num_unfinished_requests(self) -> int:
         """Gets the number of unfinished requests."""
-        return self.scheduler.get_num_unfinished_seq_groups()
+        return sum(scheduler.get_num_unfinished_seq_groups()
+                   for scheduler in self.scheduler)
 
     def has_unfinished_requests(self) -> bool:
         """Returns True if there are unfinished requests."""
-        return self.scheduler.has_unfinished_seqs()
+        return any(scheduler.has_unfinished_seqs()
+                   for scheduler in self.scheduler)
+
+    def has_unfinished_requests_for_virtual_engine(
+            self, virtual_engine: int) -> bool:
+        """
+        Returns True if there are unfinished requests for the virtual engine.
+        """
+        return self.scheduler[virtual_engine].has_unfinished_seqs()
 
     def _process_sequence_group_outputs(
         self,
@@ -749,7 +773,8 @@ class LLMEngine:
                 self.output_processor.process_outputs(seq_group, outputs)
 
         # Free the finished sequence groups.
-        self.scheduler.free_finished_seq_groups()
+        for scheduler in self.scheduler:
+            scheduler.free_finished_seq_groups()
 
         # Create the outputs.
         request_outputs: List[Union[RequestOutput,
@@ -815,7 +840,12 @@ class LLMEngine:
             >>>     if not (engine.has_unfinished_requests() or example_inputs):
             >>>         break
         """
-        seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
+        if self.parallel_config.pipeline_parallel_size > 1:
+            raise NotImplementedError(
+                "Pipeline parallelism is only supported through AsyncLLMEngine "
+                "as performance will be severely degraded otherwise.")
+        seq_group_metadata_list, scheduler_outputs = self.scheduler[
+            0].schedule()
 
         if not scheduler_outputs.is_empty():
             execute_model_req = ExecuteModelRequest(
@@ -886,23 +916,28 @@ class LLMEngine:
 
         # System State
         #   Scheduler State
-        num_running_sys = len(self.scheduler.running)
-        num_swapped_sys = len(self.scheduler.swapped)
-        num_waiting_sys = len(self.scheduler.waiting)
+        num_running_sys = sum(
+            len(scheduler.running) for scheduler in self.scheduler)
+        num_swapped_sys = sum(
+            len(scheduler.swapped) for scheduler in self.scheduler)
+        num_waiting_sys = sum(
+            len(scheduler.waiting) for scheduler in self.scheduler)
 
         # KV Cache Usage in %
         num_total_gpu = self.cache_config.num_gpu_blocks
         gpu_cache_usage_sys = 0.
         if num_total_gpu is not None:
-            num_free_gpu = self.scheduler.block_manager.get_num_free_gpu_blocks(
-            )
+            num_free_gpu = sum(
+                scheduler.block_manager.get_num_free_gpu_blocks()
+                for scheduler in self.scheduler)
             gpu_cache_usage_sys = 1.0 - (num_free_gpu / num_total_gpu)
 
         num_total_cpu = self.cache_config.num_cpu_blocks
         cpu_cache_usage_sys = 0.
         if num_total_cpu is not None and num_total_cpu > 0:
-            num_free_cpu = self.scheduler.block_manager.get_num_free_cpu_blocks(
-            )
+            num_free_cpu = sum(
+                scheduler.block_manager.get_num_free_cpu_blocks()
+                for scheduler in self.scheduler)
             cpu_cache_usage_sys = 1.0 - (num_free_cpu / num_total_cpu)
 
         # Iteration stats
diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py
index 9ddb6a364..92aecebe6 100644
--- a/vllm/engine/output_processor/interfaces.py
+++ b/vllm/engine/output_processor/interfaces.py
@@ -27,7 +27,7 @@ class SequenceGroupOutputProcessor(ABC):
     def create_output_processor(
         scheduler_config: SchedulerConfig,
         detokenizer: Detokenizer,
-        scheduler: Scheduler,
+        scheduler: List[Scheduler],
         seq_counter: Counter,
         get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer],
         stop_checker: "StopChecker",
diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py
index 8512ff83e..25d15df9f 100644
--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -34,7 +34,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
     def __init__(
         self,
         detokenizer: Detokenizer,
-        scheduler: Scheduler,
+        scheduler: List[Scheduler],
         seq_counter: Counter,
         get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer],
         stop_checker: StopChecker,
@@ -141,4 +141,5 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
                 break
 
         if seq.is_finished():
-            self.scheduler.free_seq(seq)
+            for scheduler in self.scheduler:
+                scheduler.free_seq(seq)
diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py
index 07a68c65a..fa672e1fe 100644
--- a/vllm/engine/output_processor/single_step.py
+++ b/vllm/engine/output_processor/single_step.py
@@ -33,7 +33,7 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
         self,
         scheduler_config: SchedulerConfig,
         detokenizer: Detokenizer,
-        scheduler: Scheduler,
+        scheduler: List[Scheduler],
         seq_counter: Counter,
         stop_checker: StopChecker,
     ):
@@ -95,7 +95,8 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
                 # not be used in the future iterations.
                 parent.status = SequenceStatus.FINISHED_ABORTED
                 seq_group.remove(parent.seq_id)
-                self.scheduler.free_seq(parent)
+                for scheduler in self.scheduler:
+                    scheduler.free_seq(parent)
                 continue
             # Fork the parent sequence if there are multiple child samples.
             for child_sample in child_samples[:-1]:
@@ -133,7 +134,8 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
                 if seq is not parent:
                     seq_group.add(seq)
                     if not seq.is_finished():
-                        self.scheduler.fork_seq(parent, seq)
+                        for scheduler in self.scheduler:
+                            scheduler.fork_seq(parent, seq)
 
             # Free the finished and selected parent sequences' memory in block
             # manager. Keep them in the sequence group as candidate output.
@@ -141,7 +143,8 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
             # old sequences.
             for seq, parent in child_seqs:
                 if seq is parent and seq.is_finished():
-                    self.scheduler.free_seq(seq)
+                    for scheduler in self.scheduler:
+                        scheduler.free_seq(seq)
             return
 
         # Beam search case
@@ -226,13 +229,15 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
             if seq is not parent:
                 seq_group.add(seq)
                 if not seq.is_finished():
-                    self.scheduler.fork_seq(parent, seq)
+                    for scheduler in self.scheduler:
+                        scheduler.fork_seq(parent, seq)
 
         # Free the finished and selected parent sequences' memory in block
         # manager. Keep them in the sequence group as candidate output.
         for seq, parent in selected_child_seqs:
             if seq is parent and seq.is_finished():
-                self.scheduler.free_seq(seq)
+                for scheduler in self.scheduler:
+                    scheduler.free_seq(seq)
 
         # Remove the unselected parent sequences from the sequence group and
         # free their memory in block manager.
@@ -241,7 +246,8 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
                 # Remove the parent sequence if it is not selected for next
                 # iteration
                 seq_group.remove(seq.seq_id)
-                self.scheduler.free_seq(seq)
+                for scheduler in self.scheduler:
+                    scheduler.free_seq(seq)
 
     def _check_beam_search_early_stopping(
         self,
diff --git a/vllm/executor/distributed_gpu_executor.py b/vllm/executor/distributed_gpu_executor.py
index d8693e636..3db82eb1f 100644
--- a/vllm/executor/distributed_gpu_executor.py
+++ b/vllm/executor/distributed_gpu_executor.py
@@ -69,7 +69,7 @@ class DistributedGPUExecutor(GPUExecutor):
         if self.parallel_worker_tasks is None:
             self.parallel_worker_tasks = self._run_workers(
                 "start_worker_execution_loop",
-                async_run_remote_workers_only=True,
+                async_run_tensor_parallel_workers_only=True,
                 **self.extra_execute_model_run_workers_kwargs)
 
         # Only the driver worker returns the sampling results.
@@ -138,17 +138,17 @@ class DistributedGPUExecutor(GPUExecutor):
         self,
         method: str,
         *args,
-        async_run_remote_workers_only: bool = False,
+        async_run_tensor_parallel_workers_only: bool = False,
         max_concurrent_workers: Optional[int] = None,
         **kwargs,
     ) -> Any:
         """Runs the given method on all workers.
 
         Args:
-            async_run_remote_workers_only: If True the method will be run only
-                in the remote workers, not the driver worker. It will also be
-                run asynchronously and return a list of futures rather than
-                blocking on the results.
+            async_run_tensor_parallel_workers_only: If True the method will be
+                run only in the remote TP workers, not the driver worker.
+                It will also be run asynchronously and return a list of futures
+                rather than blocking on the results.
         """
         raise NotImplementedError
 
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index d7c19622e..9018c3295 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -1,3 +1,4 @@
+import asyncio
 from abc import ABC, abstractmethod
 from typing import List, Optional, Set, Tuple
 
@@ -110,6 +111,30 @@ class ExecutorBase(ABC):
 
 class ExecutorAsyncBase(ExecutorBase):
 
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        cache_config: CacheConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        load_config: LoadConfig,
+        lora_config: Optional[LoRAConfig],
+        vision_language_config: Optional[VisionLanguageConfig],
+        speculative_config: Optional[SpeculativeConfig],
+    ) -> None:
+        # This locks each pipeline parallel stage so multiple virtual engines
+        # can't execute on the same stage at the same time
+        self.pp_locks = [
+            asyncio.Lock()
+            for _ in range(parallel_config.pipeline_parallel_size)
+        ]
+
+        super().__init__(model_config, cache_config, parallel_config,
+                         scheduler_config, device_config, load_config,
+                         lora_config, vision_language_config,
+                         speculative_config)
+
     @abstractmethod
     async def execute_model_async(
             self,
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index 5522b5322..c2910ccdc 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -45,7 +45,8 @@ class GPUExecutor(ExecutorBase):
             lora_config=self.lora_config,
             vision_language_config=self.vision_language_config,
             speculative_config=self.speculative_config,
-            is_driver_worker=rank == 0,
+            is_driver_worker=(not self.parallel_config)
+            or (rank % self.parallel_config.tensor_parallel_size == 0),
         )
 
     def _create_worker(self,
diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py
index 6aebb4702..5bfeac0cf 100644
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/multiproc_gpu_executor.py
@@ -91,17 +91,17 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
         self,
         method: str,
         *args,
-        async_run_remote_workers_only: bool = False,
+        async_run_tensor_parallel_workers_only: bool = False,
         max_concurrent_workers: Optional[int] = None,
         **kwargs,
     ) -> Any:
         """Runs the given method on all workers.
 
         Args:
-            async_run_remote_workers_only: If True the method will be run only
-                in the remote workers, not the driver worker. It will also be
-                run asynchronously and return a list of futures rather than
-                blocking on the results.
+            async_run_tensor_parallel_workers_only: If True the method will be
+                run only in the remote TP workers, not the driver worker.
+                It will also be run asynchronously and return a list of futures
+                rather than blocking on the results.
         """
 
         if max_concurrent_workers:
@@ -114,7 +114,7 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
             for worker in self.workers
         ]
 
-        if async_run_remote_workers_only:
+        if async_run_tensor_parallel_workers_only:
             # Just return futures
             return worker_outputs
 
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index faa500c2d..e742d11bb 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -62,7 +62,8 @@ class RayGPUExecutor(DistributedGPUExecutor):
 
     def _init_workers_ray(self, placement_group: "PlacementGroup",
                           **ray_remote_kwargs):
-        if self.parallel_config.tensor_parallel_size == 1:
+        if (self.parallel_config.tensor_parallel_size == 1
+                and self.parallel_config.pipeline_parallel_size == 1):
             # For single GPU case, we use a ray worker with constrained memory.
             num_gpus = self.cache_config.gpu_memory_utilization
         else:
@@ -189,6 +190,26 @@ class RayGPUExecutor(DistributedGPUExecutor):
                           max_concurrent_workers=self.parallel_config.
                           max_parallel_loading_workers)
 
+        # This is the list of workers that are rank 0 of each TP group EXCEPT
+        # global rank 0. These are the workers that will broadcast to the
+        # rest of the workers.
+        self.tp_driver_workers: List[RayWorkerWrapper] = []
+        # This is the list of workers that are not drivers and not the first
+        # worker in a TP group. These are the workers that will be
+        # broadcasted to.
+        self.non_driver_workers: List[RayWorkerWrapper] = []
+
+        for pp_rank in range(self.parallel_config.pipeline_parallel_size):
+            for tp_rank in range(self.parallel_config.tensor_parallel_size):
+                rank = (pp_rank *
+                        self.parallel_config.tensor_parallel_size) + tp_rank
+                if rank == 0:
+                    pass
+                elif rank % self.parallel_config.tensor_parallel_size == 0:
+                    self.tp_driver_workers.append(self.workers[rank - 1])
+                else:
+                    self.non_driver_workers.append(self.workers[rank - 1])
+
     def _driver_execute_model(
         self, execute_model_req: Optional[ExecuteModelRequest]
     ) -> Optional[List[SamplerOutput]]:
@@ -204,7 +225,7 @@ class RayGPUExecutor(DistributedGPUExecutor):
         self,
         method: str,
         *args,
-        async_run_remote_workers_only: bool = False,
+        async_run_tensor_parallel_workers_only: bool = False,
         all_args: Optional[List[Tuple[Any, ...]]] = None,
         all_kwargs: Optional[List[Dict[str, Any]]] = None,
         use_dummy_driver: bool = False,
@@ -215,10 +236,11 @@ class RayGPUExecutor(DistributedGPUExecutor):
         """Runs the given method on all workers. Can be used in the following
         ways:
 
-        - async_run_remote_workers_only: If True the method will be run only
-          in the remote workers, not the driver worker. It will also be
-          run asynchronously and return a list of futures rather than blocking
-          on the results.
+        Args:
+        - async_run_tensor_parallel_workers_only: If True the method will be
+          run only in the remote TP workers, not the driver worker.
+          It will also be run asynchronously and return a list of futures
+          rather than blocking on the results.
         - args/kwargs: All workers share the same args/kwargs
         - all_args/all_kwargs: args/kwargs for each worker are specified
           individually
@@ -228,7 +250,9 @@ class RayGPUExecutor(DistributedGPUExecutor):
             raise NotImplementedError(
                 "max_concurrent_workers is not supported yet.")
 
-        count = len(self.workers)
+        count = len(self.workers) if not \
+            async_run_tensor_parallel_workers_only \
+            else len(self.non_driver_workers)
         all_worker_args = repeat(args, count) if all_args is None \
             else islice(all_args, 1, None)
         all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
@@ -242,14 +266,17 @@ class RayGPUExecutor(DistributedGPUExecutor):
             ray_worker_outputs = []
         else:
             # Start the ray workers first.
+            ray_workers = self.workers
+            if async_run_tensor_parallel_workers_only:
+                ray_workers = self.non_driver_workers
             ray_worker_outputs = [
                 worker.execute_method.remote(method, *worker_args,
                                              **worker_kwargs)
                 for (worker, worker_args, worker_kwargs
-                     ) in zip(self.workers, all_worker_args, all_worker_kwargs)
+                     ) in zip(ray_workers, all_worker_args, all_worker_kwargs)
             ]
 
-        if async_run_remote_workers_only:
+        if async_run_tensor_parallel_workers_only:
             # Just return futures
             return ray_worker_outputs
 
@@ -319,12 +346,32 @@ class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync):
         self,
         execute_model_req: Optional[ExecuteModelRequest] = None
     ) -> List[SamplerOutput]:
-        return await self.driver_exec_method("execute_model",
-                                             execute_model_req)
+
+        async def _run_task_with_lock(task, lock, *args, **kwargs):
+            async with lock:
+                return await task(*args, **kwargs)
+
+        tasks = []
+        tasks.append(
+            asyncio.create_task(
+                _run_task_with_lock(self.driver_exec_method, self.pp_locks[0],
+                                    "execute_model", execute_model_req)))
+        for pp_rank, driver_worker in enumerate(self.tp_driver_workers,
+                                                start=1):
+            tasks.append(
+                asyncio.create_task(
+                    _run_task_with_lock(driver_worker.execute_method.remote,
+                                        self.pp_locks[pp_rank],
+                                        "execute_model", execute_model_req)))
+
+        results = await asyncio.gather(*tasks)
+
+        # Only the last PP stage has the final results.
+        return results[-1]
 
     async def _start_worker_execution_loop(self):
         coros = [
             worker.execute_method.remote("start_worker_execution_loop")
-            for worker in self.workers
+            for worker in self.non_driver_workers
         ]
         return await asyncio.gather(*coros)
diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py
index 577761107..fec52e016 100644
--- a/vllm/model_executor/models/arctic.py
+++ b/vllm/model_executor/models/arctic.py
@@ -29,7 +29,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 from vllm.transformers_utils.configs.arctic import ArcticConfig
 
 logger = init_logger(__name__)
@@ -426,6 +426,7 @@ class ArcticForCausalLM(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata)
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index 5cf5a199b..ddc4e9084 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -43,7 +43,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 from .interfaces import SupportsLoRA
 
@@ -338,6 +338,7 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata)
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index a29aee4cf..8387c8e37 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -39,7 +39,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 
 def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
@@ -286,6 +286,7 @@ class BloomForCausalLM(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.transformer(input_ids, positions, kv_caches,
                                          attn_metadata)
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index 5b5a69447..e6012a6d4 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -25,7 +25,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 from vllm.transformers_utils.configs import ChatGLMConfig
 
 from .interfaces import SupportsLoRA
@@ -365,6 +365,7 @@ class ChatGLMForCausalLM(nn.Module, SupportsLoRA):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.transformer(input_ids, positions, kv_caches,
                                          attn_metadata)
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 600c2990b..2961f421e 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -46,7 +46,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 
 @torch.compile
@@ -353,6 +353,7 @@ class CohereForCausalLM(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata)
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index 59af42445..210cf6165 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -23,7 +23,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 from vllm.transformers_utils.configs.dbrx import DbrxConfig
 
 
@@ -381,6 +381,7 @@ class DbrxForCausalLM(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.transformer(input_ids, positions, kv_caches,
                                          attn_metadata)
diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py
index 8fbda2638..e9ceca9b1 100644
--- a/vllm/model_executor/models/deepseek.py
+++ b/vllm/model_executor/models/deepseek.py
@@ -48,7 +48,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 
 class DeepseekMLP(nn.Module):
@@ -387,6 +387,7 @@ class DeepseekForCausalLM(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata)
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 3d4f78c66..3cf62afd9 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -48,7 +48,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 
 class DeepseekV2MLP(nn.Module):
@@ -475,6 +475,7 @@ class DeepseekV2ForCausalLM(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata)
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 9618652f7..89b0bbf01 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -44,7 +44,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 from vllm.transformers_utils.configs import RWConfig
 
 FalconConfig = Union[HF_FalconConfig, RWConfig]
@@ -410,6 +410,7 @@ class FalconForCausalLM(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.transformer(
             input_ids,
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index efefb3481..0a5a7ed3d 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -39,7 +39,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 from .interfaces import SupportsLoRA
 
@@ -339,6 +339,7 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata)
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index 4e35a9ec3..1f921c8bd 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -37,7 +37,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 from vllm.utils import print_warning_once
 
 from .interfaces import SupportsLoRA
@@ -338,6 +338,7 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata)
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index cc83f6eb6..55f2e2741 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -17,7 +17,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only GPT-2 model compatible with HuggingFace weights."""
-from typing import Iterable, List, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple, Union
 
 import torch
 from torch import nn
@@ -25,7 +25,9 @@ from transformers import GPT2Config
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.distributed.parallel_state import (
+    get_pp_group, get_tensor_model_parallel_world_size)
+from vllm.distributed.utils import get_pp_indices
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                QKVParallelLinear,
@@ -38,7 +40,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 
 class GPT2Attention(nn.Module):
@@ -181,10 +183,18 @@ class GPT2Model(nn.Module):
         self.embed_dim = config.hidden_size
         self.wte = VocabParallelEmbedding(config.vocab_size, self.embed_dim)
         self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
-        self.h = nn.ModuleList([
-            GPT2Block(config, cache_config, quant_config)
-            for _ in range(config.num_hidden_layers)
-        ])
+        self.start_layer, self.end_layer = get_pp_indices(
+            config.num_hidden_layers,
+            get_pp_group().rank_in_group,
+            get_pp_group().world_size)
+        self.h = nn.ModuleList(
+            [nn.Identity() for _ in range(self.start_layer)] + [
+                GPT2Block(config, cache_config, quant_config)
+                for _ in range(self.start_layer, self.end_layer)
+            ] + [
+                nn.Identity()
+                for _ in range(self.end_layer, config.num_hidden_layers)
+            ])
         self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
 
     def forward(
@@ -193,14 +203,24 @@ class GPT2Model(nn.Module):
         position_ids: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
-    ) -> torch.Tensor:
-        inputs_embeds = self.wte(input_ids)
-        position_embeds = self.wpe(position_ids)
-        hidden_states = inputs_embeds + position_embeds
+        intermediate_tensors: Optional[IntermediateTensors],
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        if get_pp_group().is_first_rank:
+            inputs_embeds = self.wte(input_ids)
+            position_embeds = self.wpe(position_ids)
+            hidden_states = inputs_embeds + position_embeds
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
 
-        for i in range(len(self.h)):
+        for i in range(self.start_layer, self.end_layer):
             layer = self.h[i]
-            hidden_states = layer(hidden_states, kv_caches[i], attn_metadata)
+            hidden_states = layer(hidden_states,
+                                  kv_caches[i - self.start_layer],
+                                  attn_metadata)
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors({"hidden_states": hidden_states})
 
         hidden_states = self.ln_f(hidden_states)
         return hidden_states
@@ -228,9 +248,10 @@ class GPT2LMHeadModel(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.transformer(input_ids, positions, kv_caches,
-                                         attn_metadata)
+                                         attn_metadata, intermediate_tensors)
         return hidden_states
 
     def compute_logits(self, hidden_states: torch.Tensor,
@@ -247,6 +268,16 @@ class GPT2LMHeadModel(nn.Module):
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
+    def make_empty_intermediate_tensors(
+            self, batch_size: int, dtype: torch.dtype,
+            device: torch.device) -> IntermediateTensors:
+        return IntermediateTensors({
+            "hidden_states":
+            torch.zeros((batch_size, self.config.hidden_size),
+                        dtype=dtype,
+                        device=device),
+        })
+
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         for name, loaded_weight in weights:
@@ -260,16 +291,19 @@ class GPT2LMHeadModel(nn.Module):
                 continue
             if not name.startswith("transformer."):
                 name = "transformer." + name
-            param = params_dict[name]
-            # The HF's GPT-2 implementation uses Conv1D instead of Linear.
-            # Because of this, we need to transpose the weights.
-            # Note(zhuohan): the logic below might break quantized models.
-            for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
-                if conv1d_weight_name not in name:
-                    continue
-                if not name.endswith(".weight"):
-                    continue
-                loaded_weight = loaded_weight.t()
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
-            weight_loader(param, loaded_weight)
+            try:
+                param = params_dict[name]
+                # The HF's GPT-2 implementation uses Conv1D instead of Linear.
+                # Because of this, we need to transpose the weights.
+                # Note(zhuohan): the logic below might break quantized models.
+                for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
+                    if conv1d_weight_name not in name:
+                        continue
+                    if not name.endswith(".weight"):
+                        continue
+                    loaded_weight = loaded_weight.t()
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+            except KeyError:
+                continue
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index 17bbe4e31..7d0bf39c5 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -39,7 +39,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 from .interfaces import SupportsLoRA
 
@@ -273,6 +273,7 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.transformer(input_ids, positions, kv_caches,
                                          attn_metadata)
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 47fd5788a..de7f86af7 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -38,7 +38,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 
 class GPTJAttention(nn.Module):
@@ -239,6 +239,7 @@ class GPTJForCausalLM(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.transformer(input_ids, positions, kv_caches,
                                          attn_metadata)
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index eb0fcc8f2..3658b8fbf 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -38,7 +38,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 
 class GPTNeoXAttention(nn.Module):
@@ -251,6 +251,7 @@ class GPTNeoXForCausalLM(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.gpt_neox(input_ids, positions, kv_caches,
                                       attn_metadata)
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index e75c567f5..283bc064b 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -22,7 +22,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 
 class InternLM2MLP(nn.Module):
@@ -263,6 +263,7 @@ class InternLM2ForCausalLM(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: IntermediateTensors,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata)
diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py
index 869b8fc91..2758e2d0b 100644
--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -40,7 +40,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 from vllm.transformers_utils.configs import JAISConfig
 
 
@@ -289,6 +289,7 @@ class JAISLMHeadModel(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.transformer(input_ids, positions, kv_caches,
                                          attn_metadata)
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 54d01701f..af75b6bee 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -21,7 +21,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only LLaMA model compatible with HuggingFace weights."""
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import torch
 from torch import nn
@@ -29,7 +29,8 @@ from transformers import LlamaConfig
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, LoRAConfig
-from vllm.distributed import (get_tensor_model_parallel_rank,
+from vllm.distributed import (get_pp_group, get_pp_indices,
+                              get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size)
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -46,7 +47,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, kv_cache_scales_loader)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 from vllm.utils import is_hip, print_warning_once
 
 from .interfaces import SupportsLoRA
@@ -261,12 +262,20 @@ class LlamaModel(nn.Module):
             config.hidden_size,
             org_num_embeddings=config.vocab_size,
         )
-        self.layers = nn.ModuleList([
-            LlamaDecoderLayer(config=config,
-                              cache_config=cache_config,
-                              quant_config=quant_config)
-            for idx in range(config.num_hidden_layers)
-        ])
+        self.start_layer, self.end_layer = get_pp_indices(
+            config.num_hidden_layers,
+            get_pp_group().rank_in_group,
+            get_pp_group().world_size)
+        self.layers = nn.ModuleList(
+            [nn.Identity() for _ in range(self.start_layer)] + [
+                LlamaDecoderLayer(config=config,
+                                  cache_config=cache_config,
+                                  quant_config=quant_config)
+                for _ in range(self.start_layer, self.end_layer)
+            ] + [
+                nn.Identity()
+                for _ in range(self.end_layer, config.num_hidden_layers)
+            ])
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
@@ -278,22 +287,36 @@ class LlamaModel(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors],
         inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        if inputs_embeds is not None:
-            hidden_states = inputs_embeds
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.get_input_embeddings(input_ids)
+            residual = None
         else:
-            hidden_states = self.get_input_embeddings(input_ids)
-        residual = None
-        for i in range(len(self.layers)):
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        for i in range(self.start_layer, self.end_layer):
             layer = self.layers[i]
             hidden_states, residual = layer(
                 positions,
                 hidden_states,
-                kv_caches[i],
+                kv_caches[i - self.start_layer],
                 attn_metadata,
                 residual,
             )
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors({
+                "hidden_states": hidden_states,
+                "residual": residual
+            })
+
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
@@ -372,10 +395,11 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
-    ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, kv_caches,
-                                   attn_metadata)
-        return hidden_states
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        model_output = self.model(input_ids, positions, kv_caches,
+                                  attn_metadata, intermediate_tensors)
+        return model_output
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
@@ -391,6 +415,20 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
+    def make_empty_intermediate_tensors(
+            self, batch_size: int, dtype: torch.dtype,
+            device: torch.device) -> IntermediateTensors:
+        return IntermediateTensors({
+            "hidden_states":
+            torch.zeros((batch_size, self.config.hidden_size),
+                        dtype=dtype,
+                        device=device),
+            "residual":
+            torch.zeros((batch_size, self.config.hidden_size),
+                        dtype=dtype,
+                        device=device),
+        })
+
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
@@ -416,9 +454,12 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
+                try:
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param, loaded_weight, shard_id)
+                except KeyError:
+                    pass
                 break
             else:
                 # Skip loading extra bias for GPTQ models.
@@ -437,10 +478,13 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
                         continue
                     else:
                         name = remapped_kv_scale_name
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param, loaded_weight)
+                try:
+                    param = params_dict[name]
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+                    weight_loader(param, loaded_weight)
+                except KeyError:
+                    pass
 
     # If this function is called, it should always initialize KV cache scale
     # factors (or else raise an exception). Thus, handled exceptions should
@@ -452,7 +496,8 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
                 quantization_param_path, tp_rank, tp_size,
                 self.config.num_hidden_layers,
                 self.config.__class__.model_type):
-            layer_self_attn = self.model.layers[layer_idx].self_attn
+            if not isinstance(self.model.layers[layer_idx], nn.Identity):
+                layer_self_attn = self.model.layers[layer_idx].self_attn
 
             if is_hip():
                 # The scaling factor convention we are assuming is
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index e0134c5c4..39c47dddf 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -18,7 +18,7 @@ from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 from .clip import dummy_image_for_clip, dummy_seq_data_for_clip
 from .interfaces import SupportsVision
@@ -202,6 +202,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
         **kwargs: object,
     ) -> SamplerOutput:
         """Run forward pass for LLaVA-1.5.
@@ -247,6 +248,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
                                             positions,
                                             kv_caches,
                                             attn_metadata,
+                                            None,
                                             inputs_embeds=inputs_embeds)
 
         return hidden_states
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 3c0988137..8b078391b 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -22,7 +22,7 @@ from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip,
                    get_clip_patch_grid_length)
@@ -376,6 +376,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
         **kwargs: object,
     ) -> SamplerOutput:
         """Run forward pass for LlaVA-NeXT.
@@ -430,6 +431,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
                                             positions,
                                             kv_caches,
                                             attn_metadata,
+                                            None,
                                             inputs_embeds=inputs_embeds)
 
         return hidden_states
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index a76ed0498..330204327 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -50,7 +50,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 from .interfaces import SupportsLoRA
 
@@ -462,6 +462,7 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata)
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index a662db6d2..05c36b9c0 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -51,7 +51,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 from vllm.utils import print_warning_once
 
 from .interfaces import SupportsLoRA
@@ -536,6 +536,7 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata)
diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py
index 1894c05e1..dde2da20b 100644
--- a/vllm/model_executor/models/mixtral_quant.py
+++ b/vllm/model_executor/models/mixtral_quant.py
@@ -47,7 +47,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 
 class MixtralMLP(nn.Module):
@@ -354,6 +354,7 @@ class MixtralForCausalLM(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata)
diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py
index 5f9e4d86f..28dc5922c 100644
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -22,7 +22,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 from vllm.transformers_utils.configs.mpt import MPTConfig
 
 
@@ -273,6 +273,7 @@ class MPTForCausalLM(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.transformer(input_ids, positions, kv_caches,
                                          attn_metadata)
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index 39270f71e..53215f32b 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -43,7 +43,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 
 class OlmoAttention(nn.Module):
@@ -301,6 +301,7 @@ class OlmoForCausalLM(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(
             input_ids=input_ids,
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index 4bf59105d..d12a51af5 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -39,7 +39,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 
 class OPTLearnedPositionalEmbedding(nn.Embedding):
@@ -304,6 +304,7 @@ class OPTForCausalLM(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata)
diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py
index 133a10e6b..a298f0307 100644
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -26,7 +26,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 
 class OrionMLP(nn.Module):
@@ -269,6 +269,7 @@ class OrionForCausalLM(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata)
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index 008fceb62..cc8e31fe1 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -57,7 +57,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 from .interfaces import SupportsLoRA
 
@@ -278,6 +278,7 @@ class PhiForCausalLM(nn.Module, SupportsLoRA):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata)
diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py
index 0c5298eb6..706ae6520 100644
--- a/vllm/model_executor/models/phi3_small.py
+++ b/vllm/model_executor/models/phi3_small.py
@@ -21,7 +21,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 
 def load_column_parallel_weight(param: torch.nn.Parameter,
@@ -412,6 +412,7 @@ class Phi3SmallForCausalLM(nn.Module):
         positions: Optional[torch.LongTensor],
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         output_hidden_states = self.model(
             input_ids=input_ids,
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index a16f7f0ea..eff4e5029 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -35,7 +35,7 @@ from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 from .clip import dummy_image_for_clip, dummy_seq_data_for_clip
 from .interfaces import SupportsVision
@@ -381,9 +381,13 @@ class Phi3VForCausalLM(nn.Module, SupportsVision):
 
         return None
 
-    def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
+    def forward(self,
+                input_ids: torch.Tensor,
+                positions: torch.Tensor,
                 kv_caches: List[torch.Tensor],
-                attn_metadata: AttentionMetadata, **kwargs: object):
+                attn_metadata: AttentionMetadata,
+                intermediate_tensors: Optional[IntermediateTensors] = None,
+                **kwargs: object):
         image_input = self._parse_and_validate_image_input(**kwargs)
 
         if image_input is not None:
@@ -398,6 +402,7 @@ class Phi3VForCausalLM(nn.Module, SupportsVision):
                                    positions,
                                    kv_caches,
                                    attn_metadata,
+                                   intermediate_tensors,
                                    inputs_embeds=inputs_embeds)
 
         return hidden_states
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index b6ea6ab39..408c206c5 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -27,7 +27,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 from vllm.utils import print_warning_once
 
 
@@ -245,6 +245,7 @@ class QWenLMHeadModel(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.transformer(input_ids, positions, kv_caches,
                                          attn_metadata)
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index e2d725af6..3691a3d2e 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -45,7 +45,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 from vllm.utils import print_warning_once
 
 from .interfaces import SupportsLoRA
@@ -331,6 +331,7 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata)
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index 564536f2d..b3e7dfef9 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -50,7 +50,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 
 class Qwen2MoeMLP(nn.Module):
@@ -397,6 +397,7 @@ class Qwen2MoeForCausalLM(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata)
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
index a6ed3800b..1098b3031 100644
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -41,7 +41,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 
 class StablelmMLP(nn.Module):
@@ -250,6 +250,7 @@ class StablelmForCausalLM(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata)
diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py
index 4324bf50d..6f3d5d51d 100644
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -40,7 +40,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 
 class Starcoder2Attention(nn.Module):
@@ -262,6 +262,7 @@ class Starcoder2ForCausalLM(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata)
diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py
index b61721999..08d3efd33 100644
--- a/vllm/model_executor/models/xverse.py
+++ b/vllm/model_executor/models/xverse.py
@@ -43,7 +43,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplerOutput
+from vllm.sequence import IntermediateTensors, SamplerOutput
 
 from .interfaces import SupportsLoRA
 
@@ -320,6 +320,7 @@ class XverseForCausalLM(nn.Module, SupportsLoRA):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata)
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 3e7c31b8c..b036e76d7 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -770,6 +770,34 @@ class EmbeddingSequenceGroupOutput(SequenceGroupOutput):
         return self.embeddings == other.embeddings
 
 
+@dataclass
+class IntermediateTensors:
+    """For all pipeline stages except the last, we need to return the hidden
+    states and residuals to be sent to the next stage. This data structure
+    contains the hidden states and residuals for a request.
+    """
+
+    tensors: Dict[str, torch.Tensor]
+
+    def __getitem__(self, key: Union[str, slice]):
+        if isinstance(key, str):
+            return self.tensors[key]
+        elif isinstance(key, slice):
+            return self.__class__({k: v[key] for k, v in self.tensors.items()})
+
+    def __setitem__(self, key: str, value):
+        self.tensors[key] = value
+
+    def __len__(self):
+        return len(self.tensors)
+
+    def __eq__(self, other: object):
+        return isinstance(other, self.__class__) and self
+
+    def __repr__(self) -> str:
+        return f"IntermediateTensors(tensors={self.tensors})"
+
+
 @dataclass
 class SamplerOutput:
     """For each sequence group, we generate a list of SequenceOutput object,
@@ -896,6 +924,8 @@ class ExecuteModelRequest:
     blocks_to_swap_out: List[Tuple[int, int]] = field(default_factory=list)
     # Blocks to copy. Source to dest block.
     blocks_to_copy: List[Tuple[int, int]] = field(default_factory=list)
+    # Virtual engine ID for pipeline parallel.
+    virtual_engine: int = 0
     # The number of slots for lookahead decoding.
     num_lookahead_slots: int = 0
     # The number of requests in the running queue.
@@ -914,6 +944,7 @@ class ExecuteModelRequest:
             blocks_to_swap_in=self.blocks_to_swap_in.copy(),
             blocks_to_swap_out=self.blocks_to_swap_out.copy(),
             blocks_to_copy=self.blocks_to_copy.copy(),
+            virtual_engine=self.virtual_engine,
             num_lookahead_slots=self.num_lookahead_slots,
             running_queue_size=self.running_queue_size,
             previous_hidden_states=self.previous_hidden_states,
diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py
index f30d29376..b4c953162 100644
--- a/vllm/spec_decode/draft_model_runner.py
+++ b/vllm/spec_decode/draft_model_runner.py
@@ -6,7 +6,8 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, ParallelConfig, SchedulerConfig,
                          VisionLanguageConfig)
 from vllm.logger import init_logger
-from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.sequence import (IntermediateTensors, SamplerOutput,
+                           SequenceGroupMetadata)
 from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata,
                                       ModelRunner)
 
@@ -74,9 +75,9 @@ class TP1DraftModelRunner(ModelRunner):
             List[SequenceGroupMetadata]] = None
 
     def prepare_model_input(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> ModelInputForGPUWithSamplingMetadata:
+            self,
+            seq_group_metadata_list: List[SequenceGroupMetadata],
+            virtual_engine: int = 0) -> ModelInputForGPUWithSamplingMetadata:
         """A temporary solution that caches the seq_group_metadata_list
         for multi-step execution.
         TODO: In-place update model_input and remove this function.
@@ -115,6 +116,7 @@ class TP1DraftModelRunner(ModelRunner):
         self,
         model_input: ModelInputForGPUWithSamplingMetadata,
         kv_caches: List[torch.Tensor],
+        intermediate_tensors: Optional[IntermediateTensors] = None,
         num_steps: int = 1,
     ) -> Optional[List[SamplerOutput]]:
         # Since we do not broadcast data inside execute_model anymore,
@@ -130,6 +132,7 @@ class TP1DraftModelRunner(ModelRunner):
             self.set_active_loras(model_input.lora_requests,
                                   model_input.lora_mapping)
 
+        virtual_engine = model_input.virtual_engine
         outputs: List[SamplerOutput] = []
         for step in range(num_steps):
             # Currently cuda graph is only supported by the decode phase.
@@ -139,7 +142,8 @@ class TP1DraftModelRunner(ModelRunner):
             if prefill_meta is None and decode_meta.use_cuda_graph:
                 assert model_input.input_tokens is not None
                 graph_batch_size = model_input.input_tokens.shape[0]
-                model_executable = self.graph_runners[graph_batch_size]
+                model_executable = (
+                    self.graph_runners[virtual_engine][graph_batch_size])
             else:
                 model_executable = self.model
 
@@ -149,6 +153,7 @@ class TP1DraftModelRunner(ModelRunner):
                 positions=model_input.input_positions,
                 kv_caches=kv_caches,
                 attn_metadata=model_input.attn_metadata,
+                intermediate_tensors=intermediate_tensors,
                 **multi_modal_kwargs,
             )
 
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py
index fbd1343fe..891e74f8a 100644
--- a/vllm/worker/cache_engine.py
+++ b/vllm/worker/cache_engine.py
@@ -38,7 +38,11 @@ class CacheEngine:
 
         self.block_size = cache_config.block_size
         self.num_gpu_blocks = cache_config.num_gpu_blocks
+        if self.num_gpu_blocks:
+            self.num_gpu_blocks //= parallel_config.pipeline_parallel_size
         self.num_cpu_blocks = cache_config.num_cpu_blocks
+        if self.num_cpu_blocks:
+            self.num_cpu_blocks //= parallel_config.pipeline_parallel_size
 
         if cache_config.cache_dtype == "auto":
             self.dtype = model_config.dtype
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index b83cc6f09..f46e9e8ab 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -13,7 +13,8 @@ from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.sequence import (IntermediateTensors, SamplerOutput,
+                           SequenceGroupMetadata)
 from vllm.utils import make_tensor_with_pad
 from vllm.worker.model_runner_base import (
     ModelRunnerBase, ModelRunnerInputBase,
@@ -315,6 +316,7 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
     def prepare_model_input(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
+        virtual_engine: int = 0,
     ) -> CPUModelInput:
         multi_modal_kwargs = None
         # NOTE: We assume that all sequences in the group are all prompts or
@@ -351,6 +353,7 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
         self,
         model_input: CPUModelInput,
         kv_caches: List[torch.Tensor],
+        intermediate_tensors: Optional[IntermediateTensors] = None,
         num_steps: int = 1,
     ) -> Optional[List[SamplerOutput]]:
         if num_steps > 1:
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 30ee262c7..8089abd69 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -167,8 +167,8 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
             is_driver_worker=is_driver_worker)
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.
-        self.cache_engine: CPUCacheEngine
-        self.cpu_cache: List[torch.Tensor]
+        self.cache_engine: List[CPUCacheEngine]
+        self.cpu_cache: List[List[torch.Tensor]]
 
     def init_device(self) -> None:
         self.init_distributed_environment()
@@ -242,25 +242,32 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
                 "initializing the engine.")
 
     def _init_cache_engine(self) -> None:
-        self.cache_engine = CPUCacheEngine(self.cache_config,
-                                           self.model_config,
-                                           self.parallel_config,
-                                           self.device_config)
-        self.cpu_cache = self.cache_engine.cpu_cache
-        self.model_runner.block_size = self.cache_engine.block_size
-
-        assert self.cpu_cache is not None
+        self.cache_engine = [
+            CPUCacheEngine(self.cache_config, self.model_config,
+                           self.parallel_config, self.device_config)
+            for _ in range(self.parallel_config.pipeline_parallel_size)
+        ]
+        self.cpu_cache = [
+            self.cache_engine[ve].cpu_cache
+            for ve in range(self.parallel_config.pipeline_parallel_size)
+        ]
+        self.model_runner.block_size = self.cache_engine[0].block_size
+
+        assert all(
+            self.cpu_cache[ve] is not None
+            for ve in range(self.parallel_config.pipeline_parallel_size))
 
         # Populate the cache to warmup the memory
-        for layer_cache in self.cpu_cache:
-            layer_cache.fill_(0)
+        for ve in range(self.parallel_config.pipeline_parallel_size):
+            for layer_cache in self.cpu_cache[ve]:
+                layer_cache.fill_(0)
 
     @property
     def do_metadata_broadcast(self) -> bool:
         return self.parallel_config.tensor_parallel_size > 1
 
     @property
-    def kv_cache(self) -> Optional[List[torch.Tensor]]:
+    def kv_cache(self) -> Optional[List[List[torch.Tensor]]]:
         return self.cpu_cache
 
     def execute_worker(
@@ -269,12 +276,14 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
     ) -> None:
         if (worker_input.blocks_to_copy is not None
                 and worker_input.blocks_to_copy.numel() > 0):
-            self.cache_engine.copy(worker_input.blocks_to_copy)
+            self.cache_engine[worker_input.virtual_engine].copy(
+                worker_input.blocks_to_copy)
 
     @torch.inference_mode()
     def prepare_worker_input(
             self, execute_model_req: ExecuteModelRequest) -> WorkerInput:
         assert execute_model_req is not None
+        virtual_engine = execute_model_req.virtual_engine
         num_seq_groups: int = len(execute_model_req.seq_group_metadata_list)
         blocks_to_copy = execute_model_req.blocks_to_copy
         blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy,
@@ -285,6 +294,7 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         return WorkerInput(
             num_seq_groups=num_seq_groups,
             blocks_to_copy=blocks_to_copy,
+            virtual_engine=virtual_engine,
         )
 
     def init_distributed_environment(self) -> None:
diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py
index 272917c72..faf6e99ab 100644
--- a/vllm/worker/embedding_model_runner.py
+++ b/vllm/worker/embedding_model_runner.py
@@ -9,7 +9,8 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
 from vllm.logger import init_logger
 from vllm.model_executor.pooling_metadata import PoolingMetadata
 from vllm.pooling_params import PoolingParams
-from vllm.sequence import PoolerOutput, SequenceData, SequenceGroupMetadata
+from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData,
+                           SequenceGroupMetadata)
 from vllm.worker.model_runner import GPUModelRunnerBase, ModelInputForGPU
 
 logger = init_logger(__name__)
@@ -57,6 +58,7 @@ class EmbeddingModelRunner(
         self,
         model_input: ModelInputForGPUWithPoolingMetadata,
         kv_caches: List[torch.Tensor],
+        intermediate_tensors: Optional[IntermediateTensors] = None,
         num_steps: int = 1,
     ) -> Optional[List[PoolerOutput]]:
         if num_steps > 1:
@@ -73,10 +75,12 @@ class EmbeddingModelRunner(
         assert model_input.attn_metadata is not None
         prefill_meta = model_input.attn_metadata.prefill_metadata
         decode_meta = model_input.attn_metadata.decode_metadata
+        virtual_engine = model_input.virtual_engine
         if prefill_meta is None and decode_meta.use_cuda_graph:
             assert model_input.input_tokens is not None
             graph_batch_size = model_input.input_tokens.shape[0]
-            model_executable = self.graph_runners[graph_batch_size]
+            model_executable = self.graph_runners[virtual_engine][
+                graph_batch_size]
         else:
             model_executable = self.model
 
@@ -115,6 +119,7 @@ class EmbeddingModelRunner(
     def prepare_model_input(
         self,
         seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+        virtual_engine: int = 0,
     ) -> ModelInputForGPUWithPoolingMetadata:
         assert seq_group_metadata_list is not None
         model_input = self._prepare_model_input_tensors(
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 0b20d5010..28b447c0d 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -8,6 +8,7 @@ from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Type,
 
 import numpy as np
 import torch
+import torch.distributed
 import torch.nn as nn
 
 try:
@@ -25,6 +26,7 @@ from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, ParallelConfig, SchedulerConfig,
                          VisionLanguageConfig)
+from vllm.distributed import get_pp_group
 from vllm.distributed.parallel_state import graph_capture
 from vllm.inputs import INPUT_REGISTRY
 from vllm.logger import init_logger
@@ -37,7 +39,8 @@ from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 from vllm.model_executor.models.interfaces import supports_lora
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.sequence import (IntermediateTensors, SamplerOutput,
+                           SequenceGroupMetadata)
 from vllm.utils import (CudaMemoryProfiler, get_kv_cache_torch_dtype, is_hip,
                         is_pin_memory_available, make_tensor_with_pad)
 from vllm.worker.model_runner_base import (
@@ -81,6 +84,7 @@ class ModelInputForGPU(ModelRunnerInputBase):
     lora_requests: Optional[Set[LoRARequest]] = None
     attn_metadata: Optional["AttentionMetadata"] = None
     multi_modal_kwargs: Optional[Dict[str, torch.Tensor]] = None
+    virtual_engine: int = 0
 
     def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
         tensor_dict = {
@@ -89,6 +93,7 @@ class ModelInputForGPU(ModelRunnerInputBase):
             "lora_requests": self.lora_requests,
             "lora_mapping": self.lora_mapping,
             "multi_modal_kwargs": self.multi_modal_kwargs,
+            "virtual_engine": self.virtual_engine,
         }
         _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata)
         return tensor_dict
@@ -122,6 +127,7 @@ class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU):
             "lora_requests": self.lora_requests,
             "lora_mapping": self.lora_mapping,
             "multi_modal_kwargs": self.multi_modal_kwargs,
+            "virtual_engine": self.virtual_engine,
         }
         _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata)
         _add_sampling_metadata_broadcastable_dict(tensor_dict,
@@ -179,7 +185,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         self.sliding_window = model_config.get_sliding_window()
         self.block_size = cache_config.block_size
         self.max_seq_len_to_capture = self.model_config.max_seq_len_to_capture
-        self.graph_runners: Dict[int, CUDAGraphRunner] = {}
+
+        self.graph_runners: List[Dict[int, CUDAGraphRunner]] = [
+            {} for _ in range(self.parallel_config.pipeline_parallel_size)
+        ]
         self.graph_memory_pool: Optional[Tuple[
             int, int]] = None  # Set during graph capture.
         # When using CUDA graph, the input block tables must be padded to
@@ -787,9 +796,11 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
             max_num_seqs = min(
                 max_num_seqs,
                 int(max_num_batched_tokens / vlm_config.image_feature_size))
+        batch_size = 0
         for group_id in range(max_num_seqs):
             seq_len = (max_num_batched_tokens // max_num_seqs +
                        (group_id < max_num_batched_tokens % max_num_seqs))
+            batch_size += seq_len
 
             seq_data, dummy_multi_modal_data = INPUT_REGISTRY \
                 .dummy_data_for_profiling(model_config, seq_len)
@@ -811,7 +822,13 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         num_layers = self.model_config.get_num_layers(self.parallel_config)
         kv_caches = [None] * num_layers
         model_input = self.prepare_model_input(seqs)
-        self.execute_model(model_input, kv_caches)
+        intermediate_tensors = None
+        if not get_pp_group().is_first_rank:
+            intermediate_tensors = self.model.make_empty_intermediate_tensors(
+                batch_size=batch_size,
+                dtype=self.model_config.dtype,
+                device=self.device)
+        self.execute_model(model_input, kv_caches, intermediate_tensors)
         torch.cuda.synchronize()
         return
 
@@ -847,7 +864,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         return self.lora_manager.list_loras()
 
     @torch.inference_mode()
-    def capture_model(self, kv_caches: List[torch.Tensor]) -> None:
+    def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
         """Cuda graph capture a model.
 
         Note that CUDA graph's performance gain is negligible if number
@@ -880,10 +897,18 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         slot_mapping.fill_(_PAD_SLOT_ID)
         seq_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda()
         block_tables = torch.from_numpy(self.graph_block_tables).cuda()
+        intermediate_inputs = None
+        if not get_pp_group().is_first_rank:
+            intermediate_inputs = self.model.make_empty_intermediate_tensors(
+                batch_size=max_batch_size,
+                dtype=self.model_config.dtype,
+                device=self.device)
 
         # Prepare buffer for outputs. These will be reused for all batch sizes.
         # It will be filled after the first graph capture.
-        hidden_states: Optional[torch.Tensor] = None
+        hidden_or_intermediate_states: List[Optional[torch.Tensor]] = [
+            None
+        ] * self.parallel_config.pipeline_parallel_size
 
         graph_batch_size = _get_graph_batch_size(
             self.scheduler_config.max_num_seqs)
@@ -912,109 +937,120 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         with graph_capture() as graph_capture_context:
             # NOTE: Capturing the largest batch size first may help reduce the
             # memory usage of CUDA graph.
-            for batch_size in reversed(batch_size_capture_list):
-                if self.attn_backend.get_name() == "flashinfer":
-                    indptr_buffer = indptr_buffer[:batch_size + 1]
-                    last_page_len_buffer = last_page_len_buffer[:batch_size]
-
-                    num_qo_heads = self.model_config.get_num_attention_heads(
-                        self.parallel_config)
-                    num_kv_heads = self.model_config.get_num_kv_heads(
-                        self.parallel_config)
-                    if num_qo_heads // num_kv_heads >= 4:
-                        use_tensor_cores = True
+            for virtual_engine in range(
+                    self.parallel_config.pipeline_parallel_size):
+                for batch_size in reversed(batch_size_capture_list):
+                    if self.attn_backend.get_name() == "flashinfer":
+                        indptr_buffer = indptr_buffer[:batch_size + 1]
+                        last_page_len_buffer = last_page_len_buffer[:
+                                                                    batch_size]
+
+                        num_qo_heads = (
+                            self.model_config.get_num_attention_heads(
+                                self.parallel_config))
+                        num_kv_heads = self.model_config.get_num_kv_heads(
+                            self.parallel_config)
+                        if num_qo_heads // num_kv_heads >= 4:
+                            use_tensor_cores = True
+                        else:
+                            use_tensor_cores = False
+                        decode_wrapper = \
+                            CUDAGraphBatchDecodeWithPagedKVCacheWrapper(
+                            decode_workspace_buffer, indptr_buffer,
+                            indices_buffer, last_page_len_buffer, "NHD",
+                            use_tensor_cores)
+                        kv_cache_dtype = get_kv_cache_torch_dtype(
+                            self.kv_cache_dtype, self.model_config.dtype)
+
+                        paged_kv_indptr_tensor_host = torch.arange(
+                            0, batch_size + 1, dtype=torch.int32)
+                        paged_kv_indices_tensor_host = torch.arange(
+                            0, batch_size, dtype=torch.int32)
+                        paged_kv_last_page_len_tensor_host = torch.full(
+                            (batch_size, ), self.block_size, dtype=torch.int32)
+                        query_start_loc_host = torch.arange(0,
+                                                            batch_size + 1,
+                                                            dtype=torch.int32)
+
+                        attn_metadata = self.attn_backend.make_metadata(
+                            num_prefills=0,
+                            slot_mapping=slot_mapping[:batch_size],
+                            num_prefill_tokens=0,
+                            num_decode_tokens=batch_size,
+                            max_prefill_seq_len=0,
+                            block_tables=block_tables,
+                            paged_kv_indptr=paged_kv_indptr_tensor_host,
+                            paged_kv_indices=paged_kv_indices_tensor_host,
+                            paged_kv_last_page_len=
+                            paged_kv_last_page_len_tensor_host,
+                            num_qo_heads=num_qo_heads,
+                            num_kv_heads=num_kv_heads,
+                            head_dim=self.model_config.get_head_size(),
+                            page_size=self.block_size,
+                            seq_start_loc=None,
+                            query_start_loc=query_start_loc_host,
+                            device=self.device,
+                            data_type=kv_cache_dtype,
+                            use_cuda_graph=True,
+                            decode_wrapper=decode_wrapper,
+                            prefill_wrapper=None)
+                        attn_metadata.begin_forward()
                     else:
-                        use_tensor_cores = False
-                    decode_wrapper = \
-                        CUDAGraphBatchDecodeWithPagedKVCacheWrapper(
-                        decode_workspace_buffer, indptr_buffer, indices_buffer,
-                        last_page_len_buffer, "NHD", use_tensor_cores)
-                    kv_cache_dtype = get_kv_cache_torch_dtype(
-                        self.kv_cache_dtype, self.model_config.dtype)
-
-                    paged_kv_indptr_tensor_host = torch.arange(
-                        0, batch_size + 1, dtype=torch.int32)
-                    paged_kv_indices_tensor_host = torch.arange(
-                        0, batch_size, dtype=torch.int32)
-                    paged_kv_last_page_len_tensor_host = torch.full(
-                        (batch_size, ), self.block_size, dtype=torch.int32)
-                    query_start_loc_host = torch.arange(0,
-                                                        batch_size + 1,
-                                                        dtype=torch.int32)
-
-                    attn_metadata = self.attn_backend.make_metadata(
-                        num_prefills=0,
-                        slot_mapping=slot_mapping[:batch_size],
-                        num_prefill_tokens=0,
-                        num_decode_tokens=batch_size,
-                        max_prefill_seq_len=0,
-                        block_tables=block_tables,
-                        paged_kv_indptr=paged_kv_indptr_tensor_host,
-                        paged_kv_indices=paged_kv_indices_tensor_host,
-                        paged_kv_last_page_len=
-                        paged_kv_last_page_len_tensor_host,
-                        num_qo_heads=num_qo_heads,
-                        num_kv_heads=num_kv_heads,
-                        head_dim=self.model_config.get_head_size(),
-                        page_size=self.block_size,
-                        seq_start_loc=None,
-                        query_start_loc=query_start_loc_host,
-                        device=self.device,
-                        data_type=kv_cache_dtype,
-                        use_cuda_graph=True,
-                        decode_wrapper=decode_wrapper,
-                        prefill_wrapper=None)
-                    attn_metadata.begin_forward()
-                else:
-                    attn_metadata = self.attn_backend.make_metadata(
-                        num_prefills=0,
-                        num_prefill_tokens=0,
-                        num_decode_tokens=batch_size,
-                        slot_mapping=slot_mapping[:batch_size],
-                        seq_lens=None,
-                        seq_lens_tensor=seq_lens[:batch_size],
-                        max_query_len=None,
-                        max_prefill_seq_len=0,
-                        max_decode_seq_len=self.max_seq_len_to_capture,
-                        query_start_loc=None,
-                        seq_start_loc=None,
-                        context_lens_tensor=None,
-                        block_tables=block_tables[:batch_size],
-                        use_cuda_graph=True,
+                        attn_metadata = self.attn_backend.make_metadata(
+                            num_prefills=0,
+                            num_prefill_tokens=0,
+                            num_decode_tokens=batch_size,
+                            slot_mapping=slot_mapping[:batch_size],
+                            seq_lens=None,
+                            seq_lens_tensor=seq_lens[:batch_size],
+                            max_query_len=None,
+                            max_prefill_seq_len=0,
+                            max_decode_seq_len=self.max_seq_len_to_capture,
+                            query_start_loc=None,
+                            seq_start_loc=None,
+                            context_lens_tensor=None,
+                            block_tables=block_tables[:batch_size],
+                            use_cuda_graph=True,
+                        )
+
+                    if self.lora_config:
+                        lora_mapping = LoRAMapping(
+                            [0] * batch_size,
+                            [0] * batch_size,
+                        )
+                        self.set_active_loras(set(), lora_mapping)
+
+                    graph_runner = CUDAGraphRunner(
+                        self.model, self.attn_backend.get_name())
+
+                    if self.attn_backend.get_name() == "flashinfer":
+                        graph_runner.flashinfer_indptr_buffer = indptr_buffer
+                        graph_runner.flashinfer_indices_buffer = indices_buffer
+                        graph_runner.flashinfer_last_page_len_buffer = \
+                            last_page_len_buffer
+                        graph_runner.flashinfer_decode_workspace_buffer = \
+                                decode_workspace_buffer
+                        graph_runner.flashinfer_decode_wrapper = \
+                            decode_wrapper
+
+                    graph_runner.capture(
+                        input_tokens[:batch_size],
+                        input_positions[:batch_size],
+                        hidden_or_intermediate_states[
+                            virtual_engine]  # type: ignore
+                        [:batch_size]
+                        if hidden_or_intermediate_states[virtual_engine]
+                        is not None else None,
+                        intermediate_inputs[:batch_size]
+                        if intermediate_inputs is not None else None,
+                        kv_caches[virtual_engine],
+                        attn_metadata,
+                        memory_pool=self.graph_memory_pool,
+                        stream=graph_capture_context.stream,
                     )
-
-                if self.lora_config:
-                    lora_mapping = LoRAMapping(
-                        [0] * batch_size,
-                        [0] * batch_size,
-                    )
-                    self.set_active_loras(set(), lora_mapping)
-
-                graph_runner = CUDAGraphRunner(self.model,
-                                               self.attn_backend.get_name())
-
-                if self.attn_backend.get_name() == "flashinfer":
-                    graph_runner.flashinfer_indptr_buffer = indptr_buffer
-                    graph_runner.flashinfer_indices_buffer = indices_buffer
-                    graph_runner.flashinfer_last_page_len_buffer = \
-                        last_page_len_buffer
-                    graph_runner.flashinfer_decode_workspace_buffer = \
-                            decode_workspace_buffer
-                    graph_runner.flashinfer_decode_wrapper = \
-                        decode_wrapper
-
-                graph_runner.capture(
-                    input_tokens[:batch_size],
-                    input_positions[:batch_size],
-                    hidden_states[:batch_size]
-                    if hidden_states is not None else None,
-                    kv_caches,
-                    attn_metadata,
-                    memory_pool=self.graph_memory_pool,
-                    stream=graph_capture_context.stream,
-                )
-                self.graph_memory_pool = graph_runner.graph.pool()
-                self.graph_runners[batch_size] = graph_runner
+                    self.graph_memory_pool = graph_runner.graph.pool()
+                    self.graph_runners[virtual_engine][batch_size] = (
+                        graph_runner)
 
         end_time = time.perf_counter()
         elapsed_time = end_time - start_time
@@ -1047,6 +1083,7 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
     def prepare_model_input(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
+        virtual_engine: int = 0,
     ) -> ModelInputForGPUWithSamplingMetadata:
         """Prepare the model input based on a given sequence group, including
         metadata for the sampling step.
@@ -1072,15 +1109,17 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
                      if seq_group_metadata_list else None)
         return dataclasses.replace(model_input,
                                    sampling_metadata=sampling_metadata,
-                                   is_prompt=is_prompt)
+                                   is_prompt=is_prompt,
+                                   virtual_engine=virtual_engine)
 
     @torch.inference_mode()
     def execute_model(
         self,
         model_input: ModelInputForGPUWithSamplingMetadata,
         kv_caches: List[torch.Tensor],
+        intermediate_tensors: Optional[IntermediateTensors] = None,
         num_steps: int = 1,
-    ) -> Optional[List[SamplerOutput]]:
+    ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]:
         if num_steps > 1:
             raise ValueError("num_steps > 1 is not supported in ModelRunner")
 
@@ -1124,27 +1163,34 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
         assert model_input.attn_metadata is not None
         prefill_meta = model_input.attn_metadata.prefill_metadata
         decode_meta = model_input.attn_metadata.decode_metadata
+        # TODO(andoorve): We can remove this once all
+        # virtual engines share the same kv cache.
+        virtual_engine = model_input.virtual_engine
         if prefill_meta is None and decode_meta.use_cuda_graph:
             assert model_input.input_tokens is not None
             graph_batch_size = model_input.input_tokens.shape[0]
-            model_executable = self.graph_runners[graph_batch_size]
+            model_executable = self.graph_runners[virtual_engine][
+                graph_batch_size]
         else:
             model_executable = self.model
 
         multi_modal_kwargs = model_input.multi_modal_kwargs or {}
-        hidden_states = model_executable(
+        hidden_or_intermediate_states = model_executable(
             input_ids=model_input.input_tokens,
             positions=model_input.input_positions,
             kv_caches=kv_caches,
             attn_metadata=model_input.attn_metadata,
+            intermediate_tensors=intermediate_tensors,
             **multi_modal_kwargs,
         )
 
-        # Compute the logits.
-        logits = self.model.compute_logits(hidden_states,
+        # Compute the logits in the last pipeline stage.
+        if not get_pp_group().is_last_rank:
+            return hidden_or_intermediate_states
+
+        logits = self.model.compute_logits(hidden_or_intermediate_states,
                                            model_input.sampling_metadata)
 
-        # Only perform sampling in the driver worker.
         if not self.is_driver_worker:
             return []
 
@@ -1159,9 +1205,12 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
             assert model_input.sampling_metadata is not None
             indices = model_input.sampling_metadata.selected_token_indices
             if model_input.is_prompt:
-                hidden_states = hidden_states.index_select(0, indices)
+                hidden_states = hidden_or_intermediate_states.index_select(
+                    0, indices)
             elif decode_meta.use_cuda_graph:
-                hidden_states = hidden_states[:len(indices)]
+                hidden_states = hidden_or_intermediate_states[:len(indices)]
+            else:
+                hidden_states = hidden_or_intermediate_states
 
             output.hidden_states = hidden_states
 
@@ -1195,13 +1244,15 @@ class CUDAGraphRunner:
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        hidden_states: Optional[torch.Tensor],
+        hidden_or_intermediate_states: Optional[Union[IntermediateTensors,
+                                                      torch.Tensor]],
+        intermediate_inputs: Optional[IntermediateTensors],
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
         memory_pool: Optional[Tuple[int, int]],
         stream: torch.cuda.Stream,
         **kwargs,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, IntermediateTensors]:
         assert self._graph is None
         # Run the model a few times without capturing the graph.
         # This is to make sure that the captured graph does not include the
@@ -1213,6 +1264,7 @@ class CUDAGraphRunner:
                 positions,
                 kv_caches,
                 attn_metadata,
+                intermediate_inputs,
                 **kwargs,
             )
         torch.cuda.synchronize()
@@ -1220,18 +1272,27 @@ class CUDAGraphRunner:
         # Capture the graph.
         self._graph = torch.cuda.CUDAGraph()
         with torch.cuda.graph(self._graph, pool=memory_pool, stream=stream):
-            output_hidden_states = self.model(
+            output_hidden_or_intermediate_states = self.model(
                 input_ids,
                 positions,
                 kv_caches,
                 attn_metadata,
+                intermediate_inputs,
                 **kwargs,
             )
-            if hidden_states is not None:
-                hidden_states.copy_(output_hidden_states)
+            if hidden_or_intermediate_states is not None:
+                if get_pp_group().is_last_rank:
+                    hidden_or_intermediate_states.copy_(
+                        output_hidden_or_intermediate_states)
+                else:
+                    for key in hidden_or_intermediate_states.tensors:
+                        hidden_or_intermediate_states[key].copy_(
+                            output_hidden_or_intermediate_states[key])
             else:
-                hidden_states = output_hidden_states
-            del output_hidden_states
+                hidden_or_intermediate_states = (
+                    output_hidden_or_intermediate_states)
+
+            del output_hidden_or_intermediate_states
             # make sure `output_hidden_states` is deleted
             # in the graph's memory pool
             gc.collect()
@@ -1255,8 +1316,15 @@ class CUDAGraphRunner:
                 attn_metadata.decode_metadata.seq_lens_tensor,
                 "block_tables": attn_metadata.decode_metadata.block_tables,
             }
-        self.output_buffers = {"hidden_states": hidden_states}
-        return hidden_states
+        if intermediate_inputs is not None:
+            self.input_buffers.update(intermediate_inputs.tensors)
+        if get_pp_group().is_last_rank:
+            self.output_buffers = {
+                "hidden_states": hidden_or_intermediate_states
+            }
+        else:
+            self.output_buffers = hidden_or_intermediate_states
+        return hidden_or_intermediate_states
 
     def forward(
         self,
@@ -1264,6 +1332,7 @@ class CUDAGraphRunner:
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors],
         **kwargs,
     ) -> torch.Tensor:
         # KV caches are fixed tensors, so we don't need to copy them.
@@ -1280,11 +1349,18 @@ class CUDAGraphRunner:
                 non_blocking=True)
             self.input_buffers["block_tables"].copy_(
                 attn_metadata.decode_metadata.block_tables, non_blocking=True)
+        if intermediate_tensors is not None:
+            for key in intermediate_tensors.tensors:
+                self.input_buffers[key].copy_(intermediate_tensors[key],
+                                              non_blocking=True)
         # Run the graph.
         self.graph.replay()
 
         # Return the output tensor.
-        return self.output_buffers["hidden_states"]
+        if get_pp_group().is_last_rank:
+            return self.output_buffers["hidden_states"]
+
+        return self.output_buffers
 
     def __call__(self, *args, **kwargs):
         return self.forward(*args, **kwargs)
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py
index 959cfc0b9..f66bb4662 100644
--- a/vllm/worker/model_runner_base.py
+++ b/vllm/worker/model_runner_base.py
@@ -5,7 +5,8 @@ from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Type,
 
 import torch
 
-from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.sequence import (IntermediateTensors, SamplerOutput,
+                           SequenceGroupMetadata)
 
 if TYPE_CHECKING:
     from vllm.attention import AttentionMetadata
@@ -137,6 +138,7 @@ class ModelRunnerBase(ABC, Generic[T]):
     def prepare_model_input(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
+        virtual_engine: int = 0,
     ) -> T:
         """
         Prepare the inputs to ModelRunnerBase.execute_model from an execution
@@ -150,6 +152,7 @@ class ModelRunnerBase(ABC, Generic[T]):
         self,
         model_input: T,
         kv_caches: Optional[List[torch.Tensor]],
+        intermediate_tensors: Optional[IntermediateTensors],
         num_steps: int = 1,
     ) -> Optional[List[SamplerOutput]]:
         """
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index 2ccf4a50a..ab8e48528 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -9,7 +9,8 @@ from vllm.config import (DeviceConfig, ModelConfig, ParallelConfig,
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader.neuron import get_neuron_model
-from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.sequence import (IntermediateTensors, SamplerOutput,
+                           SequenceGroupMetadata)
 from vllm.utils import is_pin_memory_available, make_tensor_with_pad
 from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase
 
@@ -175,6 +176,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
     def prepare_model_input(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
+        virtual_engine: int = 0,
     ) -> ModelInputForNeuron:
         # NOTE: We assume that all sequences in the group are all prompts or
         # all decodes.
@@ -207,6 +209,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
         self,
         model_input: ModelInputForNeuron,
         kv_caches: Optional[List[torch.Tensor]] = None,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
         num_steps: int = 1,
     ) -> Optional[List[SamplerOutput]]:
         if num_steps > 1:
diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
index 307c107dd..f7525e049 100644
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -80,7 +80,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         return False
 
     @property
-    def kv_cache(self) -> Optional[List[torch.Tensor]]:
+    def kv_cache(self) -> Optional[List[List[torch.Tensor]]]:
         return None
 
     @torch.inference_mode()
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index cc27d06b5..5b5728290 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -59,9 +59,9 @@ class Worker(LocalOrDistributedWorkerBase):
         self.lora_config = lora_config
         self.load_config = load_config
         self.is_driver_worker = is_driver_worker
-        if self.is_driver_worker:
-            assert self.rank == 0, "The driver worker must have rank 0."
-
+        if parallel_config and is_driver_worker:
+            assert rank % parallel_config.tensor_parallel_size == 0, \
+                   "Driver worker should be rank 0 of tensor parallel group."
         if self.model_config.trust_remote_code:
             # note: lazy import to avoid importing torch before initializing
             from vllm.utils import init_cached_hf_modules
@@ -99,9 +99,9 @@ class Worker(LocalOrDistributedWorkerBase):
         )
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.
-        self.cache_engine: CacheEngine
+        self.cache_engine: List[CacheEngine]
         # Initialize gpu_cache as embedding models don't initialize kv_caches
-        self.gpu_cache: Optional[List[torch.tensor]] = None
+        self.gpu_cache: Optional[List[List[torch.tensor]]] = None
 
     def init_device(self) -> None:
         if self.device_config.device.type == "cuda":
@@ -217,10 +217,15 @@ class Worker(LocalOrDistributedWorkerBase):
 
     def _init_cache_engine(self):
         assert self.cache_config.num_gpu_blocks is not None
-        self.cache_engine = CacheEngine(self.cache_config, self.model_config,
-                                        self.parallel_config,
-                                        self.device_config)
-        self.gpu_cache = self.cache_engine.gpu_cache
+        self.cache_engine = [
+            CacheEngine(self.cache_config, self.model_config,
+                        self.parallel_config, self.device_config)
+            for _ in range(self.parallel_config.pipeline_parallel_size)
+        ]
+        self.gpu_cache = [
+            self.cache_engine[ve].gpu_cache
+            for ve in range(self.parallel_config.pipeline_parallel_size)
+        ]
 
     def _warm_up_model(self) -> None:
         if not self.model_config.enforce_eager:
@@ -234,12 +239,13 @@ class Worker(LocalOrDistributedWorkerBase):
         return self.parallel_config.tensor_parallel_size > 1
 
     @property
-    def kv_cache(self) -> Optional[List[torch.Tensor]]:
+    def kv_cache(self) -> Optional[List[List[torch.Tensor]]]:
         return self.gpu_cache
 
     @torch.inference_mode()
     def prepare_worker_input(
             self, execute_model_req: ExecuteModelRequest) -> WorkerInput:
+        virtual_engine = execute_model_req.virtual_engine
         num_seq_groups = len(execute_model_req.seq_group_metadata_list)
         # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors.
         # they contain parameters to launch cudamemcpyasync.
@@ -261,20 +267,24 @@ class Worker(LocalOrDistributedWorkerBase):
             blocks_to_swap_in=blocks_to_swap_in,
             blocks_to_swap_out=blocks_to_swap_out,
             blocks_to_copy=blocks_to_copy,
+            virtual_engine=virtual_engine,
         )
 
     @torch.inference_mode()
     def execute_worker(self, worker_input: WorkerInput) -> None:
+        virtual_engine = worker_input.virtual_engine
         # Issue cache operations.
         if (worker_input.blocks_to_swap_in is not None
                 and worker_input.blocks_to_swap_in.numel() > 0):
-            self.cache_engine.swap_in(worker_input.blocks_to_swap_in)
+            self.cache_engine[virtual_engine].swap_in(
+                worker_input.blocks_to_swap_in)
         if (worker_input.blocks_to_swap_out is not None
                 and worker_input.blocks_to_swap_out.numel() > 0):
-            self.cache_engine.swap_out(worker_input.blocks_to_swap_out)
+            self.cache_engine[virtual_engine].swap_out(
+                worker_input.blocks_to_swap_out)
         if (worker_input.blocks_to_copy is not None
                 and worker_input.blocks_to_copy.numel() > 0):
-            self.cache_engine.copy(worker_input.blocks_to_copy)
+            self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy)
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.model_runner.add_lora(lora_request)
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index d867e15bd..118173a4c 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -6,10 +6,11 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
 
 import torch
 
-from vllm.distributed import broadcast_tensor_dict
+from vllm.distributed import broadcast_tensor_dict, get_pp_group
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
+                           SamplerOutput)
 from vllm.utils import (enable_trace_function_call_for_thread, is_hip,
                         update_environment_variables)
 from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase
@@ -124,6 +125,7 @@ class WorkerInput:
     blocks_to_swap_in: Optional[torch.Tensor] = None
     blocks_to_swap_out: Optional[torch.Tensor] = None
     blocks_to_copy: Optional[torch.Tensor] = None
+    virtual_engine: int = 0
 
     @classmethod
     def from_broadcasted_tensor_dict(
@@ -139,6 +141,7 @@ class WorkerInput:
             blocks_to_swap_in=tensor_dict.pop("blocks_to_swap_in"),
             blocks_to_swap_out=tensor_dict.pop("blocks_to_swap_out"),
             blocks_to_copy=tensor_dict.pop("blocks_to_copy"),
+            virtual_engine=tensor_dict["virtual_engine"],
         )
 
     def as_broadcastable_tensor_dict(
@@ -151,6 +154,7 @@ class WorkerInput:
             "blocks_to_swap_in": self.blocks_to_swap_in,
             "blocks_to_swap_out": self.blocks_to_swap_out,
             "blocks_to_copy": self.blocks_to_copy,
+            "virtual_engine": self.virtual_engine,
         }
 
         return tensor_dict
@@ -181,11 +185,13 @@ class LocalOrDistributedWorkerBase(WorkerBase):
 
     @property
     @abstractmethod
-    def kv_cache(self) -> Optional[List[torch.Tensor]]:
+    def kv_cache(self) -> Optional[List[List[torch.Tensor]]]:
         """
-        Get the kv cache to pass to the worker's model runner. Used by the
-        default `execute_model`. If the worker's model runner does not follow
-        the ModelRunnerBase interface, then inherit from WorkerBase instead.
+        Gets the list of kv caches to pass to the worker's model runner. Each
+        element in the list is a kv cache corresponding to a particular virtual
+        engine (PP stream). Used by the default `execute_model`. If the worker's
+        model runner does not follow the ModelRunnerBase interface, then inherit
+        from WorkerBase instead.
         """
         raise NotImplementedError
 
@@ -227,7 +233,8 @@ class LocalOrDistributedWorkerBase(WorkerBase):
                 execute_model_req=execute_model_req)
             model_input: ModelRunnerInputBase = (
                 self.model_runner.prepare_model_input(
-                    execute_model_req.seq_group_metadata_list))
+                    execute_model_req.seq_group_metadata_list,
+                    execute_model_req.virtual_engine))
             num_steps = execute_model_req.num_steps
 
             if self.do_metadata_broadcast:
@@ -255,8 +262,23 @@ class LocalOrDistributedWorkerBase(WorkerBase):
         if worker_input.num_seq_groups == 0:
             return []
 
-        return self.model_runner.execute_model(model_input, self.kv_cache,
-                                               num_steps)
+        intermediate_tensors = None
+        if not get_pp_group().is_first_rank:
+            intermediate_tensors = IntermediateTensors(
+                get_pp_group().recv_tensor_dict())
+
+        output = self.model_runner.execute_model(
+            model_input, self.kv_cache[worker_input.virtual_engine]
+            if self.kv_cache is not None else None, intermediate_tensors,
+            num_steps)
+
+        if not get_pp_group().is_last_rank:
+            get_pp_group().send_tensor_dict(output.tensors)
+            return [None]
+
+        # Worker only supports single-step execution. Wrap the output in a
+        # list to conform to interface.
+        return output
 
 
 class WorkerWrapperBase:
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 99fd7da5e..73b771c43 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -12,7 +12,8 @@ from vllm.distributed import broadcast_tensor_dict
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
+from vllm.sequence import (IntermediateTensors, SamplerOutput, SequenceData,
+                           SequenceGroupMetadata)
 from vllm.utils import CudaMemoryProfiler, make_tensor_with_pad
 from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata
 from vllm.worker.model_runner_base import (
@@ -190,6 +191,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
     def prepare_model_input(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
+        virtual_engine: int = 0,
     ) -> ModelInputForXPU:
         multi_modal_input = None
         if self.is_driver_worker:
@@ -334,6 +336,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
         self,
         model_input: ModelInputForXPU,
         kv_caches: List[torch.Tensor],
+        intermediate_tensors: Optional[IntermediateTensors] = None,
         num_steps: int = 1,
     ) -> Optional[List[SamplerOutput]]:
         if num_steps > 1:
diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
index 773ee9f81..7a51f2b2c 100644
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -85,8 +85,8 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
         )
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.
-        self.cache_engine: CacheEngine
-        self.gpu_cache: List[torch.Tensor]
+        self.cache_engine: List[CacheEngine]
+        self.gpu_cache: Optional[List[List[torch.Tensor]]]
 
     def init_device(self) -> None:
         if self.device_config.device.type == "xpu" and is_xpu():
-- 
GitLab


From 4d26d806e1eeff9f90351008e8e36bcf0d35678f Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Tue, 2 Jul 2024 16:14:22 -0400
Subject: [PATCH 240/376] Update conftest.py (#6076)

---
 tests/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index c3bd78263..fd088d566 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -65,7 +65,7 @@ class _ImageAssetPrompts(TypedDict):
     cherry_blossom: str
 
 
-class _ImageAssets(UserList[ImageAsset]):
+class _ImageAssets(UserList):
 
     def __init__(self) -> None:
         super().__init__(
-- 
GitLab


From 7c008c51a9aa4c0d53d09ab3a1ba61ecec354565 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Tue, 2 Jul 2024 17:54:35 -0400
Subject: [PATCH 241/376] [ Misc ] Refactor MoE to isolate Fp8 From Mixtral
 (#5970)

Co-authored-by: Robert Shaw <rshaw@neuralmagic>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
---
 ...xtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml |  11 +
 .../Mixtral-8x7B-Instruct-v0.1-FP8.yaml       |  11 +
 .../configs/Qwen2-57B-A14-Instruct.yaml       |  11 +
 .../lm-eval-harness/configs/models-large.txt  |   1 +
 tests/kernels/test_moe.py                     |   4 +-
 .../layers/fused_moe/__init__.py              |   4 +
 vllm/model_executor/layers/fused_moe/layer.py | 197 +++++++++++++
 .../model_executor/layers/quantization/fp8.py | 192 +++++++++++-
 vllm/model_executor/models/mixtral.py         | 276 +++---------------
 vllm/model_executor/models/qwen2_moe.py       | 136 ++++-----
 10 files changed, 537 insertions(+), 306 deletions(-)
 create mode 100644 .buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
 create mode 100644 .buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
 create mode 100644 .buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
 create mode 100644 vllm/model_executor/layers/fused_moe/layer.py

diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
new file mode 100644
index 000000000..75a24e408
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
@@ -0,0 +1,11 @@
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
+model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.86
+  - name: "exact_match,flexible-extract"
+    value: 0.86
+limit: 250
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
new file mode 100644
index 000000000..436ec2192
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
@@ -0,0 +1,11 @@
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
+model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.624
+  - name: "exact_match,flexible-extract"
+    value: 0.624
+limit: 250
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
new file mode 100644
index 000000000..45d5efc88
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
@@ -0,0 +1,11 @@
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
+model_name: "Qwen/Qwen2-57B-A14B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.792
+  - name: "exact_match,flexible-extract"
+    value: 0.824
+limit: 250
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt
index 127ec5d97..2007dd2e1 100644
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@@ -1,2 +1,3 @@
 Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
+Qwen2-57B-A14-Instruct.yaml
diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index 22b6769ac..2f9eee420 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -77,8 +77,8 @@ def test_mixtral_moe(dtype: torch.dtype):
     for i in range(config.num_local_experts):
         weights = (hf_moe.experts[i].w1.weight.data,
                    hf_moe.experts[i].w3.weight.data)
-        vllm_moe.w13_weight[i][:] = torch.cat(weights, dim=0)
-        vllm_moe.w2_weight[i][:] = hf_moe.experts[i].w2.weight.data
+        vllm_moe.experts.w13_weight[i][:] = torch.cat(weights, dim=0)
+        vllm_moe.experts.w2_weight[i][:] = hf_moe.experts[i].w2.weight.data
 
     # Generate input batch of dimensions [batch_size, seq_len, hidden_dim]
     hf_inputs = torch.randn((1, 64, config.hidden_size)).to(dtype).to("cuda")
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index 1dafae503..db837231c 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,5 +1,7 @@
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_experts, fused_moe, fused_topk, get_config_file_name, grouped_topk)
+from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
+                                                        FusedMoEMethodBase)
 
 __all__ = [
     "fused_moe",
@@ -7,4 +9,6 @@ __all__ = [
     "fused_experts",
     "get_config_file_name",
     "grouped_topk",
+    "FusedMoE",
+    "FusedMoEMethodBase",
 ]
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
new file mode 100644
index 000000000..73cfcd7fc
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -0,0 +1,197 @@
+from abc import abstractmethod
+from typing import Optional
+
+import torch
+
+from vllm.distributed import (get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size,
+                              tensor_model_parallel_all_reduce)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe.fused_moe import fused_moe
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig, QuantizeMethodBase)
+from vllm.model_executor.utils import set_weight_attrs
+
+logger = init_logger(__name__)
+
+
+class FusedMoEMethodBase(QuantizeMethodBase):
+
+    @abstractmethod
+    def create_weights(self, layer: torch.nn.Module, num_experts: int,
+                       hidden_size: int, intermediate_size: int,
+                       params_dtype: torch.dtype, **extra_weight_attrs):
+        raise NotImplementedError
+
+    @abstractmethod
+    def apply(self,
+              layer: torch.nn.Module,
+              x: torch.Tensor,
+              router_logits: torch.Tensor,
+              top_k: int,
+              renormalize: bool = True) -> torch.Tensor:
+        raise NotImplementedError
+
+
+class UnquantizedFusedMoEMethod(FusedMoEMethodBase):
+    """MoE method without quantization."""
+
+    def create_weights(self, layer: torch.nn.Module, num_experts: int,
+                       hidden_size: int, intermediate_size: int,
+                       params_dtype: torch.dtype, **extra_weight_attrs):
+
+        # Fused gate_up_proj (column parallel)
+        w13_weight = torch.nn.Parameter(torch.empty(num_experts,
+                                                    2 * intermediate_size,
+                                                    hidden_size,
+                                                    dtype=params_dtype),
+                                        requires_grad=False)
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        # down_proj (row parallel)
+        w2_weight = torch.nn.Parameter(torch.empty(num_experts,
+                                                   hidden_size,
+                                                   intermediate_size,
+                                                   dtype=params_dtype),
+                                       requires_grad=False)
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+    def apply(self,
+              layer: torch.nn.Module,
+              x: torch.Tensor,
+              router_logits: torch.Tensor,
+              top_k: int,
+              renormalize: bool = True) -> torch.Tensor:
+
+        return fused_moe(x,
+                         layer.w13_weight,
+                         layer.w2_weight,
+                         router_logits,
+                         top_k,
+                         renormalize=renormalize,
+                         inplace=True)
+
+
+class FusedMoE(torch.nn.Module):
+    """FusedMoE layer for MoE models.
+
+    This layer contains both MergedColumnParallel weights (gate_up_proj / 
+    w13) and RowParallelLinear weights (down_proj/ w2).
+
+    Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We
+    copy that naming convention here and handle any remapping in the
+    load_weights function in each model implementation.
+
+    Args:
+        num_experts: Number of experts in the model
+        top_k: Number of experts selected for each token
+        hidden_size: Input hidden state size of the transformer
+        intermediate_size: Intermediate size of the experts
+        params_dtype: Data type for the parameters.
+        reduce_results: Whether to all all_reduce on the output of the layer
+        renomalize: Whether to renormalize the logits in the fused_moe kernel
+        quant_config: Quantization configure.
+    """
+
+    def __init__(
+        self,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        params_dtype: Optional[torch.dtype] = None,
+        reduce_results: bool = False,
+        renormalize: bool = True,
+        quant_config: Optional[QuantizationConfig] = None,
+        tp_size: Optional[int] = None,
+    ):
+        super().__init__()
+
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+
+        self.tp_size = (tp_size if tp_size is not None else
+                        get_tensor_model_parallel_world_size())
+        self.top_k = top_k
+        self.num_experts = num_experts
+        self.intermediate_size_per_partition = intermediate_size // self.tp_size
+        self.reduce_results = reduce_results
+        self.renormalize = renormalize
+
+        if quant_config is None:
+            self.quant_method: Optional[QuantizeMethodBase] = (
+                UnquantizedFusedMoEMethod())
+        else:
+            self.quant_method = quant_config.get_quant_method(self)
+        assert self.quant_method is not None
+
+        self.quant_method.create_weights(
+            layer=self,
+            num_experts=num_experts,
+            hidden_size=hidden_size,
+            intermediate_size=self.intermediate_size_per_partition,
+            params_dtype=params_dtype,
+            weight_loader=self.weight_loader)
+
+    def weight_loader(self, param: torch.nn.Parameter,
+                      loaded_weight: torch.Tensor, weight_name: str,
+                      shard_id: int, expert_id: int):
+        param_data = param.data
+
+        # FIXME(robertgshaw2-neuralmagic): Overfit to Mixtral.
+        # Follow up PR to enable fp8 for other MoE models.
+        if "input_scale" in weight_name or "w2.weight_scale" in weight_name:
+            if param_data[expert_id] != 1 and (param_data[expert_id] -
+                                               loaded_weight).abs() > 1e-5:
+                raise ValueError(
+                    "input_scales of w1 and w3 of a layer "
+                    f"must be equal. But got {param_data[expert_id]} "
+                    f"vs. {loaded_weight}")
+            param_data[expert_id] = loaded_weight
+        # FIXME(robertgshaw2-neuralmagic): Overfit to Mixtral.
+        # Follow up PR to enable fp8 for other MoE models.
+        elif "weight_scale" in weight_name:
+            # We have to keep the weight scales of w1 and w3 because
+            # we need to re-quantize w1/w3 weights after weight loading.
+            assert "w1" in weight_name or "w3" in weight_name
+            shard_id = 0 if "w1" in weight_name else 1
+            param_data[expert_id][shard_id] = loaded_weight
+        else:
+            tp_rank = get_tensor_model_parallel_rank()
+            shard_size = self.intermediate_size_per_partition
+            shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size)
+
+            # w1, gate_proj case: Load into first shard of w13.
+            if shard_id == 0:
+                param_data[expert_id,
+                           0:shard_size, :] = loaded_weight[shard, :]
+            # w3, up_proj case: Load into second shard of w13.
+            elif shard_id == 2:
+                param_data[expert_id, shard_size:2 *
+                           shard_size, :] = loaded_weight[shard, :]
+            # w2, down_proj case: Load into only shard of w2.
+            elif shard_id == 1:
+                param_data[expert_id, :, :] = loaded_weight[:, shard]
+            else:
+                raise ValueError(
+                    f"Shard id must be in [0,1,2] but got {shard_id}")
+
+    def forward(self, hidden_states: torch.Tensor,
+                router_logits: torch.Tensor):
+        assert self.quant_method is not None
+
+        # Matrix multiply.
+        final_hidden_states = self.quant_method.apply(
+            self,
+            x=hidden_states,
+            router_logits=router_logits,
+            top_k=self.top_k,
+            renormalize=self.renormalize)
+
+        if self.reduce_results and self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(
+                final_hidden_states)
+
+        return final_hidden_states
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 5d503a221..dc2ca35c6 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -6,6 +6,8 @@ from torch.nn.parameter import Parameter
 
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
+                                                  fused_moe)
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
@@ -71,7 +73,9 @@ class Fp8Config(QuantizationConfig):
 
         if isinstance(layer, LinearBase):
             return Fp8LinearMethod(self)
-        if isinstance(layer, Attention):
+        elif isinstance(layer, FusedMoE):
+            return Fp8MoEMethod(self)
+        elif isinstance(layer, Attention):
             return Fp8KVCacheMethod(self)
         return None
 
@@ -270,6 +274,187 @@ class Fp8LinearMethod(LinearMethodBase):
         return torch.narrow(output, 0, 0, x.shape[0])
 
 
+class Fp8MoEMethod(FusedMoEMethodBase):
+    """MoE method for FP8.
+    Supports loading FP8 checkpoints with static weight scale and
+    dynamic/static activation scale.
+
+    Also supports loading quantized FP16/BF16 model checkpoints with dynamic
+    activation scaling. The weight scaling factor will be initialized after
+    the model weights are loaded.
+
+    Args:
+        quant_config: The quantization config.
+    """
+
+    def __init__(self, quant_config: Fp8Config):
+        self.quant_config = quant_config
+
+    def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
+                       intermediate_size: int, params_dtype: torch.dtype,
+                       **extra_weight_attrs):
+
+        layer.process_after_load = True
+
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            params_dtype = torch.float8_e4m3fn
+
+        # WEIGHTS
+        w13_weight = torch.nn.Parameter(torch.empty(num_experts,
+                                                    2 * intermediate_size,
+                                                    hidden_size,
+                                                    dtype=params_dtype),
+                                        requires_grad=False)
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        w2_weight = torch.nn.Parameter(torch.empty(num_experts,
+                                                   hidden_size,
+                                                   intermediate_size,
+                                                   dtype=params_dtype),
+                                       requires_grad=False)
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        # WEIGHT_SCALES
+        # Allocate 2 scales for w1 and w3 respectively.
+        # They will be combined to a single scale after weight loading.
+        w13_scale = torch.nn.Parameter(torch.ones(num_experts,
+                                                  2,
+                                                  dtype=torch.float32),
+                                       requires_grad=False)
+        layer.register_parameter("w13_scale", w13_scale)
+
+        w2_scale = torch.nn.Parameter(torch.ones(num_experts,
+                                                 dtype=torch.float32),
+                                      requires_grad=False)
+        layer.register_parameter("w2_scale", w2_scale)
+
+        # If loading fp8 checkpoint, pass the weight loaders.
+        # If loading an fp16 checkpoint, do not (we will quantize in
+        #   process_weights_after_loading()
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            set_weight_attrs(w13_scale, extra_weight_attrs)
+            set_weight_attrs(w2_scale, extra_weight_attrs)
+
+        # INPUT_SCALES
+        if self.quant_config.activation_scheme == "static":
+            if not self.quant_config.is_checkpoint_fp8_serialized:
+                raise ValueError(
+                    "Found static activation scheme for checkpoint that "
+                    "was not serialized fp8.")
+
+            a13_scale = torch.nn.Parameter(torch.ones(num_experts,
+                                                      dtype=torch.float32),
+                                           requires_grad=False)
+            layer.register_parameter("a13_scale", a13_scale)
+            set_weight_attrs(a13_scale, extra_weight_attrs)
+
+            a2_scale = torch.nn.Parameter(torch.ones(num_experts,
+                                                     dtype=torch.float32),
+                                          requires_grad=False)
+            layer.register_parameter("a2_scale", a2_scale)
+            set_weight_attrs(a2_scale, extra_weight_attrs)
+        else:
+            layer.a13_scale = None
+            layer.a2_scale = None
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        if (not hasattr(layer, "process_after_load")
+                or not layer.process_after_load):
+            return
+
+        # If checkpoint is fp16, quantize in place.
+        if not self.quant_config.is_checkpoint_fp8_serialized:
+            w13_weight = torch.empty_like(layer.w13_weight.data,
+                                          dtype=torch.float8_e4m3fn)
+            w2_weight = torch.empty_like(layer.w2_weight.data,
+                                         dtype=torch.float8_e4m3fn)
+
+            # Re-initialize w13_scale because we directly quantize
+            # merged w13 weights and generate a single scaling factor.
+            layer.w13_scale = torch.nn.Parameter(torch.ones(
+                layer.num_experts,
+                dtype=torch.float32,
+                device=w13_weight.device),
+                                                 requires_grad=False)
+            for expert in range(layer.num_experts):
+                w13_weight[expert, :, :], layer.w13_scale[
+                    expert] = ops.scaled_fp8_quant(
+                        layer.w13_weight.data[expert, :, :])
+                w2_weight[expert, :, :], layer.w2_scale[
+                    expert] = ops.scaled_fp8_quant(
+                        layer.w2_weight.data[expert, :, :])
+            layer.w13_weight = torch.nn.Parameter(w13_weight,
+                                                  requires_grad=False)
+            layer.w2_weight = torch.nn.Parameter(w2_weight,
+                                                 requires_grad=False)
+            return
+
+        # If checkpoint is fp8, we need to handle that the
+        # MoE kernels require single activation scale and single weight
+        # scale for w13 per expert.
+        else:
+            # Fp8 moe kernels require a single activation scale.
+            # We take the max of all the scales in case they differ.
+            if self.quant_config.activation_scheme == "static":
+                if layer.a13_scale is None or layer.a2_scale is None:
+                    raise ValueError(
+                        "QuantConfig has static quantization, but found "
+                        "activation scales are None.")
+                if (not all_close_1d(layer.a13_scale)
+                        or not all_close_1d(layer.a2_scale)):
+                    print_warning_once(
+                        "Found input_scales that are not equal for "
+                        "fp8 MoE layer. Using the maximum across experts "
+                        "for each layer. ")
+                layer.a13_scale = torch.nn.Parameter(layer.a13_scale.max(),
+                                                     requires_grad=False)
+                layer.a2_scale = torch.nn.Parameter(layer.a2_scale.max(),
+                                                    requires_grad=False)
+
+            # Fp8 moe kernel needs single weight scale for w13 per expert.
+            # We take the max then dequant and requant each expert.
+            assert layer.w13_scale is not None
+            shard_size = layer.intermediate_size_per_partition
+            max_w13_scales = layer.w13_scale.max(dim=1).values
+            for expert_id in range(layer.num_experts):
+                start = 0
+                for shard_id in range(2):
+                    dq_weight = per_tensor_dequantize(
+                        layer.w13_weight[expert_id][start:start +
+                                                    shard_size, :],
+                        layer.w13_scale[expert_id][shard_id])
+                    layer.w13_weight[expert_id][
+                        start:start + shard_size, :] = per_tensor_quantize(
+                            dq_weight, max_w13_scales[expert_id])
+                    start += shard_size
+
+            layer.w13_scale = torch.nn.Parameter(max_w13_scales,
+                                                 requires_grad=False)
+            return
+
+    def apply(self,
+              layer: torch.nn.Module,
+              x: torch.Tensor,
+              router_logits: torch.Tensor,
+              top_k: int,
+              renormalize: bool = True) -> torch.Tensor:
+
+        return fused_moe(x,
+                         layer.w13_weight,
+                         layer.w2_weight,
+                         router_logits,
+                         top_k,
+                         renormalize=renormalize,
+                         inplace=True,
+                         use_fp8=True,
+                         w1_scale=layer.w13_scale,
+                         w2_scale=layer.w2_scale,
+                         a1_scale=layer.a13_scale,
+                         a2_scale=layer.a2_scale)
+
+
 class Fp8KVCacheMethod(QuantizeMethodBase):
     """Supports loading kv-cache scaling factors from FP8 checkpoints.
     """
@@ -321,3 +506,8 @@ def per_tensor_dequantize(
     fake_qweight = tensor.to(torch.float16)
     dq_weight = fake_qweight * inv_scale
     return dq_weight
+
+
+def all_close_1d(x: torch.Tensor) -> bool:
+    assert len(x.shape) == 1
+    return all(torch.allclose(x[0], x[i]) for i in range(x.shape[0]))
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 05c36b9c0..5144e7ea4 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -27,13 +27,10 @@ import torch
 from torch import nn
 from transformers import MixtralConfig
 
-from vllm import _custom_ops as ops
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, LoRAConfig
-from vllm.distributed import (get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce)
-from vllm.model_executor.layers.fused_moe import fused_moe
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (QKVParallelLinear,
                                                ReplicatedLinear,
@@ -41,16 +38,12 @@ from vllm.model_executor.layers.linear import (QKVParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
-from vllm.model_executor.layers.quantization.fp8 import (Fp8Config,
-                                                         per_tensor_dequantize,
-                                                         per_tensor_quantize)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.utils import set_weight_attrs
 from vllm.sequence import IntermediateTensors, SamplerOutput
 from vllm.utils import print_warning_once
 
@@ -66,227 +59,40 @@ class MixtralMoE(nn.Module):
     across ranks.
     """
 
-    def __init__(
-        self,
-        num_experts: int,
-        top_k: int,
-        hidden_size: int,
-        intermediate_size: int,
-        params_dtype: Optional[torch.dtype] = None,
-        tp_size: Optional[int] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-    ):
+    def __init__(self,
+                 num_experts: int,
+                 top_k: int,
+                 hidden_size: int,
+                 intermediate_size: int,
+                 params_dtype: Optional[torch.dtype] = None,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 tp_size: Optional[int] = None):
         super().__init__()
-        self.tp_size = tp_size or get_tensor_model_parallel_world_size()
-        self.num_total_experts = num_experts
-        self.top_k = top_k
         self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size // self.tp_size
-        self.quant_config = quant_config
-
-        # FIXME(pcmoritz): Make this more general to support different
-        # quantization schemes
-        self.use_fp8 = isinstance(quant_config, Fp8Config)
-
-        if params_dtype is None:
-            params_dtype = torch.get_default_dtype()
-        self.params_dtype = params_dtype
 
         # Gate always runs at half / full precision for now.
-        self.gate = ReplicatedLinear(self.hidden_size,
-                                     self.num_total_experts,
+        self.gate = ReplicatedLinear(hidden_size,
+                                     num_experts,
                                      bias=False,
-                                     params_dtype=self.params_dtype,
+                                     params_dtype=params_dtype,
                                      quant_config=None)
 
-        if self.use_fp8 and self.quant_config.is_checkpoint_fp8_serialized:
-            params_dtype = torch.float8_e4m3fn
-
-        self.w13_weight = nn.Parameter(torch.empty(self.num_total_experts,
-                                                   2 * self.intermediate_size,
-                                                   self.hidden_size,
-                                                   dtype=params_dtype),
-                                       requires_grad=False)
-        self.w2_weight = nn.Parameter(torch.empty(self.num_total_experts,
-                                                  self.hidden_size,
-                                                  self.intermediate_size,
-                                                  dtype=params_dtype),
-                                      requires_grad=False)
-
-        set_weight_attrs(self.w13_weight, {
-            "weight_loader": self.weight_loader,
-        })
-        set_weight_attrs(self.w2_weight, {
-            "weight_loader": self.weight_loader,
-        })
-
-        # Used for fp8.
-        self.w13_scale = None
-        self.w2_scale = None
-        self.a13_scale = None
-        self.a2_scale = None
-
-        if self.use_fp8:
-            # WEIGHT_SCALE (for fp8)
-            # Allocate 2 scales for w1 and w3 respectively.
-            # They will be combined to a single scale after weight loading.
-            self.w13_scale = nn.Parameter(torch.ones(self.num_total_experts,
-                                                     2,
-                                                     dtype=torch.float32),
-                                          requires_grad=False)
-            self.w2_scale = nn.Parameter(torch.ones(self.num_total_experts,
-                                                    dtype=torch.float32),
-                                         requires_grad=False)
-
-            # If loading fp8 checkpoint, pass the weight loaders.
-            # If loading an fp16 checkpoint, do not (we will quantize in
-            #   process_weights_after_loading()
-            if quant_config.is_checkpoint_fp8_serialized:
-                set_weight_attrs(self.w13_scale, {
-                    "weight_loader": self.weight_loader,
-                })
-                set_weight_attrs(self.w2_scale, {
-                    "weight_loader": self.weight_loader,
-                })
-
-            # INPUT_SCALE (for fp8)
-            if quant_config.activation_scheme == "static":
-                if not quant_config.is_checkpoint_fp8_serialized:
-                    raise ValueError(
-                        "Found static activation scheme for checkpoint that "
-                        "was not serialized fp8.")
-                self.a13_scale = nn.Parameter(torch.ones(
-                    self.num_total_experts, dtype=torch.float32),
-                                              requires_grad=False)
-                self.a2_scale = nn.Parameter(torch.ones(self.num_total_experts,
-                                                        dtype=torch.float32),
-                                             requires_grad=False)
-
-                set_weight_attrs(self.a13_scale, {
-                    "weight_loader": self.weight_loader,
-                })
-                set_weight_attrs(self.a2_scale, {
-                    "weight_loader": self.weight_loader,
-                })
-
-    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
-                      weight_name: str, expert_id: int):
-        tp_rank = get_tensor_model_parallel_rank()
-        param_data = param.data
-        shard_size = self.intermediate_size
-        shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size)
-        if weight_name.endswith("w1.weight"):
-            param_data[expert_id, 0:shard_size, :] = loaded_weight[shard, :]
-        if weight_name.endswith("w3.weight"):
-            param_data[expert_id,
-                       shard_size:2 * shard_size, :] = loaded_weight[shard, :]
-        if weight_name.endswith("w2.weight"):
-            param_data[expert_id, :, :] = loaded_weight[:, shard]
-
-        # Loading scales
-        if "input_scale" in weight_name or "w2.weight_scale" in weight_name:
-            if param_data[expert_id] != 1 and (param_data[expert_id] -
-                                               loaded_weight).abs() > 1e-5:
-                raise ValueError(
-                    "input_scales of w1 and w3 of a layer "
-                    f"must be equal. But got {param_data[expert_id]} "
-                    f"vs. {loaded_weight}")
-            param_data[expert_id] = loaded_weight
-        elif "weight_scale" in weight_name:
-            # We have to keep the weight scales of w1 and w3 because
-            # we need to re-quantize w1/w3 weights after weight loading.
-            assert "w1" in weight_name or "w3" in weight_name
-            shard_id = 0 if "w1" in weight_name else 1
-            param_data[expert_id][shard_id] = loaded_weight
-
-    def process_weights_after_loading(self):
-        # Fp8 is the only case where we need to process after loading.
-        if not self.use_fp8:
-            return
-
-        # If checkpoint is fp16, quantize here.
-        if not self.quant_config.is_checkpoint_fp8_serialized:
-            w13_weight = torch.empty_like(self.w13_weight.data,
-                                          dtype=torch.float8_e4m3fn)
-            w2_weight = torch.empty_like(self.w2_weight.data,
-                                         dtype=torch.float8_e4m3fn)
-
-            # Re-initialize w13_scale because we directly quantize
-            # merged w13 weights and generate a single scaling factor.
-            self.w13_scale = nn.Parameter(torch.ones(self.num_total_experts,
-                                                     dtype=torch.float32),
-                                          requires_grad=False)
-            for expert in range(self.num_total_experts):
-                w13_weight[expert, :, :], self.w13_scale[
-                    expert] = ops.scaled_fp8_quant(
-                        self.w13_weight.data[expert, :, :])
-                w2_weight[expert, :, :], self.w2_scale[
-                    expert] = ops.scaled_fp8_quant(
-                        self.w2_weight.data[expert, :, :])
-            self.w13_weight = nn.Parameter(w13_weight, requires_grad=False)
-            self.w2_weight = nn.Parameter(w2_weight, requires_grad=False)
-
-        else:
-            # If checkpoint is fp8 + static, cleanup input_scales.
-            #   Since state_dict has an input_scale per expert but our kernels
-            #   are passed one input_scale shared across all experts.
-            if self.quant_config.activation_scheme == "static":
-                if self.a13_scale is None or self.a2_scale is None:
-                    raise ValueError(
-                        "QuantConfig has static quantization, but found "
-                        "activation scales are None.")
-
-                if (not all_close_1d(self.a13_scale)
-                        or not all_close_1d(self.a2_scale)):
-                    print_warning_once(
-                        "Found input_scales that are not equal for "
-                        "fp8 MoE layer. Using the maximum across experts "
-                        "for each layer. ")
-
-                self.a13_scale = nn.Parameter(self.a13_scale.max(),
-                                              requires_grad=False)
-                self.a2_scale = nn.Parameter(self.a2_scale.max(),
-                                             requires_grad=False)
-
-            assert self.w13_scale is not None
-            shard_size = self.intermediate_size
-            max_w13_scales = self.w13_scale.max(dim=1).values
-            for expert_id in range(self.num_total_experts):
-                start = 0
-                for shard_id in range(2):
-                    dq_weight = per_tensor_dequantize(
-                        self.w13_weight[expert_id][start:start +
-                                                   shard_size, :],
-                        self.w13_scale[expert_id][shard_id])
-                    self.w13_weight[expert_id][
-                        start:start + shard_size, :] = per_tensor_quantize(
-                            dq_weight, max_w13_scales[expert_id])
-                    start += shard_size
-
-            self.w13_scale = nn.Parameter(max_w13_scales, requires_grad=False)
+        self.experts = FusedMoE(num_experts=num_experts,
+                                top_k=top_k,
+                                hidden_size=hidden_size,
+                                intermediate_size=intermediate_size,
+                                params_dtype=params_dtype,
+                                reduce_results=True,
+                                renormalize=True,
+                                quant_config=quant_config,
+                                tp_size=tp_size)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_size = hidden_states.shape
         hidden_states = hidden_states.view(-1, self.hidden_size)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = fused_moe(hidden_states,
-                                        self.w13_weight,
-                                        self.w2_weight,
-                                        router_logits,
-                                        self.top_k,
-                                        renormalize=True,
-                                        inplace=True,
-                                        use_fp8=self.use_fp8,
-                                        w1_scale=self.w13_scale,
-                                        w2_scale=self.w2_scale,
-                                        a1_scale=self.a13_scale,
-                                        a2_scale=self.a2_scale)
-
-        if self.tp_size > 1:
-            final_hidden_states = tensor_model_parallel_all_reduce(
-                final_hidden_states)
-
+        final_hidden_states = self.experts(hidden_states, router_logits)
         return final_hidden_states.view(num_tokens, hidden_size)
 
 
@@ -566,25 +372,28 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA):
 
         expert_params_mapping = [
             # These are the weight scales for the experts
-            # (param_name, weight_name, expert_id)
-            ("w13_scale" if weight_name in ["w1", "w3"] else "w2_scale",
-             f"experts.{expert_id}.{weight_name}.weight_scale", expert_id)
-            for expert_id in range(self.config.num_local_experts)
-            for weight_name in ["w1", "w2", "w3"]
+            # (param_name, weight_name, expert_id, shard_id)
+            ("experts.w13_scale"
+             if weight_name in ["w1", "w3"] else "experts.w2_scale",
+             f"experts.{expert_id}.{weight_name}.weight_scale", expert_id,
+             shard_id) for expert_id in range(self.config.num_local_experts)
+            for shard_id, weight_name in enumerate(["w1", "w2", "w3"])
         ] + [
             # These are the weights for the experts
             # (param_name, weight_name, expert_id)
-            ("w13_weight" if weight_name in ["w1", "w3"] else "w2_weight",
-             f"experts.{expert_id}.{weight_name}.weight", expert_id)
+            ("experts.w13_weight"
+             if weight_name in ["w1", "w3"] else "experts.w2_weight",
+             f"experts.{expert_id}.{weight_name}.weight", expert_id, shard_id)
             for expert_id in range(self.config.num_local_experts)
-            for weight_name in ["w1", "w2", "w3"]
+            for shard_id, weight_name in enumerate(["w1", "w2", "w3"])
         ] + [
             # These are the activation scales for the experts
             # (param_name, weight_name, expert_id)
-            ("a13_scale" if weight_name in ["w1", "w3"] else "a2_scale",
-             f"experts.{expert_id}.{weight_name}.input_scale", expert_id)
-            for expert_id in range(self.config.num_local_experts)
-            for weight_name in ["w1", "w2", "w3"]
+            ("experts.a13_scale"
+             if weight_name in ["w1", "w3"] else "experts.a2_scale",
+             f"experts.{expert_id}.{weight_name}.input_scale", expert_id,
+             shard_id) for expert_id in range(self.config.num_local_experts)
+            for shard_id, weight_name in enumerate(["w1", "w2", "w3"])
         ]
 
         params_dict = dict(self.named_parameters())
@@ -604,7 +413,8 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA):
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
-                for param_name, weight_name, expert_id in expert_params_mapping:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
                     if weight_name not in name:
                         continue
                     name = name.replace(weight_name, param_name)
@@ -613,6 +423,7 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA):
                     weight_loader(param,
                                   loaded_weight,
                                   weight_name,
+                                  shard_id=shard_id,
                                   expert_id=expert_id)
                     break
                 else:
@@ -637,8 +448,3 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA):
                     weight_loader = getattr(param, "weight_loader",
                                             default_weight_loader)
                     weight_loader(param, loaded_weight)
-
-
-def all_close_1d(x: torch.Tensor) -> bool:
-    assert len(x.shape) == 1
-    return all(torch.allclose(x[0], x[i]) for i in range(x.shape[0]))
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index b3e7dfef9..8decb4464 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -31,11 +31,10 @@ from transformers import PretrainedConfig
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig
-from vllm.distributed import (get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
+from vllm.distributed import (get_tensor_model_parallel_world_size,
                               tensor_model_parallel_all_reduce)
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import fused_moe
+from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
                                                QKVParallelLinear,
@@ -93,28 +92,23 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
-        self.config = config
-        self.rank = get_tensor_model_parallel_rank()
         self.tp_size = get_tensor_model_parallel_world_size()
-        self.n_routed_experts = config.num_experts
-        self.top_k = config.num_experts_per_tok
-        if self.tp_size > self.n_routed_experts:
+
+        if self.tp_size > config.num_experts:
             raise ValueError(
                 f"Tensor parallel size {self.tp_size} is greater than "
-                f"the number of experts {self.n_routed_experts}.")
-
-        self.experts = nn.ModuleList([
-            Qwen2MoeMLP(hidden_size=config.hidden_size,
-                        intermediate_size=config.moe_intermediate_size,
-                        hidden_act=config.hidden_act,
-                        quant_config=quant_config,
-                        reduce_results=False)
-            for idx in range(self.n_routed_experts)
-        ])
-        self.pack_params()
+                f"the number of experts {config.num_experts}.")
+
+        self.experts = FusedMoE(num_experts=config.num_experts,
+                                top_k=config.num_experts_per_tok,
+                                hidden_size=config.hidden_size,
+                                intermediate_size=config.moe_intermediate_size,
+                                reduce_results=False,
+                                renormalize=config.norm_topk_prob,
+                                quant_config=quant_config)
 
         self.gate = ReplicatedLinear(config.hidden_size,
-                                     self.n_routed_experts,
+                                     config.num_experts,
                                      bias=False,
                                      quant_config=None)
         if config.shared_expert_intermediate_size > 0:
@@ -131,25 +125,6 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
                                                   1,
                                                   bias=False)
 
-    def pack_params(self):
-        w1 = []
-        w2 = []
-        for expert in self.experts:
-            w1.append(expert.gate_up_proj.weight)
-            w2.append(expert.down_proj.weight)
-        self.w1 = torch._utils._flatten_dense_tensors(w1)
-        w1s = torch._utils._unflatten_dense_tensors(self.w1, w1)
-        for data, param in zip(w1s, w1):
-            param.data = data
-        self.w1 = self.w1.view(len(w1), *w1s[0].shape)
-
-        self.w2 = torch._utils._flatten_dense_tensors(w2)
-        w2s = torch._utils._unflatten_dense_tensors(self.w2, w2)
-        for data, param in zip(w2s, w2):
-            param.data = data
-
-        self.w2 = self.w2.view(len(w2), *w2s[0].shape)
-
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
@@ -162,18 +137,13 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
 
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = fused_moe(hidden_states,
-                                        self.w1,
-                                        self.w2,
-                                        router_logits,
-                                        self.top_k,
-                                        renormalize=self.config.norm_topk_prob,
-                                        inplace=True)
-
+        final_hidden_states = self.experts(hidden_states=hidden_states,
+                                           router_logits=router_logits)
         if shared_output is not None:
             final_hidden_states = final_hidden_states + shared_output
-        final_hidden_states = tensor_model_parallel_all_reduce(
-            final_hidden_states)
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(
+                final_hidden_states)
 
         return final_hidden_states.view(num_tokens, hidden_dim)
 
@@ -284,7 +254,12 @@ class Qwen2MoeDecoderLayer(nn.Module):
             cache_config=cache_config,
             quant_config=quant_config,
         )
-        if (layer_idx not in config.mlp_only_layers) and (
+
+        # Note: Qwen/Qwen2-57B-A14B-Instruct does not have
+        # `mlp_only_layers` in the config.
+        mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else
+                           config.mlp_only_layers)
+        if (layer_idx not in mlp_only_layers) and (
                 config.num_experts > 0 and
             (layer_idx + 1) % config.decoder_sparse_step == 0):
             self.mlp = Qwen2MoeSparseMoeBlock(config=config,
@@ -427,21 +402,36 @@ class Qwen2MoeForCausalLM(nn.Module):
             ("gate_up_proj", "up_proj", 1),
         ]
 
+        expert_params_mapping = [
+            # These are the weights for the experts
+            # (param_name, weight_name, expert_id, shard_id)
+            ("experts.w13_weight" if weight_name in ["gate_proj", "up_proj"]
+             else "experts.w2_weight",
+             f"experts.{expert_id}.{weight_name}.weight", expert_id, shard_id)
+            for expert_id in range(self.config.num_experts) for shard_id,
+            weight_name in enumerate(["gate_proj", "down_proj", "up_proj"])
+        ]
+
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
-                # Skip experts that are not assigned to this worker.
-                if (("mlp.experts." in name or "mlp.shared_expert." in name)
-                        and name not in params_dict):
-                    continue
                 if name not in params_dict:
                     continue
 
@@ -450,17 +440,27 @@ class Qwen2MoeForCausalLM(nn.Module):
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Skip experts that are not assigned to this worker.
-                if (("mlp.experts." in name or "mlp.shared_expert." in name)
-                        and name not in params_dict):
-                    continue
-                if name not in params_dict:
-                    continue
-
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param, loaded_weight)
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param,
+                                  loaded_weight,
+                                  weight_name,
+                                  shard_id=shard_id,
+                                  expert_id=expert_id)
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name not in params_dict:
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+                    weight_loader(param, loaded_weight)
-- 
GitLab


From ee93f4f92acbd9759a9af80747bc2a4459f07639 Mon Sep 17 00:00:00 2001
From: Qubitium-ModelCloud <qubitium@modelcloud.ai>
Date: Wed, 3 Jul 2024 06:25:17 +0800
Subject: [PATCH 242/376] [CORE] Quantized lm-head Framework (#4442)

Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
Co-authored-by: ZX <zx@lbx.dev>
---
 tests/lora/test_layers.py                     | 10 +--
 tests/quantization/test_lm_head.py            | 45 ++++++++++++
 tests/spec_decode/e2e/test_mlp_correctness.py |  2 +-
 tests/test_logits_processor.py                |  2 +-
 vllm/lora/layers.py                           |  4 +-
 .../model_executor/layers/logits_processor.py | 16 +++--
 .../layers/quantization/base_config.py        |  9 +++
 .../layers/quantization/gptq.py               | 13 +++-
 .../layers/quantization/gptq_marlin.py        | 15 ++--
 .../layers/quantization/marlin.py             | 13 +++-
 .../layers/vocab_parallel_embedding.py        | 70 ++++++++++++++-----
 vllm/model_executor/models/arctic.py          |  3 +-
 vllm/model_executor/models/baichuan.py        |  6 +-
 vllm/model_executor/models/bloom.py           |  4 +-
 vllm/model_executor/models/chatglm.py         |  7 +-
 vllm/model_executor/models/commandr.py        |  8 +--
 vllm/model_executor/models/dbrx.py            |  3 +-
 vllm/model_executor/models/deepseek.py        |  6 +-
 vllm/model_executor/models/deepseek_v2.py     |  6 +-
 vllm/model_executor/models/falcon.py          |  6 +-
 vllm/model_executor/models/gemma.py           |  4 +-
 vllm/model_executor/models/gemma2.py          |  4 +-
 vllm/model_executor/models/gpt2.py            |  4 +-
 vllm/model_executor/models/gpt_bigcode.py     |  4 +-
 vllm/model_executor/models/gpt_j.py           |  3 +-
 vllm/model_executor/models/gpt_neox.py        |  3 +-
 vllm/model_executor/models/internlm2.py       |  6 +-
 vllm/model_executor/models/jais.py            |  4 +-
 vllm/model_executor/models/llama.py           |  3 +-
 vllm/model_executor/models/llava.py           |  5 +-
 vllm/model_executor/models/llava_next.py      |  5 +-
 vllm/model_executor/models/minicpm.py         |  7 +-
 vllm/model_executor/models/mixtral.py         |  3 +-
 vllm/model_executor/models/mixtral_quant.py   |  6 +-
 vllm/model_executor/models/mlp_speculator.py  |  8 +--
 vllm/model_executor/models/mpt.py             |  4 +-
 vllm/model_executor/models/olmo.py            |  6 +-
 vllm/model_executor/models/opt.py             |  4 +-
 vllm/model_executor/models/orion.py           |  6 +-
 vllm/model_executor/models/phi.py             |  5 +-
 vllm/model_executor/models/phi3_small.py      |  3 +-
 vllm/model_executor/models/phi3v.py           |  6 +-
 vllm/model_executor/models/qwen.py            |  6 +-
 vllm/model_executor/models/qwen2.py           |  8 +--
 vllm/model_executor/models/qwen2_moe.py       |  6 +-
 vllm/model_executor/models/stablelm.py        |  6 +-
 vllm/model_executor/models/starcoder2.py      |  6 +-
 vllm/model_executor/models/xverse.py          |  6 +-
 48 files changed, 268 insertions(+), 121 deletions(-)
 create mode 100644 tests/quantization/test_lm_head.py

diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 2e51e95a3..7207af6b1 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -475,10 +475,10 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
 
         lora_result = lora_logits_processor._get_logits(
             hidden_states=torch.cat(inputs),
-            embedding=linear.weight,
+            lm_head=linear,
             embedding_bias=None)
 
-        original_weight = linear.weight.clone()
+        original_lm_head = deepcopy(linear)
 
         linear.weight[logits_processor.
                       org_vocab_size:logits_processor.org_vocab_size +
@@ -490,7 +490,7 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
         for input_, lora_id in zip(inputs, prompt_mapping):
             lora = lora_dict[lora_id]
             result = logits_processor._get_logits(hidden_states=input_,
-                                                  embedding=linear.weight,
+                                                  lm_head=linear,
                                                   embedding_bias=None)
             result[:, vocab_size + embeddings_tensor_len:] = float("-inf")
             result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling
@@ -519,11 +519,11 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
 
         lora_result = lora_logits_processor._get_logits(
             hidden_states=torch.cat(inputs),
-            embedding=original_weight,
+            lm_head=original_lm_head,
             embedding_bias=None)[:, :vocab_size]
         expected_result = logits_processor._get_logits(
             hidden_states=torch.cat(inputs),
-            embedding=original_weight,
+            lm_head=original_lm_head,
             embedding_bias=None)
 
         rtol, atol = TOLERANCES[lora_result.dtype]
diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py
new file mode 100644
index 000000000..dd9a01680
--- /dev/null
+++ b/tests/quantization/test_lm_head.py
@@ -0,0 +1,45 @@
+"""Tests whether gptq models with quantized lm_head can be loaded.
+
+Run `pytest tests/quantization/test_quant_lm_head_true.py --forked`.
+"""
+from typing import Tuple
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.linear import UnquantizedLinearMethod
+from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
+from vllm.model_executor.layers.quantization.gptq_marlin import (
+    GPTQMarlinLinearMethod)
+from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod
+
+PROMPT = "On the surface of Mars, we found"
+
+MODELS_QUANT = [(
+    "LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse",
+    True), ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False),
+                ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False)]
+
+
+@pytest.mark.parametrize("model_lm_head_quant", MODELS_QUANT)
+def test_lm_head(
+    vllm_runner,
+    model_lm_head_quant: Tuple[str, bool],
+) -> None:
+    model, lm_head_quantized = model_lm_head_quant
+    vllm_model = vllm_runner(model, dtype=torch.float16, max_model_len=2048)
+
+    lm_head_layer = (vllm_model.model.llm_engine.model_executor.driver_worker.
+                     model_runner.model.lm_head)
+
+    if lm_head_quantized:
+        assert isinstance(
+            lm_head_layer.linear_method,
+            (GPTQLinearMethod, GPTQMarlinLinearMethod, MarlinLinearMethod))
+    else:
+        assert isinstance(lm_head_layer.linear_method, UnquantizedLinearMethod)
+
+    print(
+        vllm_model.generate_greedy(prompts=["Hello my name is"],
+                                   max_tokens=10)[0][1])
+    del vllm_model
diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py
index 9a9f2acbb..dd67a7735 100644
--- a/tests/spec_decode/e2e/test_mlp_correctness.py
+++ b/tests/spec_decode/e2e/test_mlp_correctness.py
@@ -34,7 +34,7 @@ SPEC_MODEL = "ibm-granite/granite-3b-code-instruct-accelerator"
 MAX_SPEC_TOKENS = 5
 
 # precision
-PRECISION = "float16"
+PRECISION = "float32"
 
 
 @pytest.mark.parametrize(
diff --git a/tests/test_logits_processor.py b/tests/test_logits_processor.py
index 4ee980505..8ee2d7819 100644
--- a/tests/test_logits_processor.py
+++ b/tests/test_logits_processor.py
@@ -83,7 +83,7 @@ def test_logits_processors(seed: int, device: str):
         device=device,
         pin_memory=is_pin_memory_available())
     logits_processor_output = logits_processor(
-        embedding=None,
+        lm_head=None,
         hidden_states=input_tensor,
         sampling_metadata=sampling_metadata)
 
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index 2fddfccaf..0a63f9ef0 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -1172,11 +1172,11 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
     def _get_logits(
         self,
         hidden_states: torch.Tensor,
-        embedding: torch.Tensor,
+        lm_head: VocabParallelEmbedding,
         embedding_bias: Optional[torch.Tensor] = None,
     ) -> Optional[torch.Tensor]:
         # Get the logits for the next tokens.
-        logits = torch.matmul(hidden_states, embedding.t())
+        logits = lm_head.linear_method.apply(lm_head, hidden_states)
         if embedding_bias is not None:
             logits += embedding_bias
         logits = tensor_model_parallel_gather(logits)
diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py
index 8062bfb51..f6fcf49ef 100644
--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -6,6 +6,8 @@ import torch
 import torch.nn as nn
 
 from vllm.distributed import tensor_model_parallel_gather
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 
 
@@ -40,7 +42,7 @@ class LogitsProcessor(nn.Module):
 
     def forward(
         self,
-        embedding: torch.Tensor,
+        lm_head: VocabParallelEmbedding,
         hidden_states: torch.Tensor,
         sampling_metadata: SamplingMetadata,
         embedding_bias: Optional[torch.Tensor] = None,
@@ -52,8 +54,7 @@ class LogitsProcessor(nn.Module):
                                                  sampling_metadata)
 
             # Get the logits for the next tokens.
-            logits = self._get_logits(hidden_states, embedding, embedding_bias)
-
+            logits = self._get_logits(hidden_states, lm_head, embedding_bias)
         if logits is not None:
             if self.soft_cap is not None:
                 logits = logits / self.soft_cap
@@ -68,12 +69,13 @@ class LogitsProcessor(nn.Module):
 
         return logits
 
-    def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor,
+    def _get_logits(self, hidden_states: torch.Tensor,
+                    lm_head: VocabParallelEmbedding,
                     embedding_bias: Optional[torch.Tensor]) -> torch.Tensor:
         # Get the logits for the next tokens.
-        logits = torch.matmul(hidden_states, embedding.t())
-        if embedding_bias is not None:
-            logits += embedding_bias
+        logits = lm_head.linear_method.apply(lm_head,
+                                             hidden_states,
+                                             bias=embedding_bias)
         logits = tensor_model_parallel_gather(logits)
         # Remove paddings in vocab (if any).
         if logits is not None:
diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
index c23b66161..1607470cb 100644
--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -87,6 +87,15 @@ class QuantizationConfig(ABC):
         raise ValueError(f"Cannot find any of {keys} in the model's "
                          "quantization config.")
 
+    @staticmethod
+    def get_from_keys_or(config: Dict[str, Any], keys: List[str],
+                         default: Any) -> Any:
+        """Get a optional value from the model's quantization config."""
+        try:
+            return QuantizationConfig.get_from_keys(config, keys)
+        except ValueError:
+            return default
+
     @abstractmethod
     def get_quant_method(
             self, layer: torch.nn.Module) -> Optional[QuantizeMethodBase]:
diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py
index ae9f7019f..595d6ab96 100644
--- a/vllm/model_executor/layers/quantization/gptq.py
+++ b/vllm/model_executor/layers/quantization/gptq.py
@@ -10,6 +10,7 @@ from vllm import _custom_ops as ops
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
+from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.utils import set_weight_attrs
 
 
@@ -24,10 +25,12 @@ class GPTQConfig(QuantizationConfig):
         weight_bits: int,
         group_size: int,
         desc_act: bool,
+        lm_head_quantized: bool,
     ) -> None:
         self.weight_bits = weight_bits
         self.group_size = group_size
         self.desc_act = desc_act
+        self.lm_head_quantized = lm_head_quantized
         self.pack_factor = Fraction(32, self.weight_bits)
         if self.weight_bits not in [2, 3, 4, 8]:
             raise ValueError(
@@ -37,7 +40,8 @@ class GPTQConfig(QuantizationConfig):
     def __repr__(self) -> str:
         return (f"GPTQConfig(weight_bits={self.weight_bits}, "
                 f"group_size={self.group_size}, "
-                f"desc_act={self.desc_act})")
+                f"desc_act={self.desc_act}),"
+                f"lm_head_quantized={self.lm_head_quantized}")
 
     @classmethod
     def get_name(cls) -> str:
@@ -61,11 +65,14 @@ class GPTQConfig(QuantizationConfig):
         weight_bits = cls.get_from_keys(config, ["bits"])
         group_size = cls.get_from_keys(config, ["group_size"])
         desc_act = cls.get_from_keys(config, ["desc_act"])
-        return cls(weight_bits, group_size, desc_act)
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"],
+                                                 default=False)
+        return cls(weight_bits, group_size, desc_act, lm_head_quantized)
 
     def get_quant_method(
             self, layer: torch.nn.Module) -> Optional["GPTQLinearMethod"]:
-        if isinstance(layer, LinearBase):
+        if (isinstance(layer, LinearBase) or
+            (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)):
             return GPTQLinearMethod(self)
         return None
 
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index c6e9279c8..97aae33f1 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -11,6 +11,7 @@ from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
+from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.utils import get_device_capability_stateless
 
 logger = init_logger(__name__)
@@ -59,7 +60,7 @@ class GPTQMarlinConfig(QuantizationConfig):
     """Config class for GPTQ Marlin"""
 
     def __init__(self, weight_bits: int, group_size: int, desc_act: bool,
-                 is_sym: bool) -> None:
+                 is_sym: bool, lm_head_quantized: bool) -> None:
         if desc_act and group_size == -1:
             # In this case, act_order == True is the same as act_order == False
             # (since we have only one group per output channel)
@@ -69,6 +70,7 @@ class GPTQMarlinConfig(QuantizationConfig):
         self.group_size = group_size
         self.desc_act = desc_act
         self.is_sym = is_sym
+        self.lm_head_quantized = lm_head_quantized
 
         # Verify
         if self.weight_bits not in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
@@ -96,7 +98,8 @@ class GPTQMarlinConfig(QuantizationConfig):
     def __repr__(self) -> str:
         return (f"GPTQMarlinConfig(weight_bits={self.weight_bits}, "
                 f"group_size={self.group_size}, "
-                f"desc_act={self.desc_act})")
+                f"desc_act={self.desc_act}, "
+                f"lm_head_quantized={self.lm_head_quantized})")
 
     @classmethod
     def get_name(cls) -> str:
@@ -120,7 +123,10 @@ class GPTQMarlinConfig(QuantizationConfig):
         group_size = cls.get_from_keys(config, ["group_size"])
         desc_act = cls.get_from_keys(config, ["desc_act"])
         is_sym = cls.get_from_keys(config, ["sym"])
-        return cls(weight_bits, group_size, desc_act, is_sym)
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"],
+                                                 default=False)
+        return cls(weight_bits, group_size, desc_act, is_sym,
+                   lm_head_quantized)
 
     @classmethod
     def override_quantization_method(cls, hf_quant_cfg,
@@ -145,7 +151,8 @@ class GPTQMarlinConfig(QuantizationConfig):
     def get_quant_method(
             self,
             layer: torch.nn.Module) -> Optional["GPTQMarlinLinearMethod"]:
-        if isinstance(layer, LinearBase):
+        if (isinstance(layer, LinearBase) or
+            (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)):
             return GPTQMarlinLinearMethod(self)
         return None
 
diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py
index 3613c9d9e..f0a9cf552 100644
--- a/vllm/model_executor/layers/quantization/marlin.py
+++ b/vllm/model_executor/layers/quantization/marlin.py
@@ -8,6 +8,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
+from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.utils import set_weight_attrs
 
 logger = init_logger(__name__)
@@ -22,9 +23,11 @@ class MarlinConfig(QuantizationConfig):
     def __init__(
         self,
         group_size: int,
+        lm_head_quantized: bool,
     ) -> None:
         # Group size for the quantization.
         self.group_size = group_size
+        self.lm_head_quantized = lm_head_quantized
         if self.group_size != 128 and self.group_size != -1:
             raise ValueError(
                 "Currently, only group size 128 and -1 (channelwise) "
@@ -51,7 +54,8 @@ class MarlinConfig(QuantizationConfig):
         self.perm_len = 1024
 
     def __repr__(self) -> str:
-        return f"MarlinConfig(group_size={self.group_size})"
+        return (f"MarlinConfig(group_size={self.group_size}, "
+                f"lm_head_quantized={self.lm_head_quantized})")
 
     @classmethod
     def get_name(cls) -> str:
@@ -73,7 +77,9 @@ class MarlinConfig(QuantizationConfig):
     @classmethod
     def from_config(cls, config: Dict[str, Any]) -> "MarlinConfig":
         group_size = cls.get_from_keys(config, ["group_size"])
-        return cls(group_size)
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"],
+                                                 default=False)
+        return cls(group_size, lm_head_quantized)
 
     @classmethod
     def override_quantization_method(cls, hf_quant_cfg,
@@ -96,7 +102,8 @@ class MarlinConfig(QuantizationConfig):
 
     def get_quant_method(
             self, layer: torch.nn.Module) -> Optional["MarlinLinearMethod"]:
-        if isinstance(layer, LinearBase):
+        if (isinstance(layer, LinearBase) or
+            (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)):
             return MarlinLinearMethod(self)
         return None
 
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index 4650b2c24..d70eb1c27 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -8,6 +8,9 @@ from torch.nn.parameter import Parameter
 from vllm.distributed import (divide, get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
                               tensor_model_parallel_all_reduce)
+from vllm.model_executor.layers.linear import UnquantizedLinearMethod
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.utils import set_weight_attrs
 
 DEFAULT_VOCAB_PADDING_SIZE = 64
@@ -157,6 +160,7 @@ class VocabParallelEmbedding(torch.nn.Module):
         params_dtype: type of the parameters.
         org_num_embeddings: original vocabulary size (without LoRA).
         padding_size: padding size for the vocabulary.
+        quant_config: quant config for the layer
     """  # noqa: E501
 
     def __init__(self,
@@ -164,7 +168,8 @@ class VocabParallelEmbedding(torch.nn.Module):
                  embedding_dim: int,
                  params_dtype: Optional[torch.dtype] = None,
                  org_num_embeddings: Optional[int] = None,
-                 padding_size: int = DEFAULT_VOCAB_PADDING_SIZE):
+                 padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
+                 quant_config: Optional[QuantizationConfig] = None):
         super().__init__()
 
         # Keep the input dimensions.
@@ -187,6 +192,14 @@ class VocabParallelEmbedding(torch.nn.Module):
                                                self.org_vocab_size, tp_rank,
                                                self.tp_size)
         self.embedding_dim = embedding_dim
+
+        linear_method = None
+        if quant_config is not None:
+            linear_method = quant_config.get_quant_method(self)
+        if linear_method is None:
+            linear_method = UnquantizedLinearMethod()
+        self.linear_method: QuantizeMethodBase = linear_method
+
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
         # Divide the weight matrix along the vocaburaly dimension.
@@ -201,14 +214,14 @@ class VocabParallelEmbedding(torch.nn.Module):
         self.num_added_embeddings_per_partition = (
             self.shard_indices.added_vocab_end_index -
             self.shard_indices.added_vocab_start_index)
-        self.weight = Parameter(
-            torch.empty(self.num_embeddings_per_partition,
-                        self.embedding_dim,
-                        dtype=params_dtype))
-        set_weight_attrs(self.weight, {
-            "parallel_dim": 0,
-            "weight_loader": self.weight_loader
-        })
+
+        self.linear_method.create_weights(self,
+                                          self.embedding_dim,
+                                          [self.num_embeddings_per_partition],
+                                          self.embedding_dim,
+                                          self.num_embeddings_padded,
+                                          params_dtype=params_dtype,
+                                          weight_loader=self.weight_loader)
 
     @classmethod
     def _get_indices(cls, vocab_size_padded: int, org_vocab_size_padded: int,
@@ -288,10 +301,32 @@ class VocabParallelEmbedding(torch.nn.Module):
         return ret
 
     def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
-        parallel_dim = param.parallel_dim
-        assert loaded_weight.shape[parallel_dim] == self.org_vocab_size
-        loaded_weight = loaded_weight[self.shard_indices.org_vocab_start_index:
-                                      self.shard_indices.org_vocab_end_index]
+        output_dim = getattr(param, "output_dim", None)
+        packed_dim = getattr(param, "packed_dim", None)
+
+        # If parameter does not have output dim, then it should
+        # be copied onto all gpus (e.g. g_idx for act_order gptq).
+        if output_dim is None:
+            assert param.data.shape == loaded_weight.shape
+            param.data.copy_(loaded_weight)
+            return
+
+        # Shard indexes for loading the weight
+        start_idx = self.shard_indices.org_vocab_start_index
+        shard_size = self.shard_indices.org_vocab_end_index - start_idx
+
+        # If param packed on the same dim we are sharding on, then
+        # need to adjust offsets of loaded weight by pack_factor.
+        if packed_dim is not None and packed_dim == output_dim:
+            assert loaded_weight.shape[output_dim] == (self.org_vocab_size //
+                                                       param.pack_factor)
+            start_idx = start_idx // param.pack_factor
+            shard_size = shard_size // param.pack_factor
+        else:
+            assert loaded_weight.shape[output_dim] == self.org_vocab_size
+
+        # Copy the data.
+        loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
         param[:loaded_weight.shape[0]].data.copy_(loaded_weight)
         param[loaded_weight.shape[0]:].data.fill_(0)
 
@@ -346,16 +381,17 @@ class ParallelLMHead(VocabParallelEmbedding):
                  bias: bool = False,
                  params_dtype: Optional[torch.dtype] = None,
                  org_num_embeddings: Optional[int] = None,
-                 padding_size: int = DEFAULT_VOCAB_PADDING_SIZE):
+                 padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
+                 quant_config: Optional[QuantizationConfig] = None):
         super().__init__(num_embeddings, embedding_dim, params_dtype,
-                         org_num_embeddings, padding_size)
+                         org_num_embeddings, padding_size, quant_config)
         if bias:
             self.bias = Parameter(
                 torch.empty(self.num_embeddings_per_partition,
                             dtype=params_dtype))
             set_weight_attrs(self.bias, {
-                "parallel_dim": 0,
-                "weight_loader": self.weight_loader
+                "output_dim": 0,
+                "weight_loader": self.weight_loader,
             })
         else:
             self.register_parameter("bias", None)
diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py
index fec52e016..49e57a847 100644
--- a/vllm/model_executor/models/arctic.py
+++ b/vllm/model_executor/models/arctic.py
@@ -412,6 +412,7 @@ class ArcticForCausalLM(nn.Module):
         self.lm_head = ParallelLMHead(
             self.vocab_size,
             config.hidden_size,
+            quant_config=quant_config,
         )
         self.num_experts = config.num_local_experts
         self.num_experts_per_tok = config.num_experts_per_tok
@@ -434,7 +435,7 @@ class ArcticForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index ddc4e9084..e1ea8bfca 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -328,7 +328,9 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA):
         self.quant_config = quant_config
         self.model = BaiChuanModel(config, position_embedding, cache_config,
                                    quant_config)
-        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      quant_config=quant_config)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
@@ -346,7 +348,7 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index 8387c8e37..86ae32e0c 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -276,7 +276,7 @@ class BloomForCausalLM(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.transformer = BloomModel(config, cache_config, quant_config)
-        self.lm_head_weight = self.transformer.word_embeddings.weight
+        self.lm_head = self.transformer.word_embeddings
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
@@ -294,7 +294,7 @@ class BloomForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head_weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index e6012a6d4..553ddf904 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -303,7 +303,8 @@ class ChatGLMModel(nn.Module):
         self.encoder = GLMTransformer(config, cache_config, quant_config)
 
         self.output_layer = ParallelLMHead(config.padded_vocab_size,
-                                           config.hidden_size)
+                                           config.hidden_size,
+                                           quant_config=quant_config)
 
     def forward(
         self,
@@ -355,7 +356,7 @@ class ChatGLMForCausalLM(nn.Module, SupportsLoRA):
         self.max_position_embeddings = getattr(config, "max_sequence_length",
                                                8192)
         self.transformer = ChatGLMModel(config, cache_config, quant_config)
-        self.lm_head_weight = self.transformer.output_layer.weight
+        self.lm_head = self.transformer.output_layer
         self.logits_processor = LogitsProcessor(config.padded_vocab_size)
         self.sampler = Sampler()
 
@@ -373,7 +374,7 @@ class ChatGLMForCausalLM(nn.Module, SupportsLoRA):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head_weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 2961f421e..5f6e3a134 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -363,12 +363,12 @@ class CohereForCausalLM(nn.Module):
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
         is_not_lora = hasattr(self.model.embed_tokens, 'weight')
         if is_not_lora:
-            embedding_weights = self.model.embed_tokens.weight
+            logits = self.logits_processor(self.model.embed_tokens,
+                                           hidden_states, sampling_metadata)
         else:
-            embedding_weights = self.model.embed_tokens.base_layer.weight
+            logits = self.logits_processor(self.model.embed_tokens.base_layer,
+                                           hidden_states, sampling_metadata)
 
-        logits = self.logits_processor(embedding_weights, hidden_states,
-                                       sampling_metadata)
         return logits
 
     def sample(
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index 210cf6165..d758333b2 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -370,6 +370,7 @@ class DbrxForCausalLM(nn.Module):
             config.d_model,
             org_num_embeddings=config.vocab_size,
             padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+            quant_config=quant_config,
         )
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size)
@@ -389,7 +390,7 @@ class DbrxForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py
index e9ceca9b1..3fd6f2218 100644
--- a/vllm/model_executor/models/deepseek.py
+++ b/vllm/model_executor/models/deepseek.py
@@ -377,7 +377,9 @@ class DeepseekForCausalLM(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.model = DeepseekModel(config, cache_config, quant_config)
-        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      quant_config=quant_config)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
@@ -395,7 +397,7 @@ class DeepseekForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 3cf62afd9..fb4097fd1 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -465,7 +465,9 @@ class DeepseekV2ForCausalLM(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.model = DeepseekV2Model(config, cache_config, quant_config)
-        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      quant_config=quant_config)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
@@ -483,7 +485,7 @@ class DeepseekV2ForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 89b0bbf01..93f07327e 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -394,13 +394,13 @@ class FalconForCausalLM(nn.Module):
                                     if config.tie_word_embeddings is not None
                                     else True)
         if self.tie_word_embeddings:
-            self.lm_head_weight = self.transformer.word_embeddings.weight
+            self.lm_head = self.transformer.word_embeddings
         else:
             self.lm_head = ParallelLMHead(
                 config.vocab_size,
                 config.hidden_size,
+                quant_config=quant_config,
             )
-            self.lm_head_weight = self.lm_head.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
@@ -422,7 +422,7 @@ class FalconForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head_weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 0a5a7ed3d..b603a5911 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -347,8 +347,8 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.model.embed_tokens.weight,
-                                       hidden_states, sampling_metadata)
+        logits = self.logits_processor(self.model.embed_tokens, hidden_states,
+                                       sampling_metadata)
         return logits
 
     def sample(
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index 1f921c8bd..8fedff625 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -346,8 +346,8 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.model.embed_tokens.weight,
-                                       hidden_states, sampling_metadata)
+        logits = self.logits_processor(self.model.embed_tokens, hidden_states,
+                                       sampling_metadata)
         return logits
 
     def sample(
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index 55f2e2741..be19f4ba8 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -238,7 +238,7 @@ class GPT2LMHeadModel(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.transformer = GPT2Model(config, cache_config, quant_config)
-        self.lm_head_weight = self.transformer.wte.weight
+        self.lm_head = self.transformer.wte
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
@@ -256,7 +256,7 @@ class GPT2LMHeadModel(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head_weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index 7d0bf39c5..cc42413d5 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -259,7 +259,7 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA):
         self.quant_config = quant_config
         self.transformer = GPTBigCodeModel(config, cache_config, quant_config,
                                            lora_config)
-        self.lm_head_weight = self.transformer.wte.weight
+        self.lm_head = self.transformer.wte
         self.unpadded_vocab_size = config.vocab_size
         if lora_config:
             self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
@@ -281,7 +281,7 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head_weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index de7f86af7..4bb9debe7 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -229,6 +229,7 @@ class GPTJForCausalLM(nn.Module):
             config.vocab_size,
             config.n_embd,
             bias=True,
+            quant_config=quant_config,
         )
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
@@ -247,7 +248,7 @@ class GPTJForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata, self.lm_head.bias)
         return logits
 
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index 3658b8fbf..b306574b2 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -241,6 +241,7 @@ class GPTNeoXForCausalLM(nn.Module):
         self.embed_out = ParallelLMHead(
             config.vocab_size,
             config.hidden_size,
+            quant_config=quant_config,
         )
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
@@ -259,7 +260,7 @@ class GPTNeoXForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.embed_out.weight, hidden_states,
+        logits = self.logits_processor(self.embed_out, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 283bc064b..22132f40f 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -253,7 +253,9 @@ class InternLM2ForCausalLM(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.model = InternLM2Model(config, cache_config, quant_config)
-        self.output = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.output = ParallelLMHead(config.vocab_size,
+                                     config.hidden_size,
+                                     quant_config=quant_config)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
@@ -271,7 +273,7 @@ class InternLM2ForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.output.weight, hidden_states,
+        logits = self.logits_processor(self.output, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py
index 2758e2d0b..0030c761d 100644
--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -273,7 +273,7 @@ class JAISLMHeadModel(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.transformer = JAISModel(config, cache_config, quant_config)
-        self.lm_head_weight = self.transformer.wte.weight
+        self.lm_head = self.transformer.wte
         if hasattr(config, "width_scale"):
             self.output_logits_scale = config.width_scale
         else:
@@ -297,7 +297,7 @@ class JAISLMHeadModel(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head_weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index af75b6bee..77edcd740 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -380,6 +380,7 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
             # We need bigger padding if using lora for kernel
             # compatibility
             if not lora_config else lora_config.lora_vocab_padding_size,
+            quant_config=quant_config,
         )
         if config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
@@ -403,7 +404,7 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 39c47dddf..bbec4dbd8 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -125,7 +125,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
         self.lm_head = ParallelLMHead(
             self.unpadded_vocab_size,
             config.text_config.hidden_size,
-            org_num_embeddings=self.language_model.org_vocab_size)
+            org_num_embeddings=self.language_model.org_vocab_size,
+            quant_config=quant_config)
         logit_scale = getattr(config, "logit_scale", 1.0)
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size, logit_scale)
@@ -255,7 +256,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 8b078391b..f67598c40 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -186,7 +186,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
         self.lm_head = ParallelLMHead(
             self.unpadded_vocab_size,
             config.text_config.hidden_size,
-            org_num_embeddings=self.language_model.org_vocab_size)
+            org_num_embeddings=self.language_model.org_vocab_size,
+            quant_config=quant_config)
         logit_scale = getattr(config, "logit_scale", 1.0)
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size, logit_scale)
@@ -438,7 +439,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 330204327..4ccf1cf0f 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -449,6 +449,7 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA):
                 # We need bigger padding if using lora for kernel
                 # compatibility
                 if not lora_config else lora_config.lora_vocab_padding_size,
+                quant_config=quant_config,
             )
         self.scale_width = self.config.hidden_size / self.config.dim_model_base
 
@@ -472,10 +473,10 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA):
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
         hidden_states = hidden_states / self.scale_width
         if self.config.tie_word_embeddings:
-            lm_head_weight = self.model.embed_tokens.weight
+            lm_head = self.model.embed_tokens
         else:
-            lm_head_weight = self.lm_head.weight
-        logits = self.logits_processor(lm_head_weight, hidden_states,
+            lm_head = self.lm_head
+        logits = self.logits_processor(lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 5144e7ea4..7f5e3b969 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -331,6 +331,7 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA):
             # We need bigger padding if using lora for kernel
             # compatibility
             if not lora_config else lora_config.lora_vocab_padding_size,
+            quant_config=quant_config,
         )
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size)
@@ -350,7 +351,7 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py
index dde2da20b..10faa5cc6 100644
--- a/vllm/model_executor/models/mixtral_quant.py
+++ b/vllm/model_executor/models/mixtral_quant.py
@@ -344,7 +344,9 @@ class MixtralForCausalLM(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.model = MixtralModel(config, cache_config, quant_config)
-        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      quant_config=quant_config)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
@@ -362,7 +364,7 @@ class MixtralForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py
index 290a703af..97f7ec742 100644
--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -8,7 +8,7 @@ from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import SamplerOutput
 from vllm.transformers_utils.configs import MLPSpeculatorConfig
@@ -87,7 +87,7 @@ class MLPSpeculator(nn.Module):
             self.proj = nn.ModuleList([proj_first] + [proj_tied] *
                                       (self.max_speculative_tokens - 1))
 
-            head = nn.Linear(self.inner_dim, self.vocab_size, bias=False)
+            head = ParallelLMHead(self.vocab_size, self.inner_dim, bias=False)
             self.head = nn.ModuleList([head] * self.max_speculative_tokens)
 
             ln = MLPSpeculatorLayerNorm(self.inner_dim,
@@ -169,8 +169,8 @@ class MLPSpeculator(nn.Module):
             # TODO: not yet supporting top_k_tokens_per_head
             previous_hidden_states = states
 
-            logits = self.logits_processor(self.head[head_index].weight,
-                                           states, sampling_metadata)
+            logits = self.logits_processor(self.head[head_index], states,
+                                           sampling_metadata)
 
             output = self.sampler(logits.flatten(0, 1), sampling_metadata)
             last_tokens = output.sampled_token_ids
diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py
index 28dc5922c..7d658b39e 100644
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -263,7 +263,7 @@ class MPTForCausalLM(nn.Module):
         self.quant_config = quant_config
 
         self.transformer = MPTModel(config, cache_config, quant_config)
-        self.lm_head_weight = self.transformer.wte.weight
+        self.lm_head = self.transformer.wte
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
@@ -281,7 +281,7 @@ class MPTForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head_weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index 53215f32b..408c0c883 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -283,15 +283,15 @@ class OlmoForCausalLM(nn.Module):
         self.config = config
         self.model = OlmoModel(config, cache_config, quant_config)
         if config.tie_word_embeddings:
-            self.lm_head_weight = self.model.embed_tokens.weight
+            self.lm_head = self.model.embed_tokens
         else:
             self.unpadded_vocab_size = config.vocab_size
             self.lm_head = ParallelLMHead(
                 self.unpadded_vocab_size,
                 config.hidden_size,
                 org_num_embeddings=config.vocab_size,
+                quant_config=quant_config,
             )
-            self.lm_head_weight = self.lm_head.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
@@ -313,7 +313,7 @@ class OlmoForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head_weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index d12a51af5..edc16710c 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -294,7 +294,7 @@ class OPTForCausalLM(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.model = OPTModel(config, cache_config, quant_config)
-        self.lm_head_weight = self.model.decoder.embed_tokens.weight
+        self.lm_head = self.model.decoder.embed_tokens
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
@@ -312,7 +312,7 @@ class OPTForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head_weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py
index a298f0307..8159cc13f 100644
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -259,7 +259,9 @@ class OrionForCausalLM(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.model = OrionModel(config, cache_config, quant_config)
-        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      quant_config=quant_config)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
@@ -277,7 +279,7 @@ class OrionForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index cc8e31fe1..ac7496f68 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -268,7 +268,8 @@ class PhiForCausalLM(nn.Module, SupportsLoRA):
 
         self.lm_head = ParallelLMHead(config.vocab_size,
                                       config.hidden_size,
-                                      bias=True)
+                                      bias=True,
+                                      quant_config=quant_config)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
@@ -287,7 +288,7 @@ class PhiForCausalLM(nn.Module, SupportsLoRA):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata, self.lm_head.bias)
         return logits
 
diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py
index 706ae6520..cc06929fe 100644
--- a/vllm/model_executor/models/phi3_small.py
+++ b/vllm/model_executor/models/phi3_small.py
@@ -366,6 +366,7 @@ class Phi3SmallForCausalLM(nn.Module):
             config.hidden_size,
             org_num_embeddings=config.vocab_size,
             padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+            quant_config=quant_config,
         )
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
@@ -400,7 +401,7 @@ class Phi3SmallForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         if self.dummy_token_indices is not None and logits is not None:
             logits.index_fill_(-1, self.dummy_token_indices, -torch.inf)
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index eff4e5029..d73a42026 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -365,7 +365,9 @@ class Phi3VForCausalLM(nn.Module, SupportsVision):
         self.model = LlamaModel(config, cache_config, quant_config)
         self.vision_embed_tokens = Phi3HDImageEmbedding(
             vlm_config, config, self.model.embed_tokens)
-        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      quant_config=quant_config)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
@@ -409,7 +411,7 @@ class Phi3VForCausalLM(nn.Module, SupportsVision):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 408c206c5..47c85c783 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -235,7 +235,9 @@ class QWenLMHeadModel(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.transformer = QWenModel(config, cache_config, quant_config)
-        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      quant_config=quant_config)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
@@ -253,7 +255,7 @@ class QWenLMHeadModel(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 3691a3d2e..e9ae2192f 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -316,11 +316,11 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA):
         self.model = Qwen2Model(config, cache_config, quant_config)
 
         if config.tie_word_embeddings:
-            self.lm_head_weight = self.model.embed_tokens.weight
+            self.lm_head = self.model.embed_tokens
         else:
             self.lm_head = ParallelLMHead(config.vocab_size,
-                                          config.hidden_size)
-            self.lm_head_weight = self.lm_head.weight
+                                          config.hidden_size,
+                                          quant_config=quant_config)
 
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
@@ -339,7 +339,7 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head_weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index 8decb4464..ccaa6f208 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -362,7 +362,9 @@ class Qwen2MoeForCausalLM(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.model = Qwen2MoeModel(config, cache_config, quant_config)
-        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      quant_config=quant_config)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
@@ -380,7 +382,7 @@ class Qwen2MoeForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
index 1098b3031..5451b56ed 100644
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -240,7 +240,9 @@ class StablelmForCausalLM(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.model = StableLMEpochModel(config, cache_config, quant_config)
-        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      quant_config=quant_config)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
@@ -258,7 +260,7 @@ class StablelmForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py
index 6f3d5d51d..1752bfd47 100644
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -242,7 +242,7 @@ class Starcoder2ForCausalLM(nn.Module):
         self.vocab_size = config.vocab_size
         self.unpadded_vocab_size = config.vocab_size
         if config.tie_word_embeddings:
-            self.lm_head_weight = self.model.embed_tokens.weight
+            self.lm_head = self.model.embed_tokens
         else:
             self.unpadded_vocab_size = config.vocab_size
             self.lm_head = ParallelLMHead(
@@ -250,8 +250,8 @@ class Starcoder2ForCausalLM(nn.Module):
                 config.hidden_size,
                 org_num_embeddings=config.vocab_size,
                 padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+                quant_config=quant_config,
             )
-            self.lm_head_weight = self.lm_head.weight
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size)
         self.sampler = Sampler()
@@ -270,7 +270,7 @@ class Starcoder2ForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head_weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py
index 08d3efd33..84f0ffc37 100644
--- a/vllm/model_executor/models/xverse.py
+++ b/vllm/model_executor/models/xverse.py
@@ -310,7 +310,9 @@ class XverseForCausalLM(nn.Module, SupportsLoRA):
 
         self.quant_config = quant_config
         self.model = XverseModel(config, cache_config, quant_config)
-        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      quant_config=quant_config)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
@@ -328,7 +330,7 @@ class XverseForCausalLM(nn.Module, SupportsLoRA):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
-- 
GitLab


From 9d6a8daa87e2e0af3ff45d03d08ad5a94ec089a8 Mon Sep 17 00:00:00 2001
From: Mor Zusman <mor.zusmann@gmail.com>
Date: Wed, 3 Jul 2024 02:11:29 +0300
Subject: [PATCH 243/376] [Model] Jamba support (#4115)

Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Co-authored-by: Erez Schwartz <erezs@ai21.com>
Co-authored-by: Mor Zusman <morz@ai21.com>
Co-authored-by: tomeras91 <57313761+tomeras91@users.noreply.github.com>
Co-authored-by: Tomer Asida <tomera@ai21.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Co-authored-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
---
 .buildkite/run-cpu-test.sh              |   2 +-
 Dockerfile                              |  23 +
 docs/source/models/supported_models.rst |   4 +
 requirements-mamba.txt                  |   3 +
 tests/models/test_jamba.py              |  65 ++
 vllm/config.py                          |  29 +-
 vllm/core/scheduler.py                  |  16 +-
 vllm/engine/async_llm_engine.py         |   4 +-
 vllm/engine/llm_engine.py               |   4 +-
 vllm/model_executor/models/__init__.py  |   1 +
 vllm/model_executor/models/jamba.py     | 955 ++++++++++++++++++++++++
 vllm/sequence.py                        |   4 +-
 vllm/spec_decode/draft_model_runner.py  |  12 +-
 vllm/worker/cache_engine.py             |  15 +-
 vllm/worker/cpu_model_runner.py         |   7 +-
 vllm/worker/embedding_model_runner.py   |   3 +-
 vllm/worker/model_runner.py             |  67 +-
 vllm/worker/model_runner_base.py        |   1 +
 vllm/worker/neuron_model_runner.py      |   1 +
 vllm/worker/worker_base.py              |   3 +-
 vllm/worker/xpu_model_runner.py         |   7 +-
 21 files changed, 1192 insertions(+), 34 deletions(-)
 create mode 100644 requirements-mamba.txt
 create mode 100644 tests/models/test_jamba.py
 create mode 100644 vllm/model_executor/models/jamba.py

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index f4fa24be1..9d4b2bb1c 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -23,4 +23,4 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 docker exec cpu-test bash -c "cd tests;
   pip install pytest Pillow protobuf
   cd ../
-  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
+  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported
diff --git a/Dockerfile b/Dockerfile
index d031d98c5..f571e8be4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -43,6 +43,10 @@ COPY requirements-cuda.txt requirements-cuda.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-cuda.txt
 
+COPY requirements-mamba.txt requirements-mamba.txt
+RUN python3 -m pip install packaging
+RUN python3 -m pip install -r requirements-mamba.txt
+
 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
 # explicitly set the list to avoid issues with torch 2.2
@@ -123,6 +127,21 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-dev.txt
 
 #################### DEV IMAGE ####################
+#################### MAMBA Build IMAGE ####################
+FROM dev as mamba-builder
+# max jobs used for build
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+
+WORKDIR /usr/src/mamba
+
+COPY requirements-mamba.txt requirements-mamba.txt
+
+# Download the wheel or build it if a pre-compiled release doesn't exist
+RUN pip --verbose wheel -r requirements-mamba.txt \
+    --no-build-isolation --no-deps --no-cache-dir
+
+#################### MAMBA Build IMAGE ####################
 
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
@@ -143,6 +162,10 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install dist/*.whl --verbose
+
+RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamba \
+    --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir
 #################### vLLM installation IMAGE ####################
 
 
diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 544322582..0283f36ea 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -87,6 +87,10 @@ Alongside each architecture, we include some popular models that use it.
     - Jais
     - :code:`core42/jais-13b`, :code:`core42/jais-13b-chat`, :code:`core42/jais-30b-v3`, :code:`core42/jais-30b-chat-v3`, etc.
     -
+  * - :code:`JambaForCausalLM`
+    - Jamba
+    - :code:`ai21labs/Jamba-v0.1`, etc.
+    - ✅︎
   * - :code:`LlamaForCausalLM`
     - LLaMA, Llama 2, Meta Llama 3, Vicuna, Alpaca, Yi
     - :code:`meta-llama/Meta-Llama-3-8B-Instruct`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
diff --git a/requirements-mamba.txt b/requirements-mamba.txt
new file mode 100644
index 000000000..1838e87d0
--- /dev/null
+++ b/requirements-mamba.txt
@@ -0,0 +1,3 @@
+# Mamba dependencies
+mamba-ssm>=1.2.2
+causal-conv1d>=1.2.0
diff --git a/tests/models/test_jamba.py b/tests/models/test_jamba.py
new file mode 100644
index 000000000..d7e3a2fc4
--- /dev/null
+++ b/tests/models/test_jamba.py
@@ -0,0 +1,65 @@
+import pytest
+
+MODELS = ["ai21labs/Jamba-tiny-random"]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [20])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    # To pass the small model tests, we need full precision.
+    assert dtype == "float"
+
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    for i in range(len(example_prompts)):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        vllm_output_ids, vllm_output_str = vllm_outputs[i]
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+def test_state_cleanup(
+    vllm_runner,
+    model: str,
+    dtype: str,
+    example_prompts,
+) -> None:
+    # This test is for verifying that the Jamba state is cleaned up between
+    # steps, If its not cleaned, an error would be expected.
+    try:
+        with vllm_runner(model, dtype=dtype) as vllm_model:
+            for _ in range(10):
+                vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
+    except ValueError:
+        pytest.fail("Jamba inner state wasn't cleaned up between states, "
+                    "could be related to finished_requests_ids")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+def test_model_print(
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        # This test is for verifying whether the model's extra_repr
+        # can be printed correctly.
+        print(vllm_model.model.llm_engine.model_executor.driver_worker.
+              model_runner.model)
diff --git a/vllm/config.py b/vllm/config.py
index 9a7e0ea7a..8c449323f 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -386,9 +386,36 @@ class ModelConfig:
         return num_heads // parallel_config.tensor_parallel_size
 
     def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
-        total_num_hidden_layers = self.hf_text_config.num_hidden_layers
+        total_num_hidden_layers = getattr(self.hf_text_config,
+                                          "num_hidden_layers", 0)
         return total_num_hidden_layers // parallel_config.pipeline_parallel_size
 
+    def contains_seqlen_agnostic_layers(
+            self, parallel_config: "ParallelConfig") -> bool:
+        """True for Mamba/SSM models (Jamba)"""
+        return self._get_num_seqlen_agnostic_layers(parallel_config) > 0
+
+    def get_layers_block_type(self,
+                              parallel_config: "ParallelConfig") -> List[str]:
+        num_layers = self.get_num_layers(parallel_config)
+        # Transformers supports layers_block_type @property
+        return getattr(self.hf_config, "layers_block_type",
+                       ["attention"] * num_layers)
+
+    def get_num_attention_layers(self,
+                                 parallel_config: "ParallelConfig") -> int:
+        return len([
+            t for t in self.get_layers_block_type(parallel_config)
+            if t == "attention"
+        ])
+
+    def _get_num_seqlen_agnostic_layers(
+            self, parallel_config: "ParallelConfig") -> int:
+        return len([
+            t for t in self.get_layers_block_type(parallel_config)
+            if t != "attention"
+        ])
+
 
 class CacheConfig:
     """Configuration for the KV cache.
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 5fb3b7814..9e626b288 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -299,7 +299,10 @@ class Scheduler:
         # Sequence groups in the SWAPPED state.
         # Contain decode requests that are swapped out.
         self.swapped: Deque[SequenceGroup] = deque()
-
+        # Sequence groups finished requests ids since last step iteration.
+        # It lets the model know that any state associated with these requests
+        # can and must be released after the current step.
+        self._finished_requests_ids: List[str] = list()
         # Time at previous scheduling step
         self.prev_time = 0.0
         # Did we schedule a prompt at previous step?
@@ -373,6 +376,12 @@ class Scheduler:
     def get_num_unfinished_seq_groups(self) -> int:
         return len(self.waiting) + len(self.running) + len(self.swapped)
 
+    def get_and_reset_finished_requests_ids(self) -> List[str]:
+        """Flushes the list of request ids of previously finished seq_groups."""
+        finished_requests_ids = self._finished_requests_ids
+        self._finished_requests_ids = list()
+        return finished_requests_ids
+
     def _schedule_running(
         self,
         running_queue: deque,
@@ -1036,6 +1045,11 @@ class Scheduler:
         self.block_manager.free(seq)
 
     def free_finished_seq_groups(self) -> None:
+        for queue in [self.running, self.swapped, self.waiting]:
+            self._finished_requests_ids += [
+                seq_group.request_id for seq_group in queue
+                if seq_group.is_finished()
+            ]
         self.running = deque(seq_group for seq_group in self.running
                              if not seq_group.is_finished())
 
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 0ce511ce4..13b4635cb 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -224,6 +224,8 @@ class _AsyncLLMEngine(LLMEngine):
         """
         seq_group_metadata_list, scheduler_outputs = self.scheduler[
             virtual_engine].schedule()
+        finished_requests_ids = self.scheduler[
+            virtual_engine].get_and_reset_finished_requests_ids()
 
         if not scheduler_outputs.is_empty():
             # Execute the model.
@@ -235,7 +237,7 @@ class _AsyncLLMEngine(LLMEngine):
                 virtual_engine=virtual_engine,
                 num_lookahead_slots=scheduler_outputs.num_lookahead_slots,
                 running_queue_size=scheduler_outputs.running_queue_size,
-            )
+                finished_requests_ids=finished_requests_ids)
             output = await self.model_executor.execute_model_async(
                 execute_model_req)
         else:
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index a79057005..a7428d010 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -846,6 +846,8 @@ class LLMEngine:
                 "as performance will be severely degraded otherwise.")
         seq_group_metadata_list, scheduler_outputs = self.scheduler[
             0].schedule()
+        finished_requests_ids = self.scheduler[
+            0].get_and_reset_finished_requests_ids()
 
         if not scheduler_outputs.is_empty():
             execute_model_req = ExecuteModelRequest(
@@ -855,7 +857,7 @@ class LLMEngine:
                 blocks_to_copy=scheduler_outputs.blocks_to_copy,
                 num_lookahead_slots=scheduler_outputs.num_lookahead_slots,
                 running_queue_size=scheduler_outputs.running_queue_size,
-            )
+                finished_requests_ids=finished_requests_ids)
             output = self.model_executor.execute_model(
                 execute_model_req=execute_model_req)
         else:
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 69a65ff02..a4fe18d52 100644
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -63,6 +63,7 @@ _GENERATION_MODELS = {
     "XverseForCausalLM": ("xverse", "XverseForCausalLM"),
     "Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"),
     "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
+    "JambaForCausalLM": ("jamba", "JambaForCausalLM")
 }
 
 _EMBEDDING_MODELS = {
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
new file mode 100644
index 000000000..c485d3779
--- /dev/null
+++ b/vllm/model_executor/models/jamba.py
@@ -0,0 +1,955 @@
+# coding=utf-8
+"""Inference-only Jurassic model."""
+from dataclasses import dataclass
+from typing import Dict, Iterable, List, Optional, Tuple
+
+import torch
+from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+from mamba_ssm.ops.selective_scan_interface import selective_scan_fn
+from mamba_ssm.ops.triton.selective_state_update import selective_state_update
+from torch import nn
+from torch.nn.parameter import Parameter
+from transformers import JambaConfig
+
+from vllm.attention.backends.abstract import AttentionMetadata
+from vllm.attention.layer import Attention
+from vllm.config import CacheConfig, LoRAConfig
+from vllm.distributed import (get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size,
+                              tensor_model_parallel_all_reduce)
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe import fused_moe
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               MergedColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               ReplicatedLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.model_executor.utils import set_weight_attrs
+from vllm.sequence import IntermediateTensors, SamplerOutput
+from vllm.worker.model_runner import _BATCH_SIZES_TO_CAPTURE
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+@dataclass
+class MambaCacheParams:
+    is_prompt: bool = False
+    conv_state: torch.Tensor = torch.Tensor()
+    ssm_state: torch.Tensor = torch.Tensor()
+
+
+# Adapted from transformers.models.mamba.modeling_mamba.MambaMixer
+class JambaMambaMixer(nn.Module):
+    """
+    Compute ∆, A, B, C, and D the state space parameters and compute
+    the `contextualized_states`. A, D are input independent
+    (see Mamba paper [1] Section 3.5.2 "Interpretation of A"
+    for why A isn't selective) ∆, B, C are input-dependent
+    (this is a key difference between Mamba and the linear time
+    invariant S4, and is why Mamba is called
+    **selective** state spaces)
+    """
+
+    def __init__(self, config: JambaConfig, layer_idx):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.hidden_size = config.hidden_size
+        self.ssm_state_size = config.mamba_d_state
+        self.conv_kernel_size = config.mamba_d_conv
+        self.intermediate_size = config.mamba_expand * config.hidden_size
+        self.time_step_rank = config.mamba_dt_rank
+        self.use_conv_bias = config.mamba_conv_bias
+        self.use_bias = config.mamba_proj_bias
+        self.conv1d = ColumnParallelLinear(
+            input_size=self.conv_kernel_size,
+            output_size=self.intermediate_size,
+            bias=self.use_conv_bias,
+        )
+        # unsqueeze to fit conv1d weights shape into the linear weights shape.
+        # Can't do this in `weight_loader` since it already exists in
+        # `ColumnParallelLinear` and `set_weight_attrs`
+        # doesn't allow to override it
+        self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1)
+
+        self.in_proj = MergedColumnParallelLinear(self.hidden_size,
+                                                  [self.intermediate_size] * 2,
+                                                  bias=self.use_bias)
+        # selective projection used to make dt, B and C input dependent
+        self.x_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.time_step_rank + self.ssm_state_size * 2,
+            bias=False,
+        )
+        # time step projection (discretization) -
+        # In the forward we need to apply dt_proj without the bias,
+        # as the bias is added in the selective scan kernel.
+        self.dt_proj = ColumnParallelLinear(self.time_step_rank,
+                                            self.intermediate_size,
+                                            bias=True,
+                                            skip_bias_add=True)
+
+        def weight_loader(param: Parameter, loaded_weight: torch.Tensor):
+            tp_rank = get_tensor_model_parallel_rank()
+            tp_size = get_tensor_model_parallel_world_size()
+            param.data.copy_(
+                loaded_weight.data.split(loaded_weight.shape[0] // tp_size,
+                                         dim=0)[tp_rank])
+
+        def A_weight_loader(param: Parameter, loaded_weight: torch.Tensor):
+            weight_loader(param, -torch.exp(loaded_weight.float()))
+
+        tp_size = get_tensor_model_parallel_world_size()
+        self.A = nn.Parameter(
+            torch.empty(
+                self.intermediate_size // tp_size,
+                self.ssm_state_size,
+                dtype=torch.float32,
+            ))
+        self.D = nn.Parameter(torch.ones(self.intermediate_size // tp_size))
+
+        set_weight_attrs(self.D, {"weight_loader": weight_loader})
+        set_weight_attrs(self.A, {"weight_loader": A_weight_loader})
+
+        self.out_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=self.use_bias,
+            input_is_parallel=True,
+        )
+        self.activation = config.hidden_act
+
+        self.dt_layernorm = RMSNorm(self.time_step_rank,
+                                    eps=config.rms_norm_eps)
+        self.b_layernorm = RMSNorm(self.ssm_state_size,
+                                   eps=config.rms_norm_eps)
+        self.c_layernorm = RMSNorm(self.ssm_state_size,
+                                   eps=config.rms_norm_eps)
+
+    def mamba_forward(self,
+                      hidden_states: torch.Tensor,
+                      cache_params: MambaCacheParams = None):
+        # 1. Gated MLP's linear projection
+        projected_states = self.in_proj(hidden_states)[0].transpose(1, 2)
+        hidden_states, gate = projected_states.chunk(2, dim=1)
+
+        # 2. Convolution sequence transformation
+        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0),
+                                               self.conv1d.weight.size(2))
+        if cache_params is not None and not cache_params.is_prompt:
+            hidden_states = causal_conv1d_update(
+                hidden_states.squeeze(-1),
+                cache_params.conv_state,
+                conv_weights,
+                self.conv1d.bias,
+                self.activation,
+            )
+            hidden_states = hidden_states.unsqueeze(-1)
+        else:
+            if cache_params is not None:
+                conv_states = nn.functional.pad(
+                    hidden_states,
+                    (self.conv_kernel_size - hidden_states.shape[-1], 0))
+                cache_params.conv_state.copy_(conv_states)
+
+            hidden_states = causal_conv1d_fn(
+                hidden_states,
+                conv_weights,
+                self.conv1d.bias,
+                activation=self.activation,
+            )
+
+        # 3. State Space Model sequence transformation
+        # 3.a. input varying initialization of time_step, B and C
+        ssm_parameters = self.x_proj(hidden_states.transpose(1, 2))[0]
+
+        time_step, B, C = torch.split(
+            ssm_parameters,
+            [self.time_step_rank, self.ssm_state_size, self.ssm_state_size],
+            dim=-1,
+        )
+        time_step = self.dt_layernorm(time_step.contiguous())
+        B = self.b_layernorm(B.contiguous())
+        C = self.c_layernorm(C.contiguous())
+
+        discrete_time_step = self.dt_proj(time_step)[0].transpose(1, 2)
+        # 3.c perform the recurrence y ← SSM(A, B, C)(x)
+        time_proj_bias = (self.dt_proj.bias.float() if hasattr(
+            self.dt_proj, "bias") else None)
+        if cache_params is not None and not cache_params.is_prompt:
+            scan_outputs = selective_state_update(
+                cache_params.ssm_state,
+                hidden_states[..., 0],
+                discrete_time_step[..., 0],
+                self.A,
+                B[:, 0],
+                C[:, 0],
+                self.D,
+                gate[..., 0],
+                time_proj_bias,
+                dt_softplus=True,
+            ).unsqueeze(-1)
+        else:
+            scan_outputs, ssm_state = selective_scan_fn(
+                hidden_states,
+                discrete_time_step,
+                self.A,
+                B.transpose(1, 2),
+                C.transpose(1, 2),
+                self.D.float(),
+                gate,
+                time_proj_bias,
+                delta_softplus=True,
+                return_last_state=True,
+            )
+            if ssm_state is not None and cache_params is not None:
+                cache_params.ssm_state.copy_(ssm_state)
+
+        # 4. Final linear projection
+        contextualized_states = self.out_proj(scan_outputs.transpose(1, 2))[0]
+        return contextualized_states
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+        conv_state: torch.Tensor,
+        ssm_state: torch.Tensor,
+    ):
+        if attn_metadata.prefill_metadata is not None:
+            offset = 0
+            for i, prompt_len in enumerate(
+                    attn_metadata.prefill_metadata.seq_lens):
+                cache = MambaCacheParams(True,
+                                         conv_state=conv_state[i].unsqueeze(0),
+                                         ssm_state=ssm_state[i].unsqueeze(0))
+                hidden_states[offset:offset + prompt_len].copy_(
+                    self.mamba_forward(hidden_states[offset:offset +
+                                                     prompt_len].unsqueeze(0),
+                                       cache_params=cache)[0])
+                offset += prompt_len
+        else:
+            cache = MambaCacheParams(False,
+                                     conv_state=conv_state,
+                                     ssm_state=ssm_state)
+            hidden_states = self.mamba_forward(hidden_states.unsqueeze(1),
+                                               cache_params=cache)
+            hidden_states = hidden_states.squeeze(1)
+
+        return hidden_states
+
+
+class JambaMLP(nn.Module):
+
+    def __init__(
+        self,
+        config: JambaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        hidden_size = config.hidden_size
+        intermediate_size = config.intermediate_size
+        hidden_act = config.hidden_act
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size, [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config)
+        self.down_proj = RowParallelLinear(intermediate_size,
+                                           hidden_size,
+                                           bias=False,
+                                           quant_config=quant_config)
+        if hidden_act != "silu":
+            raise ValueError(f"Unsupported activation: {hidden_act}. "
+                             "Only silu is supported for now.")
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class JambaMoE(nn.Module):
+    """A tensor-parallel MoE implementation for Mixtral that shards each expert
+    across all ranks.
+
+    Each expert's weights are sharded across all ranks and a fused MoE
+    kernel is used for the forward pass, and finally we reduce the outputs
+    across ranks.
+    """
+
+    def __init__(
+        self,
+        config: JambaConfig,
+        params_dtype: Optional[torch.dtype] = None,
+        tp_size: Optional[int] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.tp_size = tp_size or get_tensor_model_parallel_world_size()
+        self.num_total_experts = config.num_experts
+        self.top_k = config.num_experts_per_tok
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size // self.tp_size
+
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.params_dtype = params_dtype
+
+        self.router = ReplicatedLinear(self.hidden_size,
+                                       self.num_total_experts,
+                                       bias=False,
+                                       params_dtype=self.params_dtype)
+
+        self.ws = nn.Parameter(
+            torch.empty(
+                self.num_total_experts,
+                2 * self.intermediate_size,
+                self.hidden_size,
+                device="cuda",
+                dtype=self.params_dtype,
+            ))
+        self.w2s = nn.Parameter(
+            torch.empty(
+                self.num_total_experts,
+                self.hidden_size,
+                self.intermediate_size,
+                device="cuda",
+                dtype=self.params_dtype,
+            ))
+
+        set_weight_attrs(
+            self.ws,
+            {
+                "weight_loader": self.weight_loader,
+            },
+        )
+        set_weight_attrs(
+            self.w2s,
+            {
+                "weight_loader": self.weight_loader,
+            },
+        )
+
+    def weight_loader(
+        self,
+        param: nn.Parameter,
+        loaded_weight: torch.Tensor,
+        weight_name: str,
+        expert_id: int,
+    ):
+        tp_rank = get_tensor_model_parallel_rank()
+        param_data = param.data
+        shard_size = self.intermediate_size
+        shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size)
+        if weight_name.endswith("gate_proj.weight"):
+            param_data[expert_id, 0:shard_size, :] = loaded_weight[shard, :]
+        if weight_name.endswith("up_proj.weight"):
+            param_data[expert_id,
+                       shard_size:2 * shard_size, :] = loaded_weight[shard, :]
+        if weight_name.endswith("down_proj.weight"):
+            param_data[expert_id, :, :] = loaded_weight[:, shard]
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_size = hidden_states.shape
+        hidden_states = hidden_states.view(-1, self.hidden_size)
+        # router_logits: (batch * sequence_length, n_experts)
+        router_logits, _ = self.router(hidden_states)
+
+        final_hidden_states = fused_moe(
+            hidden_states,
+            self.ws,
+            self.w2s,
+            router_logits,
+            self.top_k,
+            renormalize=
+            False,  # Mixtral normalize the expert probs to 1. We don't!
+            inplace=True,
+        )
+
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(
+                final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_size)
+
+
+class JambaMambaDecoderLayer(nn.Module):
+
+    def __init__(self,
+                 config: JambaConfig,
+                 layer_idx: int,
+                 cache_config: Optional[CacheConfig] = None,
+                 quant_config: Optional[QuantizationConfig] = None) -> None:
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.config = config
+        self.mamba = JambaMambaMixer(config, layer_idx)
+
+        num_experts = config.layers_num_experts[layer_idx]
+        ffn_layer_class = JambaMoE if num_experts > 1 else JambaMLP
+        self.feed_forward = ffn_layer_class(config, quant_config=quant_config)
+        self.input_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+        self.pre_ff_layernorm = RMSNorm(config.hidden_size,
+                                        eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+        residual: Optional[torch.Tensor],
+        conv_state: torch.Tensor,
+        ssm_state: torch.Tensor,
+        **kwargs,
+    ):
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(
+                hidden_states, residual)
+
+        hidden_states = self.mamba(hidden_states, attn_metadata, conv_state,
+                                   ssm_state)
+        # Fully Connected
+        hidden_states, residual = self.pre_ff_layernorm(
+            hidden_states, residual)
+        hidden_states = self.feed_forward(hidden_states)
+        return hidden_states, residual
+
+
+class JambaAttentionDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: JambaConfig,
+        layer_idx: int,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = config.hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+
+        self.qkv_proj = QKVParallelLinear(
+            config.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+        )
+        self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim,
+                                        config.hidden_size,
+                                        bias=False,
+                                        quant_config=quant_config)
+
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+        )
+
+        num_experts = config.layers_num_experts[layer_idx]
+        ffn_layer_class = JambaMoE if num_experts > 1 else JambaMLP
+        self.feed_forward = ffn_layer_class(config, quant_config=quant_config)
+        self.input_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+        self.pre_ff_layernorm = RMSNorm(config.hidden_size,
+                                        eps=config.rms_norm_eps)
+
+    def self_attention(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+        **kwargs,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+        residual: Optional[torch.Tensor],
+        **kwargs,
+    ):
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(
+                hidden_states, residual)
+
+        hidden_states = self.self_attention(
+            positions=positions,
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            attn_metadata=attn_metadata,
+        )
+        # Fully Connected
+        hidden_states, residual = self.pre_ff_layernorm(
+            hidden_states, residual)
+        hidden_states = self.feed_forward(hidden_states)
+        return hidden_states, residual
+
+
+ALL_DECODER_LAYER_TYPES = {
+    "attention": JambaAttentionDecoderLayer,
+    "mamba": JambaMambaDecoderLayer
+}
+
+
+class JambaModel(nn.Module):
+
+    def __init__(
+        self,
+        config: JambaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
+        lora_config: Optional[LoRAConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        lora_vocab = ((lora_config.lora_extra_vocab_size *
+                       (lora_config.max_loras or 1)) if lora_config else 0)
+        self.vocab_size = config.vocab_size + lora_vocab
+        self.org_vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            self.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+        )
+
+        decoder_layers = []
+        for i in range(config.num_hidden_layers):
+            layer_class = ALL_DECODER_LAYER_TYPES[config.layers_block_type[i]]
+            decoder_layers.append(
+                layer_class(config,
+                            layer_idx=i,
+                            cache_config=cache_config,
+                            quant_config=quant_config))
+        self.layers = nn.ModuleList(decoder_layers)
+        self.final_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        conv_state: torch.Tensor,
+        ssm_state: torch.Tensor,
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+        residual = None
+
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            kv_cache = None
+            current_ssm_state = None
+            current_conv_state = None
+            if isinstance(layer, JambaAttentionDecoderLayer):
+                kv_cache = kv_caches[(i - self.config.attn_layer_offset) //
+                                     self.config.attn_layer_period]
+            if isinstance(layer, JambaMambaDecoderLayer):
+                current_state_layer = i - (1 +
+                                           (i - self.config.attn_layer_offset)
+                                           // self.config.attn_layer_period)
+                current_ssm_state = ssm_state[current_state_layer]
+                current_conv_state = conv_state[current_state_layer]
+
+            hidden_states, residual = layer(
+                positions=positions,
+                hidden_states=hidden_states,
+                kv_cache=kv_cache,
+                attn_metadata=attn_metadata,
+                residual=residual,
+                conv_state=current_conv_state,
+                ssm_state=current_ssm_state,
+            )
+        hidden_states, _ = self.final_layernorm(hidden_states, residual)
+        return hidden_states
+
+
+class JambaForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "embed_tokens",
+        "lm_head",
+    ]
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
+
+    def __init__(
+        self,
+        config: JambaConfig,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        lora_config: Optional[LoRAConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.model = JambaModel(config,
+                                cache_config=cache_config,
+                                quant_config=quant_config,
+                                lora_config=lora_config)
+        self.unpadded_vocab_size = config.vocab_size
+        if lora_config:
+            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.lm_head = ParallelLMHead(
+            self.unpadded_vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE
+            # We need bigger padding if using lora for kernel
+            # compatibility
+            if not lora_config else lora_config.lora_vocab_padding_size,
+        )
+        # Current step used indices
+        self.current_indices: List[int] = []
+        # Used to track and store by the Mamba cache between steps.
+        self.mamba_cache: Tuple[torch.Tensor, torch.Tensor] = tuple()
+        # Used as an input_buffer for the CUDA graph runs.
+        self.mamba_gc_cache_buffer: Tuple[torch.Tensor, torch.Tensor] = tuple()
+        # Maps between the request id and a dict that maps between the seq_id
+        # and its index inside the self.mamba_cache
+        self.mamba_cache_indices_mapping: Dict[str, Dict[int, int]] = {}
+        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
+                                                config.vocab_size)
+        self.sampler = Sampler()
+
+    def forward(self,
+                input_ids: torch.Tensor,
+                positions: torch.Tensor,
+                kv_caches: List[KVCache],
+                attn_metadata: AttentionMetadata,
+                intermediate_tensors: Optional[IntermediateTensors] = None,
+                **kwargs):
+        if not self.mamba_cache:
+            self._prepare_mamba_cache()
+
+        if "seqlen_agnostic_capture_inputs" not in kwargs:
+            # We get here only on Prefill/Eager mode runs
+            assert all(
+                key in kwargs
+                for key in ["request_ids_to_seq_ids", "finished_requests_ids"])
+
+            request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"]
+            batch_size = input_ids.shape[0]
+            if attn_metadata.prefill_metadata:
+                batch_size = len(request_ids_to_seq_ids)
+            (
+                current_seqlen_agnostic_cache,
+                indices,
+            ) = self._prepare_current_run_mamba_cache(request_ids_to_seq_ids,
+                                                      batch_size)
+            finished_requests_ids = kwargs["finished_requests_ids"]
+            self._release_mamba_cache(finished_requests_ids)
+        else:
+            # CUDA graph capturing runs
+            current_seqlen_agnostic_cache, indices = (
+                kwargs["seqlen_agnostic_capture_inputs"],
+                [],
+            )
+        self.current_indices = indices
+
+        hidden_states = self.model(input_ids, positions, kv_caches,
+                                   attn_metadata,
+                                   current_seqlen_agnostic_cache[0],
+                                   current_seqlen_agnostic_cache[1])
+
+        if "seqlen_agnostic_capture_inputs" not in kwargs:
+            self._copy_mamba_cache_by_indices(self.current_indices,
+                                              current_seqlen_agnostic_cache)
+
+        return hidden_states
+
+    def _copy_mamba_cache_by_indices(
+            self, indices: List[int],
+            current_seqlen_agnostic_cache: Tuple[torch.Tensor, torch.Tensor]):
+        for i, offset in enumerate(indices):
+            self._copy_mamba_cache(offset, i, current_seqlen_agnostic_cache)
+
+    def _copy_mamba_cache(self, index_to: int, index_from: int,
+                          from_buffer: Tuple[torch.Tensor, torch.Tensor]):
+        assert len(self.mamba_cache) > 0
+        for (cache_t, from_buffer_t) in zip(self.mamba_cache, from_buffer):
+            cache_t[:, index_to].copy_(from_buffer_t[:, index_from],
+                                       non_blocking=True)
+
+    def _assign_seq_id_to_mamba_cache(self, cur_rid: str,
+                                      seqs_id: List[int]) -> List[int]:
+        indices_for_current_run = []
+        for seq_id in seqs_id:
+            if cur_rid not in self.mamba_cache_indices_mapping:
+                self.mamba_cache_indices_mapping[cur_rid] = {}
+                first_free_index = self._first_free_index_in_mamba_cache()
+                self.mamba_cache_indices_mapping[cur_rid][
+                    seq_id] = first_free_index
+                index_for_current_run = first_free_index
+            ## case of decoding n>1, copy prefill cache to decoding indices
+            elif seq_id not in (seq_ids2indices :=
+                                self.mamba_cache_indices_mapping[cur_rid]):
+                first_free_index = self._first_free_index_in_mamba_cache()
+                index_exist = list(seq_ids2indices.values())[0]
+                self._copy_mamba_cache(index_from=index_exist,
+                                       index_to=first_free_index,
+                                       from_buffer=self.mamba_cache)
+                self.mamba_cache_indices_mapping[cur_rid][
+                    seq_id] = first_free_index
+                index_for_current_run = first_free_index
+            else:
+                index_for_current_run = self.mamba_cache_indices_mapping[
+                    cur_rid][seq_id]
+
+            indices_for_current_run.append(index_for_current_run)
+        return indices_for_current_run
+
+    def _prepare_current_run_mamba_cache(
+        self, request_ids_to_seq_ids: Dict[str, list[int]], batch_size: int
+    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], List[int]]:
+        indices_for_current_run = []
+        for request_id, seqs_id in request_ids_to_seq_ids.items():
+            indices_for_current_run += self._assign_seq_id_to_mamba_cache(
+                request_id, seqs_id)
+        ## Pad the batch in case of running batch that was not captured via CG
+        padded_indices = indices_for_current_run.copy()
+        pad_index = self._first_free_index_in_mamba_cache()
+
+        for _ in range(batch_size - len(indices_for_current_run)):
+            padded_indices.append(pad_index)
+
+        conv_state = self.mamba_cache[0][:, padded_indices]
+        temporal_state = self.mamba_cache[1][:, padded_indices]
+
+        return (conv_state, temporal_state), indices_for_current_run
+
+    def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
+        """
+        Copy the relevant Mamba cache into the CUDA graph input buffer 
+        that was provided during the capture runs 
+        (JambaForCausalLM.mamba_gc_cache_buffer). 
+        """
+        assert all(
+            key in kwargs
+            for key in ["request_ids_to_seq_ids", "finished_requests_ids"])
+        request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"]
+        batch_size = len(request_ids_to_seq_ids)
+        (
+            current_mamba_cache,
+            indices,
+        ) = self._prepare_current_run_mamba_cache(request_ids_to_seq_ids,
+                                                  batch_size)
+        self.current_indices = indices
+        finished_requests_ids = kwargs["finished_requests_ids"]
+        self._release_mamba_cache(finished_requests_ids)
+
+        for input_buffer, current_cache_buffer in zip(
+                input_buffers["seqlen_agnostic_capture_inputs"],
+                current_mamba_cache):
+            input_buffer.copy_(current_cache_buffer, non_blocking=True)
+
+    def copy_outputs_after_cuda_graphs(self, input_buffers, **kwargs):
+        """
+        Copy the relevant Mamba cache from the CUDA graph input_buffers
+        back to the JambaForCausalLM.mamba_cache after CUDA 
+        graph replay run is done.
+        """
+        self._copy_mamba_cache_by_indices(
+            self.current_indices,
+            input_buffers["seqlen_agnostic_capture_inputs"])
+
+    def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
+        """
+        Provide the CUDA graph capture runs with a buffer in adjusted size.
+        The buffer is used to maintain the Mamba Cache during the CUDA graph 
+        replay runs.
+        """
+        return tuple(buffer[:, :batch_size]
+                     for buffer in self.mamba_gc_cache_buffer)
+
+    def _release_mamba_cache(self, finished_seq_groups_req_ids: List[str]):
+        for req_id in finished_seq_groups_req_ids:
+            if req_id in self.mamba_cache_indices_mapping:
+                self.mamba_cache_indices_mapping.pop(req_id)
+
+    def _first_free_index_in_mamba_cache(self) -> int:
+        if self.mamba_cache:
+            max_possible_batch_size = self.mamba_cache[0].shape[1]
+            occupied = [
+                id for seq_ids in self.mamba_cache_indices_mapping.values()
+                for id in seq_ids.values()
+            ]
+            first_free_index = [
+                i not in occupied for i in range(max_possible_batch_size)
+            ].index(True)
+            return first_free_index
+        return 0
+
+    def _get_mamba_cache_shape(
+            self
+    ) -> Tuple[Optional[Tuple[int, int]], Optional[Tuple[int, int]]]:
+        world_size = get_tensor_model_parallel_world_size()
+        hidden_size = self.config.hidden_size
+        conv_state_shape = (
+            self.config.mamba_expand * hidden_size // world_size,
+            self.config.mamba_d_conv,
+        )
+        temporal_state_shape = (
+            self.config.mamba_expand * self.config.hidden_size // world_size,
+            self.config.mamba_d_state,
+        )
+        return conv_state_shape, temporal_state_shape
+
+    def _prepare_mamba_cache(self):
+        dtype = self.lm_head.weight.dtype
+        layers_type = self.config.layers_block_type
+        mamba_layers = sum(
+            [layer_type == "mamba" for layer_type in layers_type])
+        max_batch_size = _BATCH_SIZES_TO_CAPTURE[-1] + 10
+        conv_state_shape, temporal_state_shape = self._get_mamba_cache_shape()
+        assert conv_state_shape is not None and temporal_state_shape is not None
+        for buffername in ["mamba_cache", "mamba_gc_cache_buffer"]:
+            buffer = (torch.empty(size=(mamba_layers, max_batch_size) +
+                                  conv_state_shape,
+                                  dtype=dtype,
+                                  device="cuda"),
+                      torch.empty(size=(mamba_layers, max_batch_size) +
+                                  temporal_state_shape,
+                                  dtype=dtype,
+                                  device="cuda"))
+            setattr(self, buffername, buffer)
+
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def sample(
+        self,
+        logits: Optional[torch.Tensor],
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        expert_params_mapping = [
+            # (param_name, weight_name, expert_id)
+            (
+                "ws" if weight_name in ["gate_proj", "up_proj"] else "w2s",
+                f"experts.{expert_id}.{weight_name}.weight",
+                expert_id,
+            ) for expert_id in range(self.config.num_experts)
+            for weight_name in ["down_proj", "up_proj", "gate_proj"]
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if "A_log" in name:
+                name = name.replace("A_log", "A")
+
+            if ".self_attn." in name:
+                name = name.replace(".self_attn", "")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                if 'experts' in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for param_name, weight_name, expert_id in expert_params_mapping:
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param,
+                                  loaded_weight,
+                                  weight_name,
+                                  expert_id=expert_id)
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+                    weight_loader(param, loaded_weight)
diff --git a/vllm/sequence.py b/vllm/sequence.py
index b036e76d7..7e08586cd 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -934,6 +934,8 @@ class ExecuteModelRequest:
     previous_hidden_states: Optional[HiddenStates] = None
     # The number of forward steps to run.
     num_steps: int = 1
+    # Finished request ids since last step.
+    finished_requests_ids: List[str] = field(default_factory=list)
 
     def clone(
         self, seq_group_metadata_list: List[SequenceGroupMetadata]
@@ -949,4 +951,4 @@ class ExecuteModelRequest:
             running_queue_size=self.running_queue_size,
             previous_hidden_states=self.previous_hidden_states,
             num_steps=self.num_steps,
-        )
+            finished_requests_ids=self.finished_requests_ids)
diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py
index b4c953162..1c7b8c07e 100644
--- a/vllm/spec_decode/draft_model_runner.py
+++ b/vllm/spec_decode/draft_model_runner.py
@@ -75,15 +75,19 @@ class TP1DraftModelRunner(ModelRunner):
             List[SequenceGroupMetadata]] = None
 
     def prepare_model_input(
-            self,
-            seq_group_metadata_list: List[SequenceGroupMetadata],
-            virtual_engine: int = 0) -> ModelInputForGPUWithSamplingMetadata:
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        virtual_engine: int = 0,
+        finished_requests_ids: Optional[List[str]] = None
+    ) -> ModelInputForGPUWithSamplingMetadata:
         """A temporary solution that caches the seq_group_metadata_list
         for multi-step execution.
         TODO: In-place update model_input and remove this function.
         """
         self.cached_seq_group_metadata_list = seq_group_metadata_list
-        return super().prepare_model_input(seq_group_metadata_list)
+        return super().prepare_model_input(
+            seq_group_metadata_list,
+            finished_requests_ids=finished_requests_ids)
 
     def update_model_input(
             self, model_input: ModelInputForGPUWithSamplingMetadata,
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py
index 891e74f8a..252440c7b 100644
--- a/vllm/worker/cache_engine.py
+++ b/vllm/worker/cache_engine.py
@@ -33,7 +33,9 @@ class CacheEngine:
         self.device_config = device_config
 
         self.head_size = model_config.get_head_size()
-        self.num_layers = model_config.get_num_layers(parallel_config)
+        # Models like Jamba, have mixed typed layers, E.g Mamba
+        self.num_attention_layers = model_config.get_num_attention_layers(
+            parallel_config)
         self.num_kv_heads = model_config.get_num_kv_heads(parallel_config)
 
         self.block_size = cache_config.block_size
@@ -75,7 +77,7 @@ class CacheEngine:
             num_blocks, self.block_size, self.num_kv_heads, self.head_size)
         pin_memory = is_pin_memory_available() if device == "cpu" else False
         kv_cache: List[torch.Tensor] = []
-        for _ in range(self.num_layers):
+        for _ in range(self.num_attention_layers):
             # null block in CpuGpuBlockAllocator requires at least that
             # block to be zeroed-out.
             # We zero-out everything for simplicity.
@@ -87,12 +89,12 @@ class CacheEngine:
         return kv_cache
 
     def swap_in(self, src_to_dst: torch.Tensor) -> None:
-        for i in range(self.num_layers):
+        for i in range(self.num_attention_layers):
             self.attn_backend.swap_blocks(self.cpu_cache[i], self.gpu_cache[i],
                                           src_to_dst)
 
     def swap_out(self, src_to_dst: torch.Tensor) -> None:
-        for i in range(self.num_layers):
+        for i in range(self.num_attention_layers):
             self.attn_backend.swap_blocks(self.gpu_cache[i], self.cpu_cache[i],
                                           src_to_dst)
 
@@ -107,11 +109,12 @@ class CacheEngine:
     ) -> int:
         head_size = model_config.get_head_size()
         num_heads = model_config.get_num_kv_heads(parallel_config)
-        num_layers = model_config.get_num_layers(parallel_config)
+        num_attention_layers = model_config.get_num_attention_layers(
+            parallel_config)
 
         key_cache_block = cache_config.block_size * num_heads * head_size
         value_cache_block = key_cache_block
-        total = num_layers * (key_cache_block + value_cache_block)
+        total = num_attention_layers * (key_cache_block + value_cache_block)
         if cache_config.cache_dtype == "auto":
             dtype = model_config.dtype
         else:
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index f46e9e8ab..fd6c2b854 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -314,9 +314,10 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
         )
 
     def prepare_model_input(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        virtual_engine: int = 0,
+            self,
+            seq_group_metadata_list: List[SequenceGroupMetadata],
+            virtual_engine: int = 0,
+            finished_requests_ids: Optional[List[str]] = None
     ) -> CPUModelInput:
         multi_modal_kwargs = None
         # NOTE: We assume that all sequences in the group are all prompts or
diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py
index faf6e99ab..0e1bb1bfe 100644
--- a/vllm/worker/embedding_model_runner.py
+++ b/vllm/worker/embedding_model_runner.py
@@ -120,10 +120,11 @@ class EmbeddingModelRunner(
         self,
         seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
         virtual_engine: int = 0,
+        finished_requests_ids: Optional[List[str]] = None
     ) -> ModelInputForGPUWithPoolingMetadata:
         assert seq_group_metadata_list is not None
         model_input = self._prepare_model_input_tensors(
-            seq_group_metadata_list)
+            seq_group_metadata_list, finished_requests_ids)
         # Prepare PoolingMetadata.
         assert model_input.seq_lens is not None
         pooling_metadata = self._prepare_pooling(seq_group_metadata_list,
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 28b447c0d..bd3028147 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -84,6 +84,8 @@ class ModelInputForGPU(ModelRunnerInputBase):
     lora_requests: Optional[Set[LoRARequest]] = None
     attn_metadata: Optional["AttentionMetadata"] = None
     multi_modal_kwargs: Optional[Dict[str, torch.Tensor]] = None
+    request_ids_to_seq_ids: Optional[Dict[str, List[int]]] = None
+    finished_requests_ids: Optional[List[str]] = None
     virtual_engine: int = 0
 
     def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
@@ -94,6 +96,8 @@ class ModelInputForGPU(ModelRunnerInputBase):
             "lora_mapping": self.lora_mapping,
             "multi_modal_kwargs": self.multi_modal_kwargs,
             "virtual_engine": self.virtual_engine,
+            "request_ids_to_seq_ids": self.request_ids_to_seq_ids,
+            "finished_requests_ids": self.finished_requests_ids,
         }
         _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata)
         return tensor_dict
@@ -128,6 +132,8 @@ class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU):
             "lora_mapping": self.lora_mapping,
             "multi_modal_kwargs": self.multi_modal_kwargs,
             "virtual_engine": self.virtual_engine,
+            "request_ids_to_seq_ids": self.request_ids_to_seq_ids,
+            "finished_requests_ids": self.finished_requests_ids,
         }
         _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata)
         _add_sampling_metadata_broadcastable_dict(tensor_dict,
@@ -191,6 +197,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         ]
         self.graph_memory_pool: Optional[Tuple[
             int, int]] = None  # Set during graph capture.
+
+        self.has_seqlen_agnostic = model_config.contains_seqlen_agnostic_layers(
+            parallel_config)
+
         # When using CUDA graph, the input block tables must be padded to
         # max_seq_len_to_capture. However, creating the block table in
         # Python can be expensive. To optimize this, we cache the block table
@@ -317,6 +327,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
     def _prepare_model_input_tensors(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
+        finished_requests_ids: Optional[List[str]] = None
     ) -> TModelInputForGPU:
         """Helper method to prepare the model input based on a given sequence
         group. Prepares metadata needed for the base model forward pass but not
@@ -347,6 +358,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         block_tables: List[List[int]] = []
         multi_modal_kwargs_list: Dict[str,
                                       List[torch.Tensor]] = defaultdict(list)
+        request_ids_to_seq_ids: Dict[str, List[int]] = defaultdict(list)
         decode_only = True
         num_prefills = 0
         num_prefill_tokens = 0
@@ -738,7 +750,11 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
             k: torch.cat(v, dim=0).to(self.device)
             for k, v in multi_modal_kwargs_list.items()
         }
-
+        request_ids_to_seq_ids = {
+            seq_group_metadata.request_id:
+            list(seq_group_metadata.seq_data.keys())
+            for seq_group_metadata in seq_group_metadata_list
+        }
         return self._model_input_cls(
             input_tokens=input_tokens_tensor,
             input_positions=input_positions_tensor,
@@ -748,7 +764,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
             lora_mapping=lora_mapping,
             lora_requests=lora_requests,
             multi_modal_kwargs=multi_modal_kwargs,
-        )
+            request_ids_to_seq_ids=request_ids_to_seq_ids,
+            finished_requests_ids=finished_requests_ids)
 
     @torch.inference_mode()
     def profile_run(self) -> None:
@@ -821,7 +838,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         # Run the model with the dummy inputs.
         num_layers = self.model_config.get_num_layers(self.parallel_config)
         kv_caches = [None] * num_layers
-        model_input = self.prepare_model_input(seqs)
+        finished_requests_ids = [seq.request_id for seq in seqs]
+        model_input = self.prepare_model_input(
+            seqs, finished_requests_ids=finished_requests_ids)
         intermediate_tensors = None
         if not get_pp_group().is_first_rank:
             intermediate_tensors = self.model.make_empty_intermediate_tensors(
@@ -1033,21 +1052,37 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                         graph_runner.flashinfer_decode_wrapper = \
                             decode_wrapper
 
-                    graph_runner.capture(
+                    capture_inputs = {
+                        "input_ids":
                         input_tokens[:batch_size],
+                        "positions":
                         input_positions[:batch_size],
+                        "hidden_or_intermediate_states":
                         hidden_or_intermediate_states[
                             virtual_engine]  # type: ignore
                         [:batch_size]
                         if hidden_or_intermediate_states[virtual_engine]
                         is not None else None,
+                        "intermediate_inputs":
                         intermediate_inputs[:batch_size]
                         if intermediate_inputs is not None else None,
+                        "kv_caches":
                         kv_caches[virtual_engine],
+                        "attn_metadata":
                         attn_metadata,
-                        memory_pool=self.graph_memory_pool,
-                        stream=graph_capture_context.stream,
-                    )
+                        "memory_pool":
+                        self.graph_memory_pool,
+                        "stream":
+                        graph_capture_context.stream
+                    }
+                    if self.has_seqlen_agnostic:
+                        # Only used by Mamba-based models CUDA graph atm (Jamba)
+                        capture_inputs.update({
+                            "seqlen_agnostic_capture_inputs":
+                            self.model.get_seqlen_agnostic_capture_inputs(
+                                batch_size)
+                        })
+                    graph_runner.capture(**capture_inputs)
                     self.graph_memory_pool = graph_runner.graph.pool()
                     self.graph_runners[virtual_engine][batch_size] = (
                         graph_runner)
@@ -1084,6 +1119,7 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
         virtual_engine: int = 0,
+        finished_requests_ids: Optional[List[str]] = None
     ) -> ModelInputForGPUWithSamplingMetadata:
         """Prepare the model input based on a given sequence group, including
         metadata for the sampling step.
@@ -1099,7 +1135,7 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
         If cuda graph is required, this API automatically pads inputs.
         """
         model_input = self._prepare_model_input_tensors(
-            seq_group_metadata_list)
+            seq_group_metadata_list, finished_requests_ids)
         sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list,
                                                      model_input.seq_lens,
                                                      model_input.query_lens,
@@ -1175,6 +1211,10 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
             model_executable = self.model
 
         multi_modal_kwargs = model_input.multi_modal_kwargs or {}
+        seqlen_agnostic_kwargs = {
+            "finished_requests_ids": model_input.finished_requests_ids,
+            "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids,
+        } if self.has_seqlen_agnostic else {}
         hidden_or_intermediate_states = model_executable(
             input_ids=model_input.input_tokens,
             positions=model_input.input_positions,
@@ -1182,7 +1222,7 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
             attn_metadata=model_input.attn_metadata,
             intermediate_tensors=intermediate_tensors,
             **multi_modal_kwargs,
-        )
+            **seqlen_agnostic_kwargs)
 
         # Compute the logits in the last pipeline stage.
         if not get_pp_group().is_last_rank:
@@ -1305,6 +1345,7 @@ class CUDAGraphRunner:
                 "positions": positions,
                 "kv_caches": kv_caches,
                 "slot_mapping": attn_metadata.slot_mapping,
+                **kwargs,
             }
         else:
             self.input_buffers = {
@@ -1315,6 +1356,7 @@ class CUDAGraphRunner:
                 "seq_lens_tensor":
                 attn_metadata.decode_metadata.seq_lens_tensor,
                 "block_tables": attn_metadata.decode_metadata.block_tables,
+                **kwargs,
             }
         if intermediate_inputs is not None:
             self.input_buffers.update(intermediate_inputs.tensors)
@@ -1349,13 +1391,18 @@ class CUDAGraphRunner:
                 non_blocking=True)
             self.input_buffers["block_tables"].copy_(
                 attn_metadata.decode_metadata.block_tables, non_blocking=True)
+        if "seqlen_agnostic_capture_inputs" in self.input_buffers:
+            self.model.copy_inputs_before_cuda_graphs(self.input_buffers,
+                                                      **kwargs)
         if intermediate_tensors is not None:
             for key in intermediate_tensors.tensors:
                 self.input_buffers[key].copy_(intermediate_tensors[key],
                                               non_blocking=True)
         # Run the graph.
         self.graph.replay()
-
+        if "seqlen_agnostic_capture_inputs" in self.input_buffers:
+            self.model.copy_outputs_after_cuda_graphs(self.input_buffers,
+                                                      **kwargs)
         # Return the output tensor.
         if get_pp_group().is_last_rank:
             return self.output_buffers["hidden_states"]
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py
index f66bb4662..bc0960fa1 100644
--- a/vllm/worker/model_runner_base.py
+++ b/vllm/worker/model_runner_base.py
@@ -139,6 +139,7 @@ class ModelRunnerBase(ABC, Generic[T]):
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
         virtual_engine: int = 0,
+        finished_requests_ids: Optional[List[str]] = None,
     ) -> T:
         """
         Prepare the inputs to ModelRunnerBase.execute_model from an execution
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index ab8e48528..8b96966be 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -177,6 +177,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
         virtual_engine: int = 0,
+        finished_requests_ids: Optional[List[str]] = None
     ) -> ModelInputForNeuron:
         # NOTE: We assume that all sequences in the group are all prompts or
         # all decodes.
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 118173a4c..b082f4534 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -234,7 +234,8 @@ class LocalOrDistributedWorkerBase(WorkerBase):
             model_input: ModelRunnerInputBase = (
                 self.model_runner.prepare_model_input(
                     execute_model_req.seq_group_metadata_list,
-                    execute_model_req.virtual_engine))
+                    execute_model_req.virtual_engine,
+                    execute_model_req.finished_requests_ids))
             num_steps = execute_model_req.num_steps
 
             if self.do_metadata_broadcast:
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 73b771c43..e652f1b10 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -189,9 +189,10 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
         ))
 
     def prepare_model_input(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        virtual_engine: int = 0,
+            self,
+            seq_group_metadata_list: List[SequenceGroupMetadata],
+            virtual_engine: int = 0,
+            finished_requests_ids: Optional[List[str]] = None
     ) -> ModelInputForXPU:
         multi_modal_input = None
         if self.is_driver_worker:
-- 
GitLab


From 482045ee77a49d69ab9464d9d727960890d950f1 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 2 Jul 2024 20:12:22 -0700
Subject: [PATCH 244/376] [hardware][misc] introduce platform abstraction
 (#6080)

---
 tests/kernels/test_cutlass.py                 |  4 +--
 tests/quantization/utils.py                   |  4 +--
 .../ops/blocksparse_attention/interface.py    |  5 +--
 vllm/attention/ops/prefix_prefill.py          |  4 +--
 vllm/lora/punica.py                           |  4 +--
 .../compressed_tensors/compressed_tensors.py  |  4 +--
 .../model_executor/layers/quantization/fp8.py |  5 +--
 .../layers/quantization/gptq_marlin.py        |  4 +--
 .../layers/quantization/utils/marlin_utils.py |  4 +--
 vllm/model_executor/model_loader/loader.py    |  5 +--
 vllm/platforms/__init__.py                    | 18 ++++++++++
 vllm/platforms/cuda.py                        | 34 +++++++++++++++++++
 vllm/platforms/interface.py                   | 21 ++++++++++++
 vllm/platforms/rocm.py                        | 15 ++++++++
 vllm/utils.py                                 |  7 ----
 vllm/worker/worker.py                         |  4 +--
 16 files changed, 113 insertions(+), 29 deletions(-)
 create mode 100644 vllm/platforms/__init__.py
 create mode 100644 vllm/platforms/cuda.py
 create mode 100644 vllm/platforms/interface.py
 create mode 100644 vllm/platforms/rocm.py

diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py
index 9cc18a0ea..d409df34e 100644
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -8,13 +8,13 @@ import pytest
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.utils import get_device_capability_stateless
+from vllm.platforms import current_platform
 
 CUDA_DEVICES = [
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ]
 
-capability = get_device_capability_stateless()
+capability = current_platform.get_device_capability()
 capability = capability[0] * 10 + capability[1]
 
 
diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py
index 5c1b5ad9b..65bb80ed7 100644
--- a/tests/quantization/utils.py
+++ b/tests/quantization/utils.py
@@ -1,7 +1,7 @@
 import torch
 
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-from vllm.utils import get_device_capability_stateless
+from vllm.platforms import current_platform
 
 
 def is_quant_method_supported(quant_method: str) -> bool:
@@ -9,7 +9,7 @@ def is_quant_method_supported(quant_method: str) -> bool:
     if not torch.cuda.is_available():
         return False
 
-    capability = get_device_capability_stateless()
+    capability = current_platform.get_device_capability()
     capability = capability[0] * 10 + capability[1]
     return (capability >=
             QUANTIZATION_METHODS[quant_method].get_min_capability())
diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py
index 637cfda21..e870a8e61 100644
--- a/vllm/attention/ops/blocksparse_attention/interface.py
+++ b/vllm/attention/ops/blocksparse_attention/interface.py
@@ -2,13 +2,14 @@ import math
 
 import torch
 
-from vllm.utils import get_device_capability_stateless, is_cpu, is_hip
+from vllm.platforms import current_platform
+from vllm.utils import is_cpu, is_hip
 
 from .utils import (dense_to_crow_col, get_head_sliding_step,
                     get_sparse_attn_mask)
 
 IS_COMPUTE_8_OR_ABOVE = (torch.cuda.is_available()
-                         and get_device_capability_stateless()[0] >= 8)
+                         and current_platform.get_device_capability()[0] >= 8)
 
 if IS_COMPUTE_8_OR_ABOVE:
     from .blocksparse_attention_kernel import blocksparse_flash_attn_varlen_fwd
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index ca9f28fcb..4cd4976ad 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -5,7 +5,7 @@ import torch
 import triton
 import triton.language as tl
 
-from vllm.utils import get_device_capability_stateless
+from vllm.platforms import current_platform
 
 if triton.__version__ >= "2.1.0":
 
@@ -685,7 +685,7 @@ if triton.__version__ >= "2.1.0":
                               alibi_slopes=None,
                               sliding_window=None):
 
-        cap = get_device_capability_stateless()
+        cap = current_platform.get_device_capability()
         BLOCK = 128 if cap[0] >= 8 else 64
         # shape constraints
         Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py
index f30b2c13f..64f87a4b2 100644
--- a/vllm/lora/punica.py
+++ b/vllm/lora/punica.py
@@ -5,14 +5,14 @@ from typing import Optional
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.utils import get_device_capability_stateless
+from vllm.platforms import current_platform
 
 
 def _check_punica_support():
     if ops.is_custom_op_supported("_punica_C::dispatch_bgmv"):
         return
 
-    if get_device_capability_stateless() < (8, 0):
+    if current_platform.get_device_capability() < (8, 0):
         raise ImportError(
             "punica LoRA kernels require compute capability >= 8.0")
     else:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 491396c3d..e88bbc361 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -14,7 +14,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     CompressionFormat, QuantizationArgs, QuantizationStrategy,
     find_first_name_or_class_match)
-from vllm.utils import get_device_capability_stateless
+from vllm.platforms import current_platform
 
 
 class CompressedTensorsConfig(QuantizationConfig):
@@ -85,7 +85,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         return []
 
     def _check_gptq_and_marlin_can_run(self):
-        capability = get_device_capability_stateless()
+        capability = current_platform.get_device_capability()
         capability = capability[0] * 10 + capability[1]
         if capability < 80:
             raise RuntimeError("The quantization config is not supported for ",
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index dc2ca35c6..6d942fa61 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -12,7 +12,8 @@ from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.utils import get_device_capability_stateless, print_warning_once
+from vllm.platforms import current_platform
+from vllm.utils import print_warning_once
 
 ACTIVATION_SCHEMES = ["static", "dynamic"]
 
@@ -20,7 +21,7 @@ logger = init_logger(__name__)
 
 
 def cutlass_fp8_supported() -> bool:
-    capability = get_device_capability_stateless()
+    capability = current_platform.get_device_capability()
     capability = capability[0] * 10 + capability[1]
 
     return ops.cutlass_scaled_mm_supports_fp8(capability)
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 97aae33f1..a6284d0ed 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -12,7 +12,7 @@ from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
-from vllm.utils import get_device_capability_stateless
+from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
 
@@ -173,7 +173,7 @@ class GPTQMarlinConfig(QuantizationConfig):
             return False
 
         # If the capability of the device is too low, cannot convert.
-        major, minor = get_device_capability_stateless()
+        major, minor = current_platform.get_device_capability()
         device_capability = major * 10 + minor
         if device_capability < cls.get_min_capability():
             return False
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index 6f4aa2d77..ecd29a80e 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -12,9 +12,9 @@ from vllm.model_executor.layers.quantization.utils.marlin_perms import (
     marlin_perm, marlin_scale_perm, marlin_scale_perm_single)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     get_pack_factor, quantize_weights, sort_weights)
-from vllm.utils import get_device_capability_stateless
+from vllm.platforms import current_platform
 
-__cuda_arch = get_device_capability_stateless()
+__cuda_arch = current_platform.get_device_capability()
 
 MARLIN_TILE = 16
 
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index b61ac7490..6f4dcf4a0 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -35,7 +35,8 @@ from vllm.model_executor.model_loader.weight_utils import (
 from vllm.model_executor.models.interfaces import (supports_lora,
                                                    supports_vision)
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.utils import get_device_capability_stateless, is_tpu
+from vllm.platforms import current_platform
+from vllm.utils import is_tpu
 
 logger = init_logger(__name__)
 
@@ -46,7 +47,7 @@ def _get_quantization_config(
     """Get the quantization config."""
     if model_config.quantization is not None:
         quant_config = get_quant_config(model_config, load_config)
-        capability = get_device_capability_stateless()
+        capability = current_platform.get_device_capability()
         capability = capability[0] * 10 + capability[1]
         if capability < quant_config.get_min_capability():
             raise ValueError(
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
new file mode 100644
index 000000000..7309f7bf7
--- /dev/null
+++ b/vllm/platforms/__init__.py
@@ -0,0 +1,18 @@
+from typing import Optional
+
+import torch
+
+from .interface import Platform, PlatformEnum
+
+current_platform: Optional[Platform]
+
+if torch.version.cuda is not None:
+    from .cuda import CudaPlatform
+    current_platform = CudaPlatform()
+elif torch.version.hip is not None:
+    from .rocm import RocmPlatform
+    current_platform = RocmPlatform()
+else:
+    current_platform = None
+
+__all__ = ['Platform', 'PlatformEnum', 'current_platform']
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
new file mode 100644
index 000000000..b2ca75813
--- /dev/null
+++ b/vllm/platforms/cuda.py
@@ -0,0 +1,34 @@
+"""Code inside this file can safely assume cuda platform, e.g. importing
+pynvml. However, it should not initialize cuda context.
+"""
+
+from functools import lru_cache, wraps
+from typing import Tuple
+
+import pynvml
+
+from .interface import Platform, PlatformEnum
+
+
+def with_nvml_context(fn):
+
+    @wraps(fn)
+    def wrapper(*args, **kwargs):
+        pynvml.nvmlInit()
+        try:
+            return fn(*args, **kwargs)
+        finally:
+            pynvml.nvmlShutdown()
+
+    return wrapper
+
+
+class CudaPlatform(Platform):
+    _enum = PlatformEnum.CUDA
+
+    @staticmethod
+    @lru_cache(maxsize=8)
+    @with_nvml_context
+    def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
+        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
+        return pynvml.nvmlDeviceGetCudaComputeCapability(handle)
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
new file mode 100644
index 000000000..2ac092c25
--- /dev/null
+++ b/vllm/platforms/interface.py
@@ -0,0 +1,21 @@
+import enum
+from typing import Tuple
+
+
+class PlatformEnum(enum.Enum):
+    CUDA = enum.auto()
+    ROCM = enum.auto()
+
+
+class Platform:
+    _enum: PlatformEnum
+
+    def is_cuda(self) -> bool:
+        return self._enum == PlatformEnum.CUDA
+
+    def is_rocm(self) -> bool:
+        return self._enum == PlatformEnum.ROCM
+
+    @staticmethod
+    def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
+        raise NotImplementedError
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
new file mode 100644
index 000000000..36b3ba8f7
--- /dev/null
+++ b/vllm/platforms/rocm.py
@@ -0,0 +1,15 @@
+from functools import lru_cache
+from typing import Tuple
+
+import torch
+
+from .interface import Platform, PlatformEnum
+
+
+class RocmPlatform(Platform):
+    _enum = PlatformEnum.ROCM
+
+    @staticmethod
+    @lru_cache(maxsize=8)
+    def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
+        return torch.cuda.get_device_capability(device_id)
diff --git a/vllm/utils.py b/vllm/utils.py
index 1977bc05d..763b0b91c 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -866,13 +866,6 @@ def is_full_nvlink(device_ids: List[int]) -> bool:
     return True
 
 
-@lru_cache(maxsize=8)
-@with_nvml_context
-def get_device_capability_stateless(device_id: int = 0) -> Tuple[int, int]:
-    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
-    return pynvml.nvmlDeviceGetCudaComputeCapability(handle)
-
-
 #From: https://stackoverflow.com/a/4104188/2749989
 def run_once(f):
 
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 5b5728290..b25f29f48 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -15,8 +15,8 @@ from vllm.distributed import (ensure_model_parallel_initialized,
 from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
+from vllm.platforms import current_platform
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import get_device_capability_stateless
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.embedding_model_runner import EmbeddingModelRunner
 from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner
@@ -333,7 +333,7 @@ def init_worker_distributed_environment(
 def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
     # Check if the GPU supports the dtype.
     if torch_dtype == torch.bfloat16:
-        compute_capability = get_device_capability_stateless()
+        compute_capability = current_platform.get_device_capability()
         if compute_capability[0] < 8:
             gpu_name = torch.cuda.get_device_name()
             raise ValueError(
-- 
GitLab


From 9831aec49fd541a8fe6cb65f0c3a65b03eccffe0 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 3 Jul 2024 11:34:00 +0800
Subject: [PATCH 245/376] [Core] Dynamic image size support for VLMs (#5276)

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: ywang96 <ywang@roblox.com>
Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
---
 .../input_processing/model_inputs_index.rst   |   2 +-
 .../multimodal/adding_multimodal_model.rst    | 124 ++++++++++
 .../dev/multimodal/multimodal_index.rst       |  18 +-
 docs/source/models/vlm.rst                    |  24 +-
 examples/llava_example.py                     |   3 +-
 examples/llava_next_example.py                |  11 +-
 examples/phi3v_example.py                     |   8 +-
 tests/conftest.py                             | 114 ++++++---
 .../distributed/test_multimodal_broadcast.py  |   7 +-
 tests/models/test_llava.py                    | 106 ++++++---
 tests/models/test_llava_next.py               | 129 ++++++----
 tests/models/test_phi3v.py                    | 140 ++++++-----
 tests/models/utils.py                         |  88 +++++--
 tests/multimodal/test_mapper.py               |  63 ++---
 tests/multimodal/test_utils.py                |  29 ++-
 vllm/config.py                                |  12 +-
 vllm/entrypoints/openai/serving_chat.py       |  97 ++++----
 vllm/entrypoints/openai/serving_engine.py     |   1 +
 vllm/inputs/registry.py                       |   3 +-
 vllm/model_executor/models/clip.py            |  37 +++
 vllm/model_executor/models/llava.py           |  49 ++--
 vllm/model_executor/models/llava_next.py      | 184 +++++++++------
 vllm/model_executor/models/phi3v.py           | 222 +++++++++++++-----
 vllm/model_executor/models/utils.py           |  41 ++++
 vllm/multimodal/__init__.py                   |   7 +-
 vllm/multimodal/base.py                       | 101 ++++++--
 vllm/multimodal/image.py                      | 100 +++++++-
 vllm/multimodal/registry.py                   |  33 ++-
 vllm/multimodal/utils.py                      |  95 +++++---
 vllm/sequence.py                              |   2 +-
 vllm/transformers_utils/image_processor.py    |  18 +-
 vllm/worker/cpu_model_runner.py               |  34 ++-
 vllm/worker/embedding_model_runner.py         |   5 +-
 vllm/worker/model_runner.py                   |  27 ++-
 vllm/worker/neuron_model_runner.py            |  35 ++-
 vllm/worker/openvino_model_runner.py          |  38 ++-
 vllm/worker/tpu_model_runner.py               |  44 +++-
 vllm/worker/xpu_model_runner.py               |  70 ++++--
 38 files changed, 1455 insertions(+), 666 deletions(-)
 create mode 100644 docs/source/dev/multimodal/adding_multimodal_model.rst
 create mode 100644 vllm/model_executor/models/utils.py

diff --git a/docs/source/dev/input_processing/model_inputs_index.rst b/docs/source/dev/input_processing/model_inputs_index.rst
index 594edeb74..2dde251aa 100644
--- a/docs/source/dev/input_processing/model_inputs_index.rst
+++ b/docs/source/dev/input_processing/model_inputs_index.rst
@@ -8,7 +8,7 @@ Input Processing
 vLLM provides a mechanism for defining input processors for each model so that the inputs are processed
 in :class:`~vllm.LLMEngine` before they are passed to model executors. 
 
-Currently, this mechanism is only utilized in **multi-modal models** for preprocessing multi-modal input 
+Currently, this mechanism is only utilized in :ref:`multi-modal models <multi_modality>` for preprocessing multi-modal input 
 data in addition to input prompt, but it can be extended to text-only language models when needed.
 
 Guides
diff --git a/docs/source/dev/multimodal/adding_multimodal_model.rst b/docs/source/dev/multimodal/adding_multimodal_model.rst
new file mode 100644
index 000000000..0e9590639
--- /dev/null
+++ b/docs/source/dev/multimodal/adding_multimodal_model.rst
@@ -0,0 +1,124 @@
+.. _adding_a_new_multimodal_model:
+
+Adding a New Multimodal Model
+=============================
+
+This document provides a high-level guide on integrating a :ref:`multi-modal model <multi_modality>` into vLLM.
+
+.. note::
+    The complexity of adding a new model depends heavily on the model's architecture.
+    The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
+    However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
+
+.. tip::
+    If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ repository.
+    We will be happy to help you out!
+
+
+1. Set up the base vLLM model
+-----------------------------
+
+As usual, follow :ref:`these steps <adding_a_new_model>` to implement the model in vLLM, but note the following:
+
+- You should additionally implement the :class:`~vllm.model_executor.models.interfaces.SupportsVision` interface.
+
+  .. code-block:: diff
+
+      + from vllm.model_executor.models.interfaces import SupportsVision
+
+      - class YourModelForImage2Seq(nn.Module):
+      + class YourModelForImage2Seq(nn.Module, SupportsVision):
+
+  .. note::
+      The model class does not have to be named :code:`*ForCausalLM`.
+      Check out `the HuggingFace Transformers documentation <https://huggingface.co/docs/transformers/model_doc/auto#multimodal>`__ for some examples.
+
+- While implementing the :meth:`~torch.nn.Module.forward` method, reserve a keyword parameter
+  for each input tensor that corresponds to a multi-modal input, as shown in the following example:
+
+  .. code-block:: diff
+
+      def forward(
+          self,
+          input_ids: torch.Tensor,
+          positions: torch.Tensor,
+          kv_caches: List[torch.Tensor],
+          attn_metadata: AttentionMetadata,
+      +   pixel_values: torch.Tensor,
+      ) -> SamplerOutput:
+
+
+2. Register input mappers
+-------------------------
+
+For each modality type to support, decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_input_mapper <vllm.multimodal.MultiModalRegistry.register_input_mapper>`.
+This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in :meth:`~torch.nn.Module.forward`.
+
+.. code-block:: diff
+
+    from vllm.model_executor.models.interfaces import SupportsVision
+    + from vllm.multimodal import MULTIMODAL_REGISTRY
+
+    + @MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
+    + @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
+    class YourModelForImage2Seq(nn.Module, SupportsVision):
+
+A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function.
+
+.. seealso::
+    :ref:`input_processing_pipeline`
+
+
+3. (Optional) Register dummy data
+---------------------------------
+
+During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models.
+In such cases, you can define your own dummy data by registering a factory method via :meth:`INPUT_REGISTRY.register_dummy_data <vllm.inputs.registry.InputRegistry.register_dummy_data>`.
+
+.. code-block:: diff
+
+    from vllm.inputs import INPUT_REGISTRY
+    from vllm.model_executor.models.interfaces import SupportsVision
+    from vllm.multimodal import MULTIMODAL_REGISTRY
+
+    @MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
+    @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
+    + @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
+    class YourModelForImage2Seq(nn.Module, SupportsVision):
+
+Here are some examples:
+
+- Image inputs (static feature size): `LLaVA-1.5 Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava.py>`__
+- Image inputs (dynamic feature size): `LLaVA-NeXT Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py>`__
+
+.. seealso::
+    :ref:`input_processing_pipeline`
+
+
+4. (Optional) Register input processor
+--------------------------------------
+
+Sometimes, there is a need to process inputs at the :class:`~vllm.LLMEngine` level before they are passed to the model executor. 
+This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside model's :meth:`~torch.nn.Module.forward` call.
+You can register input processors via :meth:`INPUT_REGISTRY.register_input_processor <vllm.inputs.registry.InputRegistry.register_input_processor>`.
+
+.. code-block:: diff
+
+    from vllm.inputs import INPUT_REGISTRY
+    from vllm.model_executor.models.interfaces import SupportsVision
+    from vllm.multimodal import MULTIMODAL_REGISTRY
+
+    @MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
+    @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
+    @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
+    + @INPUT_REGISTRY.register_input_processor(<your_input_processor>)
+    class YourModelForImage2Seq(nn.Module, SupportsVision):
+
+A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation.
+Here are some examples:
+
+- Insert static number of image tokens: `LLaVA-1.5 Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava.py>`__
+- Insert dynamic number of image tokens: `LLaVA-NeXT Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py>`__
+
+.. seealso::
+    :ref:`input_processing_pipeline`
diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst
index 4d5fb3246..d01f39284 100644
--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
@@ -1,3 +1,5 @@
+.. _multi_modality:
+
 Multi-Modality
 ==============
 
@@ -8,12 +10,18 @@ vLLM provides experimental support for multi-modal models through the :mod:`vllm
 :class:`vllm.inputs.PromptStrictInputs` accepts an additional attribute ``multi_modal_data``
 which allows you to pass in multi-modal input alongside text and token prompts.
 
-By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model,
-you must decorate the model class with :meth:`InputRegistry.register_dummy_data <vllm.inputs.registry.InputRegistry.register_dummy_data>`,
-as well as :meth:`MULTIMODAL_REGISTRY.register_input_mapper <MultiModalRegistry.register_input_mapper>` for each modality type to support.
+By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model, please follow :ref:`the guide for adding a new multimodal model. <adding_a_new_multimodal_model>`.
 
 # TODO: Add more instructions on how to do that once embeddings is in.
 
+Guides
+++++++
+
+.. toctree::
+   :maxdepth: 1
+
+   adding_multimodal_model
+
 Module Contents
 +++++++++++++++
 
@@ -35,6 +43,10 @@ Base Classes
     :members:
     :show-inheritance:
 
+.. autoclass:: vllm.multimodal.MultiModalInputs
+    :members:
+    :show-inheritance:
+
 .. autoclass:: vllm.multimodal.MultiModalPlugin
     :members:
     :show-inheritance:
diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 053f5b860..f8c61018a 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -23,7 +23,6 @@ The following :ref:`engine arguments <engine_args>` are specific to VLMs:
     Currently, the support for vision language models on vLLM has the following limitations:
 
     * Only single image input is supported per text prompt.
-    * Dynamic ``image_input_shape`` is not supported: the input image will be resized to the static ``image_input_shape``. This means our LLaVA-NeXT output may not exactly match the huggingface implementation.
 
     We are continuously improving user & developer experience for VLMs. Please `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_ if you have any feedback or feature requests.
 
@@ -42,12 +41,17 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
     )
 
 .. important::
+    Currently, you have to specify ``image_feature_size`` to support memory profiling.
+    To avoid OOM during runtime, you should set this to the maximum value supported by the model.
+    The calculation of feature size is specific to the model. For more details, please refer to
+    the function :code:`get_<model_name>_image_feature_size` inside the corresponding model file.
+
     We will remove most of the vision-specific arguments in a future release as they can be inferred from the HuggingFace configuration.
 
 
 To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
 
-* ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
+* ``prompt``: The prompt should follow the format that is documented on HuggingFace.
 * ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`. 
 
 .. note::
@@ -57,8 +61,8 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptS
 
 .. code-block:: python
 
-    prompt = "<image>" * 576 + (
-        "\nUSER: What is the content of this image?\nASSISTANT:")
+    # Refer to the HuggingFace repo for the correct format to use
+    prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
 
     # Load the image using PIL.Image
     image = ...
@@ -74,8 +78,6 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptS
 
 A code example can be found in `examples/llava_example.py <https://github.com/vllm-project/vllm/blob/main/examples/llava_example.py>`_.
 
-.. important::
-    We will remove the need to format image tokens in a future release. Afterwards, the input text will follow the same format as that for the original HuggingFace model.
 
 Online OpenAI Vision API Compatible Inference
 ----------------------------------------------
@@ -103,6 +105,11 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with
         --chat-template template_llava.jinja
 
 .. important::
+    Currently, you have to specify ``image_feature_size`` to support memory profiling.
+    To avoid OOM during runtime, you should set this to the maximum value supported by the model.
+    The calculation of feature size is specific to the model. For more details, please refer to
+    the function :code:`get_<model_name>_image_feature_size` inside the corresponding model file.
+
     We will remove most of the vision-specific arguments in a future release as they can be inferred from the HuggingFace configuration.
 
 To consume the server, you can use the OpenAI client like in the example below:
@@ -121,6 +128,8 @@ To consume the server, you can use the OpenAI client like in the example below:
         messages=[{
             "role": "user",
             "content": [
+                # NOTE: The prompt formatting with the image token `<image>` is not needed
+                # since the prompt will be processed automatically by the API server.
                 {"type": "text", "text": "What's in this image?"},
                 {
                     "type": "image_url",
@@ -144,5 +153,4 @@ A full code example can be found in `examples/openai_vision_api_client.py <https
         export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
 
 .. note::
-    The prompt formatting with the image token ``<image>`` is not needed when serving VLMs with the API server since the prompt will be 
-    processed automatically by the server.
+    There is no need to format the prompt in the API request since it will be handled by the server.
diff --git a/examples/llava_example.py b/examples/llava_example.py
index 7f3d84f99..f5cb2a661 100644
--- a/examples/llava_example.py
+++ b/examples/llava_example.py
@@ -17,8 +17,7 @@ def run_llava():
         image_feature_size=576,
     )
 
-    prompt = "<image>" * 576 + (
-        "\nUSER: What is the content of this image?\nASSISTANT:")
+    prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
 
     image = Image.open("images/stop_sign.jpg")
 
diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py
index 3c39590e7..20d4791ff 100644
--- a/examples/llava_next_example.py
+++ b/examples/llava_next_example.py
@@ -5,22 +5,17 @@ from PIL import Image
 
 from vllm import LLM, SamplingParams
 
-# Dynamic image input is currently not supported and therefore
-# a fixed image input shape and its corresponding feature size is required.
-# See https://github.com/vllm-project/vllm/pull/4199 for the complete
-# configuration matrix.
-
 
 def run_llava_next():
     llm = LLM(
         model="llava-hf/llava-v1.6-mistral-7b-hf",
         image_token_id=32000,
         image_input_shape="1,3,336,336",
-        image_feature_size=1176,
+        # Use the maximum possible value for memory profiling
+        image_feature_size=2928,
     )
 
-    prompt = "[INST] " + "<image>" * 1176 + (
-        "\nWhat is shown in this image? [/INST]")
+    prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
     url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
     image = Image.open(BytesIO(requests.get(url).content))
     sampling_params = SamplingParams(temperature=0.8,
diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py
index 7d6c58d7f..0aabfee6a 100644
--- a/examples/phi3v_example.py
+++ b/examples/phi3v_example.py
@@ -5,6 +5,9 @@ from PIL import Image
 
 from vllm import LLM, SamplingParams
 
+# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
+# You can use `.buildkite/download-images.sh` to download them
+
 
 def run_phi3v():
     model_path = "microsoft/Phi-3-vision-128k-instruct"
@@ -18,7 +21,8 @@ def run_phi3v():
         trust_remote_code=True,
         image_token_id=32044,
         image_input_shape="1,3,1008,1344",
-        image_feature_size=1921,
+        # Use the maximum possible value for memory profiling
+        image_feature_size=2653,
         max_num_seqs=5,
     )
 
@@ -26,8 +30,6 @@ def run_phi3v():
 
     # single-image prompt
     prompt = "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n"  # noqa: E501
-    prompt = prompt.replace("<|image_1|>", "<|image|>" * 1921 + "<s>")
-
     sampling_params = SamplingParams(temperature=0, max_tokens=64)
 
     outputs = llm.generate(
diff --git a/tests/conftest.py b/tests/conftest.py
index fd088d566..608a5f49d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,12 +1,13 @@
 import contextlib
 import gc
 import os
+import sys
 from collections import UserList
 from dataclasses import dataclass
 from functools import cached_property
 from pathlib import Path
-from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple,
-                    TypedDict, TypeVar)
+from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict,
+                    TypeVar)
 
 import pytest
 import torch
@@ -22,13 +23,10 @@ from vllm.distributed import (destroy_distributed_environment,
                               destroy_model_parallel)
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
+from vllm.multimodal.utils import fetch_image
 from vllm.sequence import SampleLogprobs
 from vllm.utils import cuda_device_count_stateless, is_cpu
 
-if TYPE_CHECKING:
-    # it will call torch.cuda.device_count()
-    from vllm.multimodal import MultiModalDataDict
-
 logger = init_logger(__name__)
 
 _TEST_DIR = os.path.dirname(__file__)
@@ -47,30 +45,42 @@ def _read_prompts(filename: str) -> List[str]:
 
 @dataclass(frozen=True)
 class ImageAsset:
-    name: Literal["stop_sign", "cherry_blossom"]
+    name: Literal["stop_sign", "cherry_blossom", "boardwalk"]
 
     @cached_property
     def pil_image(self) -> Image.Image:
-        return Image.open(_IMAGE_DIR / f"{self.name}.jpg")
-
-    def for_hf(self) -> Image.Image:
-        return self.pil_image
+        if self.name == "boardwalk":
+            return fetch_image(
+                "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+            )
 
-    def for_vllm(self) -> Dict[str, Any]:
-        return {"image": self.pil_image}
+        return Image.open(_IMAGE_DIR / f"{self.name}.jpg")
 
 
 class _ImageAssetPrompts(TypedDict):
     stop_sign: str
     cherry_blossom: str
+    boardwalk: str
+
+
+if sys.version_info < (3, 9):
+    # UserList cannot be subscripted
+    class _ImageAssetsBase(UserList):
+        pass
+else:
 
+    class _ImageAssetsBase(UserList[ImageAsset]):
+        pass
 
-class _ImageAssets(UserList):
+
+class _ImageAssets(_ImageAssetsBase):
 
     def __init__(self) -> None:
-        super().__init__(
-            [ImageAsset("stop_sign"),
-             ImageAsset("cherry_blossom")])
+        super().__init__([
+            ImageAsset("stop_sign"),
+            ImageAsset("cherry_blossom"),
+            ImageAsset("boardwalk")
+        ])
 
     def prompts(self, prompts: _ImageAssetPrompts) -> List[str]:
         """
@@ -79,7 +89,10 @@ class _ImageAssets(UserList):
         The order of the returned prompts matches the order of the
         assets when iterating through this object.
         """
-        return [prompts["stop_sign"], prompts["cherry_blossom"]]
+        return [
+            prompts["stop_sign"], prompts["cherry_blossom"],
+            prompts["boardwalk"]
+        ]
 
 
 IMAGE_ASSETS = _ImageAssets()
@@ -220,7 +233,7 @@ class HfRunner:
         self,
         prompts: List[str],
         images: Optional[List[Image.Image]] = None,
-        **kwargs,
+        **kwargs: Any,
     ) -> List[Tuple[List[List[int]], List[str]]]:
         if images:
             assert len(prompts) == len(images)
@@ -255,7 +268,7 @@ class HfRunner:
         prompts: List[str],
         max_tokens: int,
         images: Optional[List[Image.Image]] = None,
-        **kwargs,
+        **kwargs: Any,
     ) -> List[Tuple[List[int], str]]:
         outputs = self.generate(prompts,
                                 do_sample=False,
@@ -291,19 +304,30 @@ class HfRunner:
         self,
         prompts: List[str],
         max_tokens: int,
+        images: Optional[List[Image.Image]] = None,
+        **kwargs: Any,
     ) -> List[List[torch.Tensor]]:
-        all_logprobs = []
-        for prompt in prompts:
-            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
+        all_logprobs: List[List[torch.Tensor]] = []
+        for i, prompt in enumerate(prompts):
+            processor_kwargs: Dict[str, Any] = {
+                "text": prompt,
+                "return_tensors": "pt",
+            }
+            if images is not None and images[i] is not None:
+                processor_kwargs["images"] = images[i]
+
+            inputs = self.processor(**processor_kwargs)
+
             output = self.model.generate(
-                self.wrap_device(input_ids),
+                **self.wrap_device(inputs),
                 use_cache=True,
                 do_sample=False,
                 max_new_tokens=max_tokens,
                 output_hidden_states=True,
                 return_dict_in_generate=True,
+                **kwargs,
             )
-            seq_logprobs = []
+            seq_logprobs: List[torch.Tensor] = []
             for hidden_states in output.hidden_states:
                 last_hidden_states = hidden_states[-1][0]
                 logits = torch.matmul(
@@ -323,20 +347,32 @@ class HfRunner:
         prompts: List[str],
         max_tokens: int,
         num_logprobs: int,
+        images: Optional[List[Image.Image]] = None,
+        **kwargs: Any,
     ) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
         all_logprobs: List[List[Dict[int, float]]] = []
         all_output_ids: List[List[int]] = []
         all_output_strs: List[str] = []
 
-        for prompt in prompts:
-            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
+        for i, prompt in enumerate(prompts):
+            processor_kwargs: Dict[str, Any] = {
+                "text": prompt,
+                "return_tensors": "pt",
+            }
+            if images is not None and images[i] is not None:
+                processor_kwargs["images"] = images[i]
+
+            inputs = self.processor(**processor_kwargs)
+            input_ids = inputs.input_ids
+
             output = self.model.generate(
-                self.wrap_device(input_ids),
+                **self.wrap_device(inputs),
                 use_cache=True,
                 do_sample=False,
                 max_new_tokens=max_tokens,
                 output_hidden_states=True,
                 return_dict_in_generate=True,
+                **kwargs,
             )
 
             seq_logprobs: List[torch.Tensor] = []
@@ -431,7 +467,7 @@ class VllmRunner:
         self,
         prompts: List[str],
         sampling_params: SamplingParams,
-        images: Optional[List["MultiModalDataDict"]] = None,
+        images: Optional[List[Image.Image]] = None,
     ) -> List[Tuple[List[List[int]], List[str]]]:
         if images is not None:
             assert len(prompts) == len(images)
@@ -439,7 +475,7 @@ class VllmRunner:
         inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
         if images is not None:
             for i, image in enumerate(images):
-                inputs[i]["multi_modal_data"] = image
+                inputs[i]["multi_modal_data"] = {"image": image}
 
         req_outputs = self.model.generate(inputs,
                                           sampling_params=sampling_params)
@@ -462,10 +498,19 @@ class VllmRunner:
         self,
         prompts: List[str],
         sampling_params: SamplingParams,
+        images: Optional[List[Image.Image]] = None,
     ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
         assert sampling_params.logprobs is not None
 
-        req_outputs = self.model.generate(prompts,
+        if images is not None:
+            assert len(prompts) == len(images)
+
+        inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
+        if images is not None:
+            for i, image in enumerate(images):
+                inputs[i]["multi_modal_data"] = {"image": image}
+
+        req_outputs = self.model.generate(inputs,
                                           sampling_params=sampling_params)
         outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []
         for req_output in req_outputs:
@@ -480,7 +525,7 @@ class VllmRunner:
         self,
         prompts: List[str],
         max_tokens: int,
-        images: Optional[List["MultiModalDataDict"]] = None,
+        images: Optional[List[Image.Image]] = None,
     ) -> List[Tuple[List[int], str]]:
         greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
         outputs = self.generate(prompts, greedy_params, images=images)
@@ -492,11 +537,14 @@ class VllmRunner:
         prompts: List[str],
         max_tokens: int,
         num_logprobs: int,
+        images: Optional[List[Image.Image]] = None,
     ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
         greedy_logprobs_params = SamplingParams(temperature=0.0,
                                                 max_tokens=max_tokens,
                                                 logprobs=num_logprobs)
-        outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params)
+        outputs = self.generate_w_logprobs(prompts,
+                                           greedy_logprobs_params,
+                                           images=images)
 
         return [(output_ids, output_str, output_logprobs)
                 for output_ids, output_str, output_logprobs in outputs]
diff --git a/tests/distributed/test_multimodal_broadcast.py b/tests/distributed/test_multimodal_broadcast.py
index 41c3fd9e7..1d143a852 100644
--- a/tests/distributed/test_multimodal_broadcast.py
+++ b/tests/distributed/test_multimodal_broadcast.py
@@ -30,9 +30,10 @@ else:
 @pytest.mark.parametrize("tensor_parallel_size", [2])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
 def test_models(hf_runner, vllm_runner, image_assets,
-                tensor_parallel_size: int, dtype: str,
-                max_tokens: int) -> None:
+                tensor_parallel_size: int, dtype: str, max_tokens: int,
+                num_logprobs: int) -> None:
     if cuda_device_count_stateless() < tensor_parallel_size:
         pytest.skip(
             f"Need at least {tensor_parallel_size} GPUs to run the test.")
@@ -44,8 +45,10 @@ def test_models(hf_runner, vllm_runner, image_assets,
         vllm_runner,
         image_assets,
         model_and_config=model_and_vl_config[0],
+        size_factors=[1.0],
         dtype=dtype,
         max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
         tensor_parallel_size=tensor_parallel_size,
         distributed_executor_backend=distributed_executor_backend,
     )
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index c6313c52e..2f4b85bc1 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -4,18 +4,21 @@ import pytest
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
+from vllm.multimodal.utils import rescale_image_size
+from vllm.sequence import SampleLogprobs
 
 from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from .utils import check_outputs_equal
+from .utils import check_logprobs_close
 
 pytestmark = pytest.mark.vlm
 
-# The image token is placed before "user" on purpose so that the test can pass
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
-    "<image>\nUSER: What's the content of the image?\nASSISTANT:",
+    "USER: <image>\nWhat's the content of the image?\nASSISTANT:",
     "cherry_blossom":
-    "<image>\nUSER: What is the season?\nASSISTANT:",
+    "USER: <image>\nWhat is the season?\nASSISTANT:",
+    "boardwalk":
+    "USER: <image>\nWhat's in this image?\nASSISTANT:",
 })
 
 
@@ -37,27 +40,34 @@ model_and_vl_config = [
 ]
 
 
-def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
+                                         Optional[SampleLogprobs]],
                       vlm_config: VisionLanguageConfig, model_id: str):
     """Sanitize vllm output to be comparable with hf output.
     The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
     x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
     It also reduces `output_str` from "<image><image>bla" to "bla".
     """
-    output_ids, output_str = vllm_output
+    output_ids, output_str, out_logprobs = vllm_output
     image_token_id = vlm_config.image_token_id
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     image_token_str = tokenizer.decode(image_token_id)
+    eos_token_id = tokenizer.eos_token_id
 
     hf_output_ids = [
         token_id for idx, token_id in enumerate(output_ids)
         if token_id != image_token_id or output_ids[idx - 1] != image_token_id
     ]
+
     hf_output_str = output_str \
         .replace(image_token_str * vlm_config.image_feature_size, "")
+    assert hf_output_str[0] == " "
+    hf_output_str = hf_output_str[1:]
+    if hf_output_ids[-1] == eos_token_id:
+        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
 
-    return hf_output_ids, hf_output_str
+    return hf_output_ids, hf_output_str, out_logprobs
 
 
 def run_test(
@@ -66,8 +76,10 @@ def run_test(
     image_assets: _ImageAssets,
     model_and_config: Tuple[str, VisionLanguageConfig],
     *,
+    size_factors: List[float],
     dtype: str,
     max_tokens: int,
+    num_logprobs: int,
     tensor_parallel_size: int,
     distributed_executor_backend: Optional[str] = None,
 ):
@@ -81,61 +93,85 @@ def run_test(
     The text output is sanitized to be able to compare with hf.
     """
     model_id, vlm_config = model_and_config
-    hf_images = [asset.for_hf() for asset in image_assets]
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_image = [(
+        [prompt for _ in size_factors],
+        [rescale_image_size(image, factor) for factor in size_factors],
+    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
 
     # NOTE: take care of the order. run vLLM first, and then run HF.
     # vLLM needs a fresh new process without cuda initialization.
     # if we run HF first, the cuda initialization will be done and it
     # will hurt multiprocessing backend with fork method (the default method).
 
+    # max_model_len should be greater than image_feature_size
     with vllm_runner(model_id,
                      dtype=dtype,
                      tensor_parallel_size=tensor_parallel_size,
                      distributed_executor_backend=distributed_executor_backend,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
-
-        # NOTE: `asset.for_vllm` will call `torch.cuda.device_count()`
-        # we must put it inside the vllm_runner context manager
-        # i.e. after creating vLLM instance.
-        vllm_images = [asset.for_vllm() for asset in image_assets]
-
-        vllm_image_prompts = [
-            p.replace("<image>", "<image>" * vlm_config.image_feature_size)
-            for p in HF_IMAGE_PROMPTS
+        vllm_outputs_per_image = [
+            vllm_model.generate_greedy_logprobs(prompts,
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                images=images)
+            for prompts, images in inputs_per_image
         ]
 
-        vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
-                                                  max_tokens,
-                                                  images=vllm_images)
-
     with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
-        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
-                                              max_tokens,
-                                              images=hf_images)
-
-    check_outputs_equal(
-        hf_outputs,
-        [
-            vllm_to_hf_output(vllm_output, vlm_config, model_id)
-            for vllm_output in vllm_outputs
-        ],
-        name_0="hf",
-        name_1="vllm",
-    )
+        hf_outputs_per_image = [
+            hf_model.generate_greedy_logprobs_limit(prompts,
+                                                    max_tokens,
+                                                    num_logprobs=num_logprobs,
+                                                    images=images)
+            for prompts, images in inputs_per_image
+        ]
+
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
+                                        vllm_outputs_per_image):
+        # TODO: Check whether using original CLIPVisionModel can improve
+        # consistency against HF
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=[
+                vllm_to_hf_output(vllm_output, vlm_config, model_id)
+                for vllm_output in vllm_outputs
+            ],
+            name_0="hf",
+            name_1="vllm",
+        )
 
 
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No image
+        [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                dtype: str, max_tokens: int) -> None:
+                size_factors, dtype: str, max_tokens: int,
+                num_logprobs: int) -> None:
     run_test(
         hf_runner,
         vllm_runner,
         image_assets,
         model_and_config,
+        size_factors=size_factors,
         dtype=dtype,
         max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
         tensor_parallel_size=1,
     )
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index e9babba13..8817f41a6 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -1,12 +1,15 @@
-from typing import List, Tuple
+import re
+from typing import List, Optional, Tuple
 
 import pytest
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
+from vllm.multimodal.utils import rescale_image_size
+from vllm.sequence import SampleLogprobs
 
 from ..conftest import IMAGE_ASSETS
-from .utils import check_outputs_equal
+from .utils import check_logprobs_close
 
 pytestmark = pytest.mark.vlm
 
@@ -15,21 +18,20 @@ _PREFACE = (
     "The assistant gives helpful, detailed, and polite answers to the human's "
     "questions.")
 
-# The image token is placed before "user" on purpose so that the test can pass
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
-    f"{_PREFACE} <image>\nUSER: What's the content of the image?\nASSISTANT:",
+    f"{_PREFACE} USER: <image>\nWhat's the content of the image? ASSISTANT:",
     "cherry_blossom":
-    f"{_PREFACE} <image>\nUSER: What is the season?\nASSISTANT:",
+    f"{_PREFACE} USER: <image>\nWhat is the season? ASSISTANT:",
+    "boardwalk":
+    f"{_PREFACE} USER: <image>\nWhat's in this image? ASSISTANT:",
 })
 
 
 def iter_llava_next_configs(model_name: str):
+    # Need to use the max possible feature size for profile_run
     image_hw_to_feature_size = {
-        (336, 336): 1176,
-        (672, 672): 2928,
-        (1344, 336): 1944,
-        (336, 1344): 1890,
+        (336, 336): 2928,
     }
 
     for (h, w), f in image_hw_to_feature_size.items():
@@ -47,37 +49,55 @@ model_and_vl_config = [
 ]
 
 
-def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
+                                         Optional[SampleLogprobs]],
                       vlm_config: VisionLanguageConfig, model_id: str):
     """Sanitize vllm output to be comparable with hf output.
     The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
     x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
     It also reduces `output_str` from "<image><image>bla" to "bla".
     """
-    output_ids, output_str = vllm_output
+    output_ids, output_str, out_logprobs = vllm_output
     image_token_id = vlm_config.image_token_id
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     image_token_str = tokenizer.decode(image_token_id)
+    eos_token_id = tokenizer.eos_token_id
 
     hf_output_ids = [
         token_id for idx, token_id in enumerate(output_ids)
         if token_id != image_token_id or output_ids[idx - 1] != image_token_id
     ]
-    hf_output_str = output_str \
-        .replace(image_token_str * vlm_config.image_feature_size, " ")
 
-    return hf_output_ids, hf_output_str
+    hf_output_str = re.sub(fr"({image_token_str})+", "", output_str)
+    assert hf_output_str[0] == " "
+    hf_output_str = hf_output_str[1:]
+    if hf_output_ids[-1] == eos_token_id:
+        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
+
+    return hf_output_ids, hf_output_str, out_logprobs
 
 
-@pytest.mark.xfail(
-    reason="Inconsistent image processor being used due to lack "
-    "of support for dynamic image token replacement")
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No image
+        [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                dtype: str, max_tokens: int) -> None:
+                size_factors, dtype: str, max_tokens: int,
+                num_logprobs: int) -> None:
     """Inference result should be the same between hf and vllm.
 
     All the image fixtures for the test is under tests/images.
@@ -88,37 +108,46 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     The text output is sanitized to be able to compare with hf.
     """
     model_id, vlm_config = model_and_config
-    hf_images = [asset.for_hf() for asset in image_assets]
-    vllm_images = [asset.for_vllm() for asset in image_assets]
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_image = [(
+        [prompt for _ in size_factors],
+        [rescale_image_size(image, factor) for factor in size_factors],
+    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+
+    # max_model_len should be greater than image_feature_size
+    with vllm_runner(model_id,
+                     dtype=dtype,
+                     max_model_len=4096,
+                     enforce_eager=True,
+                     **vlm_config.as_cli_args_dict()) as vllm_model:
+        vllm_outputs_per_image = [
+            vllm_model.generate_greedy_logprobs(prompts,
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                images=images)
+            for prompts, images in inputs_per_image
+        ]
 
     with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
-        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
-                                              max_tokens,
-                                              images=hf_images)
-
-    vllm_image_prompts = [
-        p.replace("<image>", "<image>" * vlm_config.image_feature_size)
-        for p in HF_IMAGE_PROMPTS
-    ]
-
-    with vllm_runner(
-            model_id,
-            dtype=dtype,
-            # should be greater than image_feature_size
-            max_model_len=4096,
-            enforce_eager=True,
-            **vlm_config.as_cli_args_dict(),
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
-                                                  max_tokens,
-                                                  images=vllm_images)
-
-    check_outputs_equal(
-        hf_outputs,
-        [
-            vllm_to_hf_output(vllm_output, vlm_config, model_id)
-            for vllm_output in vllm_outputs
-        ],
-        name_0="hf",
-        name_1="vllm",
-    )
+        hf_outputs_per_image = [
+            hf_model.generate_greedy_logprobs_limit(prompts,
+                                                    max_tokens,
+                                                    num_logprobs=num_logprobs,
+                                                    images=images)
+            for prompts, images in inputs_per_image
+        ]
+
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
+                                        vllm_outputs_per_image):
+        # TODO: Check whether using original CLIPVisionModel can improve
+        # consistency against HF
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=[
+                vllm_to_hf_output(vllm_output, vlm_config, model_id)
+                for vllm_output in vllm_outputs
+            ],
+            name_0="hf",
+            name_1="vllm",
+        )
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 917bdbf94..f144f9755 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -1,29 +1,33 @@
+import re
 from typing import List, Optional, Tuple, Type
 
 import pytest
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
+from vllm.multimodal.utils import rescale_image_size
+from vllm.sequence import SampleLogprobs
 from vllm.utils import is_cpu
 
 from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from .utils import check_outputs_equal
+from .utils import check_logprobs_close
 
 pytestmark = pytest.mark.vlm
 
-# The image token is placed before "user" on purpose so that the test can pass
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
     "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
     "cherry_blossom":
-    "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n",  # noqa: E501
+    "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n",
+    "boardwalk":
+    "<|user|>\n<|image_1|>\nWhat's in this image?<|end|>\n<|assistant|>\n",
 })
 
 
 def iter_phi3v_configs(model_name: str):
+    # Need to use the max possible feature size for profile_run
     image_hw_to_feature_size = {
-        (1008, 1344): 1921,
-        (2016, 2688): 1933,
+        (1008, 1344): 2653,
     }
 
     for (h, w), f in image_hw_to_feature_size.items():
@@ -39,29 +43,29 @@ model_and_vl_config = [
 ]
 
 
-def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
+                                         Optional[SampleLogprobs]],
                       vlm_config: VisionLanguageConfig, model_id: str):
     """Sanitize vllm output to be comparable with hf output.
     The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
     x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
     It also reduces `output_str` from "<image><image>bla" to "bla".
     """
-    output_ids, output_str = vllm_output
-    image_token_id = vlm_config.image_token_id
+    output_ids, output_str, out_logprobs = vllm_output
 
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    image_token_str = tokenizer.decode(image_token_id)
-
-    hf_output_ids = [
-        token_id if token_id != image_token_id else 0
-        for idx, token_id in enumerate(output_ids)
-    ]
-    hf_output_str = output_str \
-        .replace(image_token_str * vlm_config.image_feature_size, "") \
-        .replace("<s>", " ").replace("<|user|>", "") \
+    output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
+    assert output_str_without_image[0] == " "
+    output_str_without_image = output_str_without_image[1:]
+
+    hf_output_str = output_str_without_image.replace("<|user|>", "") \
         .replace("<|end|>\n<|assistant|>", " ")
 
-    return hf_output_ids, hf_output_str
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    hf_output_ids = tokenizer.encode(output_str_without_image)
+    assert hf_output_ids[0] == 1
+    hf_output_ids = hf_output_ids[1:]
+
+    return hf_output_ids, hf_output_str, out_logprobs
 
 
 target_dtype = "half"
@@ -75,8 +79,10 @@ def run_test(
     image_assets: _ImageAssets,
     model_and_config: Tuple[str, VisionLanguageConfig],
     *,
+    size_factors: List[float],
     dtype: str,
     max_tokens: int,
+    num_logprobs: int,
     tensor_parallel_size: int,
     distributed_executor_backend: Optional[str] = None,
 ):
@@ -90,73 +96,91 @@ def run_test(
     The text output is sanitized to be able to compare with hf.
     """
     model_id, vlm_config = model_and_config
-    hf_images = [asset.for_hf() for asset in image_assets]
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_image = [(
+        [prompt for _ in size_factors],
+        [rescale_image_size(image, factor) for factor in size_factors],
+    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
 
     # NOTE: take care of the order. run vLLM first, and then run HF.
     # vLLM needs a fresh new process without cuda initialization.
     # if we run HF first, the cuda initialization will be done and it
     # will hurt multiprocessing backend with fork method (the default method).
 
+    # max_model_len should be greater than image_feature_size
     with vllm_runner(model_id,
-                     max_model_len=2048,
+                     max_model_len=4096,
                      dtype=dtype,
                      tensor_parallel_size=tensor_parallel_size,
-                     enforce_eager=True,
                      distributed_executor_backend=distributed_executor_backend,
+                     enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
-        # NOTE: `asset.for_vllm` will call `torch.cuda.device_count()`
-        # we must put it inside the vllm_runner context manager
-        # i.e. after creating vLLM instance.
-
-        vllm_images = [asset.for_vllm() for asset in image_assets]
-
-        vllm_image_prompts = [
-            p.replace("<|image_1|>",
-                      "<|image|>" * vlm_config.image_feature_size + "<s>")
-            for p in HF_IMAGE_PROMPTS
+        vllm_outputs_per_image = [
+            vllm_model.generate_greedy_logprobs(prompts,
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                images=vllm_images)
+            for prompts, vllm_images in inputs_per_image
         ]
 
-        vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
-                                                  max_tokens,
-                                                  images=vllm_images)
-
     # use eager mode for hf runner, since phi3_v didn't work with flash_attn
     hf_model_kwargs = {"_attn_implementation": "eager"}
     with hf_runner(model_id, dtype=dtype,
                    model_kwargs=hf_model_kwargs) as hf_model:
-        hf_outputs = hf_model.generate_greedy(
-            HF_IMAGE_PROMPTS,
-            max_tokens,
-            images=hf_images,
-            eos_token_id=hf_model.processor.tokenizer.eos_token_id)
-
-    check_outputs_equal(
-        hf_outputs,
-        [
-            vllm_to_hf_output(vllm_output, vlm_config, model_id)
-            for vllm_output in vllm_outputs
-        ],
-        name_0="hf",
-        name_1="vllm",
-    )
-
+        eos_token_id = hf_model.processor.tokenizer.eos_token_id
+        hf_outputs_per_image = [
+            hf_model.generate_greedy_logprobs_limit(prompts,
+                                                    max_tokens,
+                                                    num_logprobs=num_logprobs,
+                                                    images=hf_images,
+                                                    eos_token_id=eos_token_id)
+            for prompts, hf_images in inputs_per_image
+        ]
 
-# Since we use _attn_implementation="eager" for hf_runner, here is
-# numeric difference for longer context and test can't pass
-@pytest.mark.xfail(
-    reason="Inconsistent image processor being used due to lack "
-    "of support for dynamic image token replacement")
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
+                                        vllm_outputs_per_image):
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=[
+                vllm_to_hf_output(vllm_output, vlm_config, model_id)
+                for vllm_output in vllm_outputs
+            ],
+            name_0="hf",
+            name_1="vllm",
+        )
+
+
+# Since we use _attn_implementation="eager" for hf_runner, there is more
+# significant numerical difference. The basic `logprobs=5` fails to pass.
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No image
+        [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [10])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                dtype: str, max_tokens: int) -> None:
+                size_factors, dtype: str, max_tokens: int,
+                num_logprobs: int) -> None:
     run_test(
         hf_runner,
         vllm_runner,
         image_assets,
         model_and_config,
+        size_factors=size_factors,
         dtype=dtype,
         max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
         tensor_parallel_size=1,
     )
diff --git a/tests/models/utils.py b/tests/models/utils.py
index 0d5e304d8..51d57129d 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -1,11 +1,18 @@
-from typing import Dict, List, Tuple
+import warnings
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+from vllm.sequence import SampleLogprobs
 
 TokensText = Tuple[List[int], str]
 
 
-def check_outputs_equal(outputs_0_lst: List[TokensText],
-                        outputs_1_lst: List[TokensText], name_0: str,
-                        name_1: str):
+def check_outputs_equal(
+    *,
+    outputs_0_lst: Sequence[TokensText],
+    outputs_1_lst: Sequence[TokensText],
+    name_0: str,
+    name_1: str,
+):
     """
     Compare the two sequences generated by different models, 
     which should be equal.
@@ -18,20 +25,28 @@ def check_outputs_equal(outputs_0_lst: List[TokensText],
         output_ids_0, output_str_0 = outputs_0
         output_ids_1, output_str_1 = outputs_1
 
-        assert output_str_0 == output_str_1, (f"Test{prompt_idx}:"
-                                              f"\n{name_0}:\t{output_str_0!r}"
-                                              f"\n{name_1}:\t{output_str_1!r}")
-        assert output_ids_0 == output_ids_1, (f"Test{prompt_idx}:"
-                                              f"\n{name_0}:\t{output_str_0!r}"
-                                              f"\n{name_1}:\t{output_str_1!r}")
+        # The text and token outputs should exactly match
+        fail_msg = (f"Test{prompt_idx}:"
+                    f"\n{name_0}:\t{output_str_0!r}"
+                    f"\n{name_1}:\t{output_str_1!r}")
+
+        assert output_str_0 == output_str_1, fail_msg
+        assert output_ids_0 == output_ids_1, fail_msg
 
 
-TokensTextLogprobs = Tuple[List[int], str, List[Dict[int, float]]]
+TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int,
+                                                                    float]],
+                                                          SampleLogprobs]]]
 
 
-def check_logprobs_close(outputs_0_lst: List[TokensTextLogprobs],
-                         outputs_1_lst: List[TokensTextLogprobs], name_0: str,
-                         name_1: str):
+def check_logprobs_close(
+    *,
+    outputs_0_lst: Sequence[TokensTextLogprobs],
+    outputs_1_lst: Sequence[TokensTextLogprobs],
+    name_0: str,
+    name_1: str,
+    warn_on_mismatch: bool = True,
+):
     """
     Compare the logprobs of two sequences generated by different models,
     which should be similar but not necessarily equal.
@@ -45,21 +60,52 @@ def check_logprobs_close(outputs_0_lst: List[TokensTextLogprobs],
         output_ids_0, output_str_0, logprobs_0 = outputs_0
         output_ids_1, output_str_1, logprobs_1 = outputs_1
 
+        if logprobs_0 is None:
+            logprobs_0 = [None] * len(output_ids_0)
+        if logprobs_1 is None:
+            logprobs_1 = [None] * len(output_ids_1)
+
         # Loop through generated tokens.
         for idx, (output_id_0,
                   output_id_1) in enumerate(zip(output_ids_0, output_ids_1)):
 
             # If generated tokens don't match, then
             if output_id_0 != output_id_1:
+                logprobs_elem_0 = logprobs_0[idx]
+                logprobs_elem_1 = logprobs_1[idx]
+
                 # Each predicted token must be in top N logprobs of the other
-                assert output_id_0 in logprobs_1[idx], (
-                    f"Test{prompt_idx}:"
-                    f"\n{name_0}:\t{output_str_0!r}"
-                    f"\n{name_1}:\t{output_str_1!r}")
-                assert output_id_1 in logprobs_0[idx], (
+                fail_msg = (
                     f"Test{prompt_idx}:"
-                    f"\n{name_0}:\t{output_str_0!r}"
-                    f"\n{name_1}:\t{output_str_1!r}")
+                    f"\n{name_0}:\t{output_str_0!r}\t{logprobs_elem_0}"
+                    f"\n{name_1}:\t{output_str_1!r}\t{logprobs_elem_1}")
+
+                assert logprobs_elem_0 is not None, fail_msg
+                assert logprobs_elem_1 is not None, fail_msg
+                assert output_id_0 in logprobs_elem_1, fail_msg
+                assert output_id_1 in logprobs_elem_0, fail_msg
+
+                if warn_on_mismatch:
+                    with warnings.catch_warnings():
+                        # This ensures that repeated warnings are shown
+                        # in the output, not just the first occurrence
+                        warnings.simplefilter("always")
+
+                        warnings.warn(fail_msg, stacklevel=2)
 
                 # Break out since sequences will now diverge.
                 break
+        else:
+            if output_str_0 != output_str_1 and warn_on_mismatch:
+                # The token outputs exactly match,
+                # so the text outputs should exactly match as well
+                fail_msg = (f"Test{prompt_idx}:"
+                            f"\n{name_0}:\t{output_str_0!r}"
+                            f"\n{name_1}:\t{output_str_1!r}")
+
+                with warnings.catch_warnings():
+                    # This ensures that repeated warnings are shown
+                    # in the output, not just the first occurrence
+                    warnings.simplefilter("always")
+
+                    warnings.warn(fail_msg, stacklevel=2)
diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py
index bdbbd9abf..321566ad5 100644
--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
@@ -4,12 +4,12 @@ from transformers import CLIPImageProcessor, LlavaNextImageProcessor
 
 from vllm.config import ModelConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-
-from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE
+from vllm.multimodal.utils import rescale_image_size
 
 
 @pytest.mark.parametrize("dtype", ["half", "float"])
-def test_clip_image_processor(image_assets, dtype):
+@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
+def test_clip_image_processor(image_assets, dtype, size_factor):
     MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
 
     hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
@@ -26,13 +26,15 @@ def test_clip_image_processor(image_assets, dtype):
     )
 
     for asset in image_assets:
+        image = rescale_image_size(asset.pil_image, size_factor)
+
         hf_result = hf_processor.preprocess(
-            asset.pil_image,
+            image,
             return_tensors="pt",
-        ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
+        )
         vllm_result = MULTIMODAL_REGISTRY.map_input(
             model_config,
-            {"image": asset.pil_image},
+            {"image": image},
         )
 
         assert hf_result.keys() == vllm_result.keys()
@@ -44,12 +46,10 @@ def test_clip_image_processor(image_assets, dtype):
             assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"
 
 
-@pytest.mark.xfail(
-    reason="Inconsistent image processor being used due to lack "
-    "of support for dynamic image token replacement")
 @pytest.mark.parametrize("dtype", ["half", "float"])
-def test_llava_next_image_processor(image_assets, dtype):
-    MODEL_NAME = "llava-hf/llava-v1.6-34b-hf"
+@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
+def test_llava_next_image_processor(image_assets, dtype, size_factor):
+    MODEL_NAME = "llava-hf/llava-v1.6-vicuna-7b-hf"
 
     hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)
     assert isinstance(hf_processor, LlavaNextImageProcessor)
@@ -65,13 +65,15 @@ def test_llava_next_image_processor(image_assets, dtype):
     )
 
     for asset in image_assets:
+        image = rescale_image_size(asset.pil_image, size_factor)
+
         hf_result = hf_processor.preprocess(
-            asset.pil_image,
+            image,
             return_tensors="pt",
-        ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
+        )
         vllm_result = MULTIMODAL_REGISTRY.map_input(
             model_config,
-            {"image": asset.pil_image},
+            {"image": image},
         )
 
         assert hf_result.keys() == vllm_result.keys()
@@ -81,36 +83,3 @@ def test_llava_next_image_processor(image_assets, dtype):
 
             assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"
             assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"
-
-
-@pytest.mark.xfail(
-    reason="Example image pixels were not processed using HuggingFace")
-@pytest.mark.parametrize("dtype", ["float"])
-def test_image_pixel_types(image_assets, dtype):
-    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
-
-    model_config = ModelConfig(
-        model=MODEL_NAME,
-        tokenizer=MODEL_NAME,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype=dtype,
-        revision=None,
-    )
-    for asset in image_assets:
-        image_result = MULTIMODAL_REGISTRY.map_input(
-            model_config,
-            {"image": asset.pil_image},
-        )
-        tensor_result = MULTIMODAL_REGISTRY.map_input(
-            model_config,
-            {"image": asset.pil_image},
-        )
-
-        assert image_result.keys() == tensor_result.keys()
-        for key, image_arr in image_result.items():
-            tensor_arr: np.ndarray = tensor_result[key].numpy()
-
-            assert image_arr.shape == tensor_arr.shape, f"Failed for key={key}"
-            assert np.allclose(image_arr, tensor_arr), f"Failed for key={key}"
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index 5a6395ac9..10cabdadb 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -5,10 +5,9 @@ from typing import Dict, Tuple
 
 import numpy as np
 import pytest
-import pytest_asyncio
 from PIL import Image
 
-from vllm.multimodal.utils import ImageFetchAiohttp
+from vllm.multimodal.utils import ImageFetchAiohttp, fetch_image
 
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
 TEST_IMAGE_URLS = [
@@ -19,12 +18,9 @@ TEST_IMAGE_URLS = [
 ]
 
 
-@pytest_asyncio.fixture(scope="session")
-async def url_images() -> Dict[str, Image.Image]:
-    return {
-        image_url: await ImageFetchAiohttp.fetch_image(image_url)
-        for image_url in TEST_IMAGE_URLS
-    }
+@pytest.fixture(scope="module")
+def url_images() -> Dict[str, Image.Image]:
+    return {image_url: fetch_image(image_url) for image_url in TEST_IMAGE_URLS}
 
 
 def get_supported_suffixes() -> Tuple[str, ...]:
@@ -41,7 +37,15 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool:
     return (np.asarray(a) == np.asarray(b.convert(a.mode))).all()
 
 
-@pytest.mark.asyncio
+@pytest.mark.asyncio(scope="module")
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+async def test_fetch_image_http(image_url: str):
+    image_sync = fetch_image(image_url)
+    image_async = await ImageFetchAiohttp.fetch_image(image_url)
+    assert _image_equals(image_sync, image_async)
+
+
+@pytest.mark.asyncio(scope="module")
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
 @pytest.mark.parametrize("suffix", get_supported_suffixes())
 async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
@@ -68,8 +72,11 @@ async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
         base64_image = base64.b64encode(f.read()).decode("utf-8")
         data_url = f"data:{mime_type};base64,{base64_image}"
 
-        data_image = await ImageFetchAiohttp.fetch_image(data_url)
+        data_image_sync = fetch_image(data_url)
         if _image_equals(url_image, Image.open(f)):
-            assert _image_equals(url_image, data_image)
+            assert _image_equals(url_image, data_image_sync)
         else:
             pass  # Lossy format; only check that image can be opened
+
+        data_image_async = await ImageFetchAiohttp.fetch_image(data_url)
+        assert _image_equals(data_image_sync, data_image_async)
diff --git a/vllm/config.py b/vllm/config.py
index 8c449323f..de8e119c9 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -5,7 +5,7 @@ from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Tuple,
                     Union)
 
 import torch
-from transformers import PretrainedConfig, PreTrainedTokenizerBase
+from transformers import PretrainedConfig
 
 import vllm.envs as envs
 from vllm.logger import init_logger
@@ -1303,16 +1303,6 @@ class VisionLanguageConfig:
     image_input_shape: tuple
     image_feature_size: int
 
-    #TODO(ywang96): make this a cached property once we refactor the
-    # VisionLanguageConfig class.
-    def get_image_token_text(
-            self, tokenizer: PreTrainedTokenizerBase) -> Tuple[str, str]:
-        """Get the image token placeholder text to be inserted into the 
-        text prompt and the string representation of the image token id.
-        """
-        image_token_str = tokenizer.decode(self.image_token_id)
-        return image_token_str * self.image_feature_size, image_token_str
-
     def as_cli_args_dict(self) -> Dict[str, Any]:
         """Flatten vision language config to pure args.
 
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index e5b6b7f57..57ad7bdd3 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -1,6 +1,7 @@
 import codecs
 import time
 from dataclasses import dataclass, field
+from functools import cached_property
 from typing import (AsyncGenerator, AsyncIterator, Awaitable, Dict, Iterable,
                     List, Optional)
 from typing import Sequence as GenericSequence
@@ -10,7 +11,7 @@ from fastapi import Request
 from openai.types.chat import (ChatCompletionContentPartImageParam,
                                ChatCompletionContentPartTextParam)
 
-from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.config import ModelConfig
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionContentPartParam, ChatCompletionLogProb,
@@ -27,8 +28,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding import (
     get_guided_decoding_logits_processor)
 from vllm.multimodal import MultiModalDataDict
-from vllm.multimodal.utils import (async_get_and_parse_image,
-                                   get_full_image_text_prompt)
+from vllm.multimodal.utils import async_get_and_parse_image
 from vllm.outputs import RequestOutput
 from vllm.sequence import Logprob
 from vllm.tracing import (contains_trace_headers, extract_trace_headers,
@@ -97,6 +97,36 @@ class OpenAIServingChat(OpenAIServing):
             logger.warning(
                 "No chat template provided. Chat API will not work.")
 
+    @cached_property
+    def image_token_str(self) -> Optional[str]:
+        # TODO: Let user specify how to insert image tokens into prompt
+        # (similar to chat template)
+        model_type = self.model_config.hf_config.model_type
+        if model_type == "phi3_v":
+            # Workaround since this token is not defined in the tokenizer
+            return "<|image_1|>"
+        if model_type in ("blip-2", "chatglm", "fuyu", "minicpmv",
+                          "paligemma"):
+            # These models do not use image tokens in the prompt
+            return None
+
+        # The default behaviour assumes that the image token is
+        # available to the tokenizer.
+        # (Suitable for LLaVA, Idefics2, DeepSeek-VL)
+        vlm_config = self.model_config.multimodal_config
+        if vlm_config is None:
+            raise ValueError(
+                "'image_url' input is not supported as the loaded "
+                "model is not multimodal.")
+
+        image_token_id = vlm_config.image_token_id
+        if vlm_config.image_token_id is None:
+            raise ValueError(
+                "'image_url' input is not supported as the loaded "
+                "model does not specify an image token.")
+
+        return self.tokenizer.decode(image_token_id)
+
     def _parse_chat_message_content_parts(
         self,
         role: str,
@@ -105,21 +135,26 @@ class OpenAIServingChat(OpenAIServing):
         texts: List[str] = []
         mm_futures: List[Awaitable[MultiModalDataDict]] = []
 
-        vlm_config: Optional[VisionLanguageConfig] = getattr(
-            self.engine.engine, "vision_language_config", None)
-        model_config = getattr(self.engine.engine, "model_config", None)
-
         for part in parts:
             part_type = part["type"]
             if part_type == "text":
                 text = cast(ChatCompletionContentPartTextParam, part)["text"]
                 texts.append(text)
             elif part_type == "image_url":
-                if vlm_config is None:
-                    raise ValueError(
-                        "'image_url' input is not supported as the loaded "
-                        "model is not multimodal.")
-                assert self.tokenizer is not None
+                if len(mm_futures) > 0:
+                    raise NotImplementedError(
+                        "Multiple 'image_url' input is currently not supported."
+                    )
+
+                image_token_str = self.image_token_str
+                if image_token_str is not None:
+                    if any(image_token_str in text for text in texts):
+                        logger.warning(
+                            "Detected image token string in the text prompt. "
+                            "Skipping prompt formatting.")
+                    else:
+                        texts.append(image_token_str)
+
                 image_url = cast(ChatCompletionContentPartImageParam,
                                  part)["image_url"]
 
@@ -128,43 +163,13 @@ class OpenAIServingChat(OpenAIServing):
                         "'image_url.detail' is currently not supported and "
                         "will be ignored.")
 
-                mm_future = async_get_and_parse_image(image_url["url"])
-                mm_futures.append(mm_future)
-
+                image_future = async_get_and_parse_image(image_url["url"])
+                mm_futures.append(image_future)
             else:
                 raise NotImplementedError(f"Unknown part type: {part_type}")
 
         text_prompt = "\n".join(texts)
-
-        if vlm_config is not None and len(mm_futures):
-
-            assert len(
-                mm_futures
-            ) == 1, "Multiple 'image_url' input is currently not supported."
-            (image_token_prompt,
-             image_token_str) = vlm_config.get_image_token_text(self.tokenizer)
-
-            # NOTE: If image token string (e.g, <image>) is already present
-            # in the text prompt, we assume it follows the same format required
-            # by the engine.
-            if image_token_str in text_prompt:
-                logger.warning(
-                    "Detected image token string in the text prompt. "
-                    "Skipping prompt formatting.")
-                messages = [
-                    ConversationMessage(role=role, content=text_prompt)
-                ]
-
-            else:
-                full_prompt = get_full_image_text_prompt(
-                    image_prompt=image_token_prompt,
-                    text_prompt=text_prompt,
-                    config=model_config)
-                messages = [
-                    ConversationMessage(role=role, content=full_prompt)
-                ]
-        else:
-            messages = [ConversationMessage(role=role, content=text_prompt)]
+        messages = [ConversationMessage(role=role, content=text_prompt)]
 
         return ChatMessageParseResult(messages=messages, mm_futures=mm_futures)
 
@@ -267,7 +272,7 @@ class OpenAIServingChat(OpenAIServing):
             "prompt": prompt_text,
             "prompt_token_ids": prompt_ids,
         }
-        if mm_data is not None:
+        if mm_data:
             inputs["multi_modal_data"] = mm_data
 
         is_tracing_enabled = await self.engine.is_tracing_enabled()
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 84e412772..8d281c51f 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -36,6 +36,7 @@ class OpenAIServing:
         super().__init__()
 
         self.engine = engine
+        self.model_config = model_config
         self.max_model_len = model_config.max_model_len
 
         # A separate tokenizer to map token IDs to strings.
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 3e2873338..936909eb3 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -140,7 +140,8 @@ class InputRegistry:
 
         The model is identified by ``model_config``.
 
-        TODO: Add guide [ref: PR #5276]
+        See also:
+            :ref:`adding_a_new_multimodal_model`
         """
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index 5212e2808..4533e8cbd 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -8,10 +8,14 @@ from PIL import Image
 from transformers import CLIPVisionConfig
 from transformers.models.clip.modeling_clip import CLIPAttention
 
+from vllm.config import ModelConfig
+from vllm.inputs import LLMInputs
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.multimodal.image import (cached_get_tokenizer,
+                                   repeat_and_pad_image_tokens)
 from vllm.sequence import SequenceData
 
 
@@ -64,6 +68,39 @@ def dummy_image_for_clip(
     return {"image": image}
 
 
+def input_processor_for_clip(
+    model_config: ModelConfig,
+    hf_config: CLIPVisionConfig,
+    llm_inputs: LLMInputs,
+    *,
+    image_token_id: int,
+    image_feature_size_override: Optional[int] = None,
+):
+    multi_modal_data = llm_inputs.get("multi_modal_data")
+    if multi_modal_data is None or "image" not in multi_modal_data:
+        return llm_inputs
+
+    tokenizer = cached_get_tokenizer(model_config.tokenizer)
+
+    if image_feature_size_override is None:
+        image_feature_size = get_clip_image_feature_size(hf_config)
+    else:
+        image_feature_size = image_feature_size_override
+
+    new_prompt, new_token_ids = repeat_and_pad_image_tokens(
+        tokenizer,
+        llm_inputs.get("prompt"),
+        llm_inputs["prompt_token_ids"],
+        image_token_id=image_token_id,
+        repeat_count=image_feature_size,
+    )
+
+    # NOTE: Create a defensive copy of the original inputs
+    return LLMInputs(prompt_token_ids=new_token_ids,
+                     prompt=new_prompt,
+                     multi_modal_data=multi_modal_data)
+
+
 # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa
 class CLIPVisionEmbeddings(nn.Module):
 
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index bbec4dbd8..2588d8b06 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -6,7 +6,7 @@ from transformers import CLIPVisionConfig, LlavaConfig
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, VisionLanguageConfig
-from vllm.inputs import INPUT_REGISTRY, InputContext
+from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
@@ -20,8 +20,10 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import IntermediateTensors, SamplerOutput
 
-from .clip import dummy_image_for_clip, dummy_seq_data_for_clip
+from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip,
+                   input_processor_for_clip)
 from .interfaces import SupportsVision
+from .utils import merge_vision_embeddings
 
 _KEYS_TO_MODIFY_MAPPING = {
     "language_model.lm_head": "lm_head",
@@ -51,28 +53,10 @@ class LlavaMultiModalProjector(nn.Module):
         return hidden_states
 
 
-def merge_vision_embeddings(input_ids: torch.Tensor,
-                            inputs_embeds: torch.Tensor,
-                            vision_embeddings: torch.Tensor,
-                            image_token_id: int) -> torch.Tensor:
-    """In place merges in vision_embeddings with inputs_embeds."""
-    mask = (input_ids == image_token_id)
-
-    image_feature_size = vision_embeddings.shape[0] * vision_embeddings.shape[1]
-    if mask.sum() != image_feature_size:
-        raise ValueError(f"image_feature_size should be {image_feature_size}, "
-                         f"but found: {mask.sum()}")
-
-    inputs_embeds[mask] = vision_embeddings.view(image_feature_size,
-                                                 vision_embeddings.shape[-1])
-
-    return inputs_embeds
-
-
 class LlavaImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: torch.Tensor
-    """Shape: (batch_size, num_channels, height, width)"""
+    """Shape: `(batch_size, num_channels, height, width)`"""
 
 
 LlavaImageInputs = LlavaImagePixelInputs
@@ -96,8 +80,30 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int):
     raise NotImplementedError(msg)
 
 
+def input_processor_for_llava(ctx: InputContext, llm_inputs: LLMInputs):
+    multi_modal_data = llm_inputs.get("multi_modal_data")
+    if multi_modal_data is None or "image" not in multi_modal_data:
+        return llm_inputs
+
+    model_config = ctx.model_config
+    hf_config = ctx.get_hf_config(LlavaConfig)
+    vision_config = hf_config.vision_config
+
+    if isinstance(vision_config, CLIPVisionConfig):
+        return input_processor_for_clip(
+            model_config,
+            vision_config,
+            llm_inputs,
+            image_token_id=hf_config.image_token_index,
+        )
+
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
+
+
 @MULTIMODAL_REGISTRY.register_image_input_mapper()
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava)
+@INPUT_REGISTRY.register_input_processor(input_processor_for_llava)
 class LlavaForConditionalGeneration(nn.Module, SupportsVision):
 
     def __init__(self,
@@ -112,7 +118,6 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
 
         # TODO: Optionally initializes this for supporting embeddings.
         self.vision_tower = CLIPVisionModel(config.vision_config)
-
         self.multi_modal_projector = LlavaMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
             text_hidden_size=config.text_config.hidden_size,
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index f67598c40..92604cdf3 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -1,4 +1,4 @@
-from typing import Dict, Iterable, List, Literal, Optional, Tuple, TypedDict
+from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
 
 import torch
 import torch.nn as nn
@@ -10,7 +10,7 @@ from typing_extensions import NotRequired
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, VisionLanguageConfig
-from vllm.inputs import INPUT_REGISTRY, InputContext
+from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
@@ -21,13 +21,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensors
 from vllm.sequence import IntermediateTensors, SamplerOutput
 
 from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip,
-                   get_clip_patch_grid_length)
+                   get_clip_patch_grid_length, input_processor_for_clip)
 from .interfaces import SupportsVision
-from .llava import LlavaMultiModalProjector, merge_vision_embeddings
+from .llava import LlavaMultiModalProjector
+from .utils import merge_vision_embeddings
 
 logger = init_logger(__name__)
 
@@ -39,16 +40,27 @@ _KEYS_TO_MODIFY_MAPPING = {
 
 class LlavaNextImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
-    data: torch.Tensor
-    """Shape: (batch_size, 1 + num_patches, num_channels, height, width)"""
+    data: BatchedTensors
+    """
+    Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
+
+    Note that `num_patches` may be different for each batch.
+    """
 
     image_sizes: NotRequired[torch.Tensor]
-    """Shape: (batch_size, 2)"""
+    """
+    Shape: `(batch_size, 2)`
+
+    This should be in `(height, width)` format.
+    """
 
 
 LlavaNextImageInputs = LlavaNextImagePixelInputs
 
 
+# Taken from: https://github.com/huggingface/text-generation-inference/blob/v2.0.4/server/text_generation_server/models/vlm_causal_lm.py#L91
+# NOTE: new_height and new_width are further incremented to properly invert the
+# floordiv operation: https://github.com/huggingface/transformers/blob/v4.42.2/src/transformers/models/llava_next/modeling_llava_next.py#L133
 def _get_llava_next_num_unpadded_features(
     height: int,
     width: int,
@@ -56,7 +68,6 @@ def _get_llava_next_num_unpadded_features(
     num_patch_height: int,
     num_patch_width: int,
 ) -> Tuple[int, int]:
-    # Taken from: https://github.com/huggingface/text-generation-inference/blob/799a193b109662743bed1b18a09af1fdcd508c8b/server/text_generation_server/models/vlm_causal_lm.py#L111
     current_height = npatches * num_patch_height
     current_width = npatches * num_patch_width
 
@@ -64,9 +75,13 @@ def _get_llava_next_num_unpadded_features(
     current_aspect_ratio: float = current_width / current_height
     if aspect_ratio > current_aspect_ratio:
         new_height = (height * current_width) // width
+        if new_height % 2 == 1:
+            new_height += 1
         current_height = new_height
     else:
         new_width = (width * current_height) // height
+        if new_width % 2 == 1:
+            new_width += 1
         current_width = new_width
 
     unpadded_features = current_height * current_width
@@ -74,7 +89,8 @@ def _get_llava_next_num_unpadded_features(
     return (unpadded_features, newline_features)
 
 
-def _get_llava_next_image_feature_size(
+# Based on: https://github.com/huggingface/text-generation-inference/blob/v2.0.4/server/text_generation_server/models/vlm_causal_lm.py#L111
+def get_llava_next_image_feature_size(
     hf_config: LlavaNextConfig,
     *,
     input_height: int,
@@ -89,7 +105,9 @@ def _get_llava_next_image_feature_size(
         )
         base_feature_size = num_patches * num_patches
 
-        num_patch_height, num_patch_width = get_anyres_image_grid_shape(
+        # Note: We follow the "wrong" width/height order
+        # [ref: PR huggingface/transformers#31588]
+        num_patch_width, num_patch_height = get_anyres_image_grid_shape(
             image_size=(input_height, input_width),
             grid_pinpoints=hf_config.image_grid_pinpoints,
             patch_size=vision_config.image_size,
@@ -110,14 +128,16 @@ def _get_llava_next_image_feature_size(
 
 
 def dummy_data_for_llava_next(ctx: InputContext, seq_len: int):
-    multimodal_config = ctx.get_multimodal_config()
     hf_config = ctx.get_hf_config(LlavaNextConfig)
     vision_config = hf_config.vision_config
 
-    #TODO: change the logic for dummy data to support dynamic shape
-    _, _, dummy_height, dummy_width = multimodal_config.image_input_shape
-    image_feature_size = _get_llava_next_image_feature_size(
-        hf_config, input_height=dummy_height, input_width=dummy_width)
+    # Result in the max possible feature size (2x2 grid of 336x336px tiles)
+    dummy_height = dummy_width = 448
+    image_feature_size = get_llava_next_image_feature_size(
+        hf_config,
+        input_height=dummy_height,
+        input_width=dummy_width,
+    )
 
     if isinstance(vision_config, CLIPVisionConfig):
         seq_data = dummy_seq_data_for_clip(
@@ -139,27 +159,47 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int):
     raise NotImplementedError(msg)
 
 
-def _pixel_mapper(ctx: InputContext, image: object) -> Dict[str, torch.Tensor]:
+def input_processor_for_llava_next(ctx: InputContext, llm_inputs: LLMInputs):
+    multi_modal_data = llm_inputs.get("multi_modal_data")
+    if multi_modal_data is None or "image" not in multi_modal_data:
+        return llm_inputs
 
-    if isinstance(image, Image.Image):
+    model_config = ctx.model_config
+    hf_config = ctx.get_hf_config(LlavaNextConfig)
+    vision_config = hf_config.vision_config
 
-        # Temporary patch before dynamic number of image tokens is supported
-        _, _, h, w = ctx.get_multimodal_config().image_input_shape
-        if (w, h) != (image.width, image.height):
-            logger.warning(
-                "Dynamic image shape is currently not supported. "
-                "Resizing input image to (%d, %d).", w, h)
+    image_data = multi_modal_data["image"]
+    if isinstance(image_data, Image.Image):
+        width, height = image_data.size
+
+        image_feature_size = get_llava_next_image_feature_size(
+            hf_config,
+            input_height=height,
+            input_width=width,
+        )
+    elif isinstance(image_data, torch.Tensor):
+        raise NotImplementedError("Embeddings input is not supported yet")
+    else:
+        raise TypeError(f"Invalid image type: {type(image_data)}")
 
-            image = image.resize((w, h))
+    vision_config = hf_config.vision_config
 
-        return MULTIMODAL_REGISTRY._get_plugin("image") \
-            ._default_input_mapper(ctx, image)
+    if isinstance(vision_config, CLIPVisionConfig):
+        return input_processor_for_clip(
+            model_config,
+            vision_config,
+            llm_inputs,
+            image_token_id=hf_config.image_token_index,
+            image_feature_size_override=image_feature_size,
+        )
 
-    raise TypeError(f"Invalid type for 'image': {type(image)}")
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
 
 
-@MULTIMODAL_REGISTRY.register_image_input_mapper(_pixel_mapper)
+@MULTIMODAL_REGISTRY.register_image_input_mapper()
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next)
+@INPUT_REGISTRY.register_input_processor(input_processor_for_llava_next)
 class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
 
     def __init__(self,
@@ -172,8 +212,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
         self.config = config
         self.vlm_config = vlm_config
 
+        # TODO: Optionally initializes this for supporting embeddings.
         self.vision_tower = CLIPVisionModel(config=config.vision_config)
-
         self.multi_modal_projector = LlavaMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
             text_hidden_size=config.text_config.hidden_size,
@@ -196,24 +236,6 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
         self.image_newline = nn.Parameter(
             torch.empty(config.text_config.hidden_size))
 
-    def _validate_image_pixels(self, data: torch.Tensor) -> torch.Tensor:
-        _, num_channels, _, _ = self.vlm_config.image_input_shape
-
-        # Note that this is different from that of vLLM vision_language_config
-        # since the image is resized by the HuggingFace preprocessor
-        height = width = self.config.vision_config.image_size
-
-        if list(data.shape[2:]) != [num_channels, height, width]:
-            raise ValueError(
-                f"The expected image tensor shape is batch dimension plus "
-                f"num_patches plus {[num_channels, height, width]}. "
-                f"You supplied {data.shape}. "
-                f"If you are using vLLM's entrypoint, make sure your "
-                f"supplied image input is consistent with "
-                f"image_input_shape in engine args.")
-
-        return data
-
     def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor:
         if list(data.shape[1:]) != [2]:
             raise ValueError(
@@ -223,14 +245,14 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
         return data
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[LlavaNextImageInputs]:
+            self, **kwargs: object) -> Optional[LlavaNextImagePixelInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
 
-        if pixel_values is None or image_sizes is None:
+        if pixel_values is None:
             return None
 
-        if not isinstance(pixel_values, torch.Tensor):
+        if not isinstance(pixel_values, (torch.Tensor, list)):
             raise ValueError("Incorrect type of pixel values. "
                              f"Got type: {type(pixel_values)}")
 
@@ -240,7 +262,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
 
         return LlavaNextImagePixelInputs(
             type="pixel_values",
-            data=self._validate_image_pixels(pixel_values),
+            data=pixel_values,
             image_sizes=self._validate_image_sizes(image_sizes),
         )
 
@@ -267,15 +289,14 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
             strategy=self.config.vision_feature_select_strategy,
         )
 
+    # Based on: https://github.com/haotian-liu/LLaVA/blob/main/llava/model/llava_arch.py
     def _merge_image_patch_embeddings(self, image_size: torch.Tensor,
                                       patch_embeddings: torch.Tensor, *,
                                       strategy: str) -> torch.Tensor:
-        # Based on: https://github.com/haotian-liu/LLaVA/blob/main/llava/model/llava_arch.py
         if strategy == "flat":
             return patch_embeddings.flatten(0, 1)
 
         if strategy.startswith("spatial"):
-            orig_width, orig_height = image_size
             height = width = self.config.vision_config.image_size \
                 // self.config.vision_config.patch_size
 
@@ -289,13 +310,15 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
                 other_patch_embeds = patch_embeddings[1:]
 
                 # image_aspect_ratio == "anyres"
+                # Note: We follow the "wrong" width/height order
+                # [ref: PR huggingface/transformers#31588]
                 num_patch_width, num_patch_height = get_anyres_image_grid_shape(
-                    (orig_width, orig_height),
+                    image_size,
                     self.config.image_grid_pinpoints,
                     self.config.vision_config.image_size,
                 )
                 other_patch_embeds = other_patch_embeds \
-                    .view(num_patch_width, num_patch_height, height, width, -1)
+                    .view(num_patch_height, num_patch_width, height, width, -1)
 
                 if "unpad" in strategy:
                     other_patch_embeds = other_patch_embeds \
@@ -333,44 +356,53 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
         raise ValueError(f"Unexpected patch merge strategy: {strategy}")
 
     def _process_image_pixels(
-            self, inputs: LlavaNextImagePixelInputs) -> torch.Tensor:
+        self,
+        inputs: LlavaNextImagePixelInputs,
+    ) -> BatchedTensors:
         assert self.vision_tower is not None
 
         pixel_values = inputs["data"]
 
-        b, num_patches, c, h, w = pixel_values.shape
-        stacked_pixel_values = pixel_values.view(b * num_patches, c, h, w)
+        if isinstance(pixel_values, torch.Tensor):
+            b, num_patches, c, h, w = pixel_values.shape
+            stacked_pixel_values = pixel_values.view(b * num_patches, c, h, w)
+            stacked_image_features = self._image_pixels_to_features(
+                self.vision_tower, stacked_pixel_values)
+            stacked_patch_embeddings = self.multi_modal_projector(
+                stacked_image_features)
 
+            return stacked_patch_embeddings.view(
+                b, num_patches, *stacked_patch_embeddings.shape[1:])
+
+        num_patches_per_batch = [v.shape[0] for v in pixel_values]
+        stacked_pixel_values = torch.cat(pixel_values)
         stacked_image_features = self._image_pixels_to_features(
             self.vision_tower, stacked_pixel_values)
 
-        return stacked_image_features.view(b, num_patches,
-                                           *stacked_image_features.shape[-2:])
+        return [
+            self.multi_modal_projector(image_features) for image_features in
+            torch.split(stacked_image_features, num_patches_per_batch)
+        ]
 
     def _process_image_input(
-            self, image_input: LlavaNextImageInputs) -> torch.Tensor:
-        assert self.vision_tower is not None
-        image_features = self._process_image_pixels(image_input)
-
-        patch_embeddings = self.multi_modal_projector(image_features)
+            self, image_input: LlavaNextImageInputs) -> BatchedTensors:
+        patch_embeddings = self._process_image_pixels(image_input)
 
         image_sizes = image_input.get("image_sizes")
         if image_sizes is None:
-            batch_size = image_input["data"].shape[0]
+            batch_size = len(image_input["data"])
             vision_config = self.config.vision_config
-            default_width = default_height = vision_config.image_size
-            image_sizes = torch.as_tensor([[default_width, default_height]
+            default_height = default_width = vision_config.image_size
+            image_sizes = torch.as_tensor([[default_height, default_width]
                                            for _ in range(batch_size)])
 
-        merged_patch_embeddings = [
+        return [
             self._merge_image_patch_embeddings(image_sizes[i],
-                                               patch_features,
+                                               patch_features_batch,
                                                strategy="spatial_unpad")
-            for i, patch_features in enumerate(patch_embeddings)
+            for i, patch_features_batch in enumerate(patch_embeddings)
         ]
 
-        return torch.stack(merged_patch_embeddings, dim=0)
-
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -404,8 +436,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
             pixel_values: The pixels in each grid patch for each input image.
-                Expects a batch with shape `[1, num_patches, 3, 336, 336]`.
-            image_sizes: The original `(width, height)` for each input image.
+                Expects a batch with shape `[1, num_patches, 3, h, w]`.
+            image_sizes: The original `(height, width)` for each input image.
                 Expects a batch with shape `[1, 2]`.
 
         See also:
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index d73a42026..3d247c9ed 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -13,7 +13,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Dict, Iterable, List, Literal, Optional, Tuple, TypedDict
+import re
+from functools import lru_cache
+from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
 
 import numpy as np
 import torch
@@ -22,8 +24,8 @@ from PIL import Image
 from transformers import CLIPVisionConfig, PretrainedConfig
 
 from vllm.attention import AttentionMetadata
-from vllm.config import CacheConfig, VisionLanguageConfig
-from vllm.inputs import INPUT_REGISTRY, InputContext
+from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig
+from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
@@ -34,10 +36,12 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensors
+from vllm.multimodal.image import cached_get_tokenizer
 from vllm.sequence import IntermediateTensors, SamplerOutput
 
-from .clip import dummy_image_for_clip, dummy_seq_data_for_clip
+from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip,
+                   input_processor_for_clip)
 from .interfaces import SupportsVision
 
 logger = init_logger(__name__)
@@ -251,50 +255,22 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
 
 class Phi3VImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
-    data: torch.Tensor
-    """Shape: (batch_size, 1 + num_patches, num_channels, height, width)"""
-
-    image_sizes: torch.Tensor
-    """Shape: (batch_size, 2)"""
-
-
-def _get_phi3v_image_feature_size(
-    *,
-    input_height: int,
-    input_width: int,
-) -> int:
-    h, w = input_height, input_width
-
-    # https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L178
-    return (h // 336 * w // 336 + 1) * 144 + 1 + (h // 336 + 1) * 12
+    data: BatchedTensors
+    """
+    Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
 
+    Note that `num_patches` may be different for each batch.
+    """
 
-def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
-    multimodal_config = ctx.get_multimodal_config()
-
-    #TODO: change the logic for dummy data to support dynamic shape
-    _, _, dummy_height, dummy_width = multimodal_config.image_input_shape
-    image_feature_size = _get_phi3v_image_feature_size(
-        input_height=dummy_height,
-        input_width=dummy_width,
-    )
-
-    seq_data = dummy_seq_data_for_clip(
-        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
-        seq_len,
-        image_token_id=32044,
-        image_feature_size_override=image_feature_size,
-    )
-    mm_data = dummy_image_for_clip(
-        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
-        image_width_override=dummy_width,
-        image_height_override=dummy_height,
-    )
+    image_sizes: torch.Tensor
+    """
+    Shape: `(batch_size, 2)`
 
-    return seq_data, mm_data
+    This should be in `(height, width)` format.
+    """
 
 
-# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
+# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L57
 def _calc_padded_size(*, width: int, height: int, padding_unit: int = 336):
     target_height = int(np.ceil(height / padding_unit) * padding_unit)
     top_padding = int((target_height - height) / 2)
@@ -304,7 +280,7 @@ def _calc_padded_size(*, width: int, height: int, padding_unit: int = 336):
     return padded_width, padded_height
 
 
-# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
+# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L90
 def _calc_hd_transform_size(*, width: int, height: int, hd_num: int = 16):
     transposed = False
     if width < height:
@@ -329,27 +305,133 @@ def _calc_hd_transform_size(*, width: int, height: int, hd_num: int = 16):
     return padded_width, padded_height
 
 
-def _image_processor(ctx: InputContext,
-                     image: object) -> Dict[str, torch.Tensor]:
+# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L181
+def get_phi3v_image_feature_size(
+    hf_config: PretrainedConfig,
+    *,
+    input_height: int,
+    input_width: int,
+) -> int:
+    num_crops = getattr(hf_config, "num_crops", 16)
+    new_width, new_height = _calc_hd_transform_size(width=input_width,
+                                                    height=input_height,
+                                                    hd_num=num_crops)
 
-    if isinstance(image, Image.Image):
-        # Temporary patch before dynamic number of image tokens is supported
-        _, _, h, w = ctx.get_multimodal_config().image_input_shape
-        if (w, h) != _calc_hd_transform_size(width=image.width,
-                                             height=image.height):
-            logger.warning(
-                "Dynamic image shape is currently not supported. "
-                "Resizing input image to (%d, %d).", w, h)
+    return (new_height // 336 * new_width // 336 + 1) * 144 + 1 \
+        + (new_height // 336 + 1) * 12
 
-            image = image.resize((w, h))
 
-        return MULTIMODAL_REGISTRY._get_plugin("image") \
-                ._default_input_mapper(ctx, image)
-    raise TypeError(f"Invalid type for 'image': {type(image)}")
+def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
+    # Result in the max possible feature size (h:w = 16:1)
+    dummy_height, dummy_width = 8000, 50
+    image_feature_size = get_phi3v_image_feature_size(
+        ctx.get_hf_config(PretrainedConfig),
+        input_height=dummy_height,
+        input_width=dummy_width,
+    )
+
+    seq_data = dummy_seq_data_for_clip(
+        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
+        seq_len,
+        image_token_id=32044,
+        image_feature_size_override=image_feature_size,
+    )
+    mm_data = dummy_image_for_clip(
+        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
+        image_width_override=dummy_width,
+        image_height_override=dummy_height,
+    )
+
+    return seq_data, mm_data
+
 
+# Reserve this function to also handle placeholders for additional images
+# [ref: PR #5820]
+@lru_cache
+def _get_image_placeholder_token_ids(model_config: ModelConfig,
+                                     idx: int) -> List[int]:
+    assert idx > 0
 
-@MULTIMODAL_REGISTRY.register_image_input_mapper(_image_processor)
+    tokenizer = cached_get_tokenizer(model_config.tokenizer)
+
+    # We need to get the token for "<", not "▁<"
+    # https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/raw/main/tokenizer.json
+    a_token_id, = tokenizer.encode("a", add_special_tokens=False)
+    a_token_id_, *image_placeholder_token_ids = tokenizer.encode(
+        f"a<|image_{idx}|>", add_special_tokens=False)
+    assert a_token_id == a_token_id_
+
+    return image_placeholder_token_ids
+
+
+def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
+    multi_modal_data = llm_inputs.get("multi_modal_data")
+    if multi_modal_data is None or "image" not in multi_modal_data:
+        return llm_inputs
+
+    model_config = ctx.model_config
+    multimodal_config = ctx.get_multimodal_config()
+    hf_config = ctx.get_hf_config(PretrainedConfig)
+
+    image_data = multi_modal_data["image"]
+    if isinstance(image_data, Image.Image):
+        w, h = image_data.size
+        w, h = _calc_hd_transform_size(width=w, height=h)
+
+        image_feature_size = get_phi3v_image_feature_size(hf_config,
+                                                          input_width=w,
+                                                          input_height=h)
+    elif isinstance(image_data, torch.Tensor):
+        raise NotImplementedError("Embeddings input is not supported yet")
+    else:
+        raise TypeError(f"Invalid image type: {type(image_data)}")
+
+    prompt = llm_inputs.get("prompt")
+    if prompt is None:
+        new_prompt = None
+    else:
+        if prompt.count("<|image|>") > 0:
+            logger.warning("Please follow the prompt format that is "
+                           "documented on HuggingFace which does not involve "
+                           "repeating <|image|> tokens.")
+        elif len(re.findall(r"(<\|image_\d+\|>)+", prompt)) > 1:
+            logger.warning("Multiple image input is not supported yet, "
+                           "so any extra image tokens will be treated "
+                           "as plain text.")
+
+        new_prompt = prompt
+
+    prompt_token_ids = llm_inputs["prompt_token_ids"]
+    image_1_token_ids = _get_image_placeholder_token_ids(model_config, idx=1)
+
+    new_token_ids: List[int] = []
+    for i in range(len(prompt_token_ids) - len(image_1_token_ids) + 1):
+        if prompt_token_ids[i:i + len(image_1_token_ids)] == image_1_token_ids:
+            new_token_ids.append(multimodal_config.image_token_id)
+
+            # No need to further scan the list since we only replace once
+            new_token_ids.extend(prompt_token_ids[i + len(image_1_token_ids):])
+            break
+        else:
+            new_token_ids.append(prompt_token_ids[i])
+
+    # NOTE: Create a defensive copy of the original inputs
+    llm_inputs = LLMInputs(prompt_token_ids=new_token_ids,
+                           prompt=new_prompt,
+                           multi_modal_data=multi_modal_data)
+
+    return input_processor_for_clip(
+        model_config,
+        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
+        llm_inputs,
+        image_token_id=multimodal_config.image_token_id,
+        image_feature_size_override=image_feature_size,
+    )
+
+
+@MULTIMODAL_REGISTRY.register_image_input_mapper()
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi3v)
+@INPUT_REGISTRY.register_input_processor(input_processor_for_phi3v)
 class Phi3VForCausalLM(nn.Module, SupportsVision):
 
     def __init__(self,
@@ -363,6 +445,8 @@ class Phi3VForCausalLM(nn.Module, SupportsVision):
         self.vlm_config = vlm_config
 
         self.model = LlamaModel(config, cache_config, quant_config)
+
+        # TODO: Optionally initializes this for supporting embeddings.
         self.vision_embed_tokens = Phi3HDImageEmbedding(
             vlm_config, config, self.model.embed_tokens)
         self.lm_head = ParallelLMHead(config.vocab_size,
@@ -376,12 +460,20 @@ class Phi3VForCausalLM(nn.Module, SupportsVision):
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
 
-        if pixel_values is not None and image_sizes is not None:
-            return Phi3VImagePixelInputs(type="pixel_values",
-                                         data=pixel_values,
-                                         image_sizes=image_sizes)
+        if pixel_values is None:
+            return None
+
+        if not isinstance(pixel_values, (torch.Tensor, list)):
+            raise ValueError("Incorrect type of pixel values. "
+                             f"Got type: {type(pixel_values)}")
+
+        if not isinstance(image_sizes, torch.Tensor):
+            raise ValueError("Incorrect type of image sizes. "
+                             f"Got type: {type(image_sizes)}")
 
-        return None
+        return Phi3VImagePixelInputs(type="pixel_values",
+                                     data=pixel_values,
+                                     image_sizes=image_sizes)
 
     def forward(self,
                 input_ids: torch.Tensor,
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
new file mode 100644
index 000000000..ef2562b07
--- /dev/null
+++ b/vllm/model_executor/models/utils.py
@@ -0,0 +1,41 @@
+import torch
+
+from vllm.multimodal import BatchedTensors
+
+
+def merge_vision_embeddings(input_ids: torch.Tensor,
+                            inputs_embeds: torch.Tensor,
+                            vision_embeddings: BatchedTensors,
+                            image_token_id: int) -> torch.Tensor:
+    """
+    Merge `vision_embeddings` into `inputs_embeds` by overwriting the positions
+    in `inputs_embeds` corresponding to placeholder image tokens in `input_ids`.
+
+    Note:
+        This updates `inputs_embeds` in place.
+    """
+    mask = (input_ids == image_token_id)
+    num_expected_tokens = mask.sum()
+
+    if isinstance(vision_embeddings, torch.Tensor):
+        batch_size, batch_tokens, *_, embed_dim = vision_embeddings.shape
+        total_tokens = batch_size * batch_tokens
+        if num_expected_tokens != total_tokens:
+            expr = f"{batch_size} x {batch_tokens}"
+            raise ValueError(
+                f"Attempted to assign {expr} = {total_tokens} "
+                f"image tokens to {num_expected_tokens} placeholders")
+
+        inputs_embeds[mask] = vision_embeddings.view(total_tokens, embed_dim)
+    else:
+        size_per_batch = [t.shape[0] for t in vision_embeddings]
+        total_tokens = sum(size_per_batch)
+        if num_expected_tokens != total_tokens:
+            expr = ' + '.join(map(str, size_per_batch))
+            raise ValueError(
+                f"Attempted to assign {expr} = {total_tokens} "
+                f"image tokens to {num_expected_tokens} placeholders")
+
+        inputs_embeds[mask] = torch.cat(vision_embeddings)
+
+    return inputs_embeds
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 256eadd2d..b6d930659 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,4 +1,5 @@
-from .base import MultiModalDataDict, MultiModalPlugin
+from .base import (BatchedTensors, MultiModalDataDict, MultiModalInputs,
+                   MultiModalPlugin)
 from .registry import MultiModalRegistry
 
 MULTIMODAL_REGISTRY = MultiModalRegistry()
@@ -11,8 +12,10 @@ See also:
 """
 
 __all__ = [
+    "BatchedTensors",
+    "MultiModalDataDict",
+    "MultiModalInputs",
     "MultiModalPlugin",
     "MULTIMODAL_REGISTRY",
     "MultiModalRegistry",
-    "MultiModalDataDict",
 ]
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 558cd1175..e7b45649d 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -1,23 +1,90 @@
+import sys
 from abc import ABC, abstractmethod
-from typing import (TYPE_CHECKING, Any, Callable, Dict, Optional, Type,
-                    TypedDict, TypeVar, Union)
+from collections import UserDict, defaultdict
+from typing import (Any, Callable, Dict, List, Optional, Type, TypedDict,
+                    TypeVar, Union)
+
+import torch
+import torch.types
+from PIL import Image
+from torch import nn
 
 from vllm.config import ModelConfig
 from vllm.inputs import InputContext
 from vllm.logger import init_logger
 
-if TYPE_CHECKING:
-    import torch
-    from PIL import Image
-    from torch import nn
-
 logger = init_logger(__name__)
 
-N = TypeVar("N", bound=Type["nn.Module"])
+BatchedTensors = Union[torch.Tensor, List[torch.Tensor]]
+"""
+If each input tensor in the batch has the same size, this is a single batched
+tensor; otherwise, this is a list of tensors with one element per batch.
+"""
+
+if sys.version_info < (3, 9):
+    # UserDict cannot be subscripted
+    class _MultiModalInputsBase(UserDict):
+        pass
+else:
+
+    class _MultiModalInputsBase(UserDict[str, torch.Tensor]):
+        pass
+
+
+class MultiModalInputs(_MultiModalInputsBase):
+    """
+    A dictionary that represents the keyword arguments to
+    :meth:`~torch.nn.Module.forward`.
+    """
+
+    @staticmethod
+    def try_concat(
+        tensors: List[torch.Tensor],
+        *,
+        device: torch.types.Device,
+    ) -> BatchedTensors:
+        # Avoid initializing CUDA too early
+        import torch
+
+        unbatched_shape = tensors[0].shape[1:]
+
+        for tensor in tensors:
+            if tensor.shape[1:] != unbatched_shape:
+                return [
+                    tensor.squeeze(0).to(device=device) for tensor in tensors
+                ]
+
+        return torch.cat(tensors, dim=0).to(device=device)
+
+    @staticmethod
+    def batch(
+        inputs_list: List["MultiModalInputs"],
+        device: torch.types.Device,
+    ) -> Dict[str, BatchedTensors]:
+        """Batch multiple inputs together into a dictionary."""
+        if len(inputs_list) == 0:
+            return {}
+
+        keys = inputs_list[0].keys()
+
+        item_lists: Dict[str, List[torch.Tensor]] = defaultdict(list)
+
+        for inputs in inputs_list:
+            if inputs.keys() != keys:
+                msg = f"Inputs do not share the same keys ({keys})"
+                raise ValueError(msg)
+
+            for k, v in inputs.items():
+                item_lists[k].append(v)
+
+        return {
+            k: MultiModalInputs.try_concat(item_list, device=device)
+            for k, item_list in item_lists.items()
+        }
 
 
 class MultiModalDataBuiltins(TypedDict, total=False):
-    image: "Image.Image"
+    image: Image.Image
 
 
 MultiModalDataDict = Union[MultiModalDataBuiltins, Dict[str, Any]]
@@ -29,12 +96,13 @@ to the model by the corresponding mapper. By default, the mapper of
 the corresponding plugin with the same modality key is applied.
 """
 
-MultiModalInputMapper = Callable[[InputContext, object], Dict[str,
-                                                              "torch.Tensor"]]
+MultiModalInputMapper = Callable[[InputContext, object], MultiModalInputs]
 """Return a dictionary to be passed as keyword arguments to
 :meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers
 and processors in HuggingFace Transformers."""
 
+N = TypeVar("N", bound=Type[nn.Module])
+
 
 class MultiModalPlugin(ABC):
     """
@@ -48,8 +116,7 @@ class MultiModalPlugin(ABC):
     """
 
     def __init__(self) -> None:
-        self._input_mappers: Dict[Type["nn.Module"],
-                                  MultiModalInputMapper] = {}
+        self._input_mappers: Dict[Type[nn.Module], MultiModalInputMapper] = {}
 
     @abstractmethod
     def get_data_key(self) -> str:
@@ -60,7 +127,7 @@ class MultiModalPlugin(ABC):
 
     @abstractmethod
     def _default_input_mapper(self, ctx: InputContext,
-                              data: object) -> Dict[str, "torch.Tensor"]:
+                              data: object) -> MultiModalInputs:
         """Return a dictionary to be passed as keyword arguments to
         :meth:`~torch.nn.Module.forward`. This is similar in concept to
         tokenizers and processors in HuggingFace Transformers.
@@ -80,6 +147,7 @@ class MultiModalPlugin(ABC):
 
         See also:
             :ref:`input_processing_pipeline`
+            :ref:`adding_a_new_multimodal_model`
         """
 
         def wrapper(model_cls: N) -> N:
@@ -97,7 +165,7 @@ class MultiModalPlugin(ABC):
         return wrapper
 
     def map_input(self, model_config: ModelConfig,
-                  data: object) -> Dict[str, "torch.Tensor"]:
+                  data: object) -> MultiModalInputs:
         """
         Apply an input mapper to a data passed
         to the model, transforming the data into a dictionary of model inputs.
@@ -106,7 +174,8 @@ class MultiModalPlugin(ABC):
 
         The model is identified by ``model_config``.
 
-        TODO: Add guide [ref: PR #5276]
+        See also:
+            :ref:`adding_a_new_multimodal_model`
         """
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index a0b4206bf..dfef33121 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -1,19 +1,102 @@
 from functools import lru_cache
-from typing import Dict
+from typing import List, Optional, Tuple, TypeVar
 
 import torch
 from PIL import Image
+from transformers import PreTrainedTokenizerBase
 
 from vllm.config import ModelConfig
 from vllm.inputs.registry import InputContext
 from vllm.logger import init_logger
 from vllm.transformers_utils.image_processor import get_image_processor
+from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from .base import MultiModalPlugin
+from .base import MultiModalInputs, MultiModalPlugin
 
 logger = init_logger(__name__)
 
 cached_get_image_processor = lru_cache(get_image_processor)
+cached_get_tokenizer = lru_cache(get_tokenizer)
+
+# Utilities for image input processors
+_T = TypeVar("_T", str, int)
+
+
+def repeat_and_pad_token(
+    token: _T,
+    *,
+    repeat_count: int = 1,
+    pad_token_left: Optional[_T] = None,
+    pad_token_right: Optional[_T] = None,
+) -> List[_T]:
+    replacement = [token] * repeat_count
+    if pad_token_left is not None:
+        replacement = [pad_token_left] + replacement
+    if pad_token_right is not None:
+        replacement = replacement + [pad_token_right]
+
+    return replacement
+
+
+def repeat_and_pad_image_tokens(
+    tokenizer: PreTrainedTokenizerBase,
+    prompt: Optional[str],
+    prompt_token_ids: List[int],
+    *,
+    image_token_id: int,
+    repeat_count: int = 1,
+    pad_token_left: Optional[int] = None,
+    pad_token_right: Optional[int] = None,
+) -> Tuple[Optional[str], List[int]]:
+    if prompt is None:
+        new_prompt = None
+    else:
+        image_token_str = tokenizer.decode(image_token_id)
+        pad_token_str_left = (None if pad_token_left is None else
+                              tokenizer.decode(pad_token_left))
+        pad_token_str_right = (None if pad_token_right is None else
+                               tokenizer.decode(pad_token_right))
+        replacement_str = "".join(
+            repeat_and_pad_token(
+                image_token_str,
+                repeat_count=repeat_count,
+                pad_token_left=pad_token_str_left,
+                pad_token_right=pad_token_str_right,
+            ))
+
+        image_token_count = prompt.count(image_token_str)
+        # This is an arbitrary number to distinguish between the two cases
+        if image_token_count > 16:
+            logger.warning(
+                "Please follow the prompt format that is "
+                "documented on HuggingFace which does not involve "
+                "repeating %s tokens.", image_token_str)
+        elif image_token_count > 1:
+            logger.warning("Multiple image input is not supported yet, "
+                           "so any extra image tokens will be treated "
+                           "as plain text.")
+
+        # The image tokens are removed to be consistent with HuggingFace
+        new_prompt = prompt.replace(image_token_str, replacement_str, 1)
+
+    new_token_ids: List[int] = []
+    for i, token in enumerate(prompt_token_ids):
+        if token == image_token_id:
+            replacement_ids = repeat_and_pad_token(
+                image_token_id,
+                repeat_count=repeat_count,
+                pad_token_left=pad_token_left,
+                pad_token_right=pad_token_right,
+            )
+            new_token_ids.extend(replacement_ids)
+
+            # No need to further scan the list since we only replace once
+            new_token_ids.extend(prompt_token_ids[i + 1:])
+            break
+        else:
+            new_token_ids.append(token)
+
+    return new_prompt, new_token_ids
 
 
 class ImagePlugin(MultiModalPlugin):
@@ -27,7 +110,7 @@ class ImagePlugin(MultiModalPlugin):
             trust_remote_code=model_config.trust_remote_code)
 
     def _default_input_mapper(self, ctx: InputContext,
-                              data: object) -> Dict[str, torch.Tensor]:
+                              data: object) -> MultiModalInputs:
         model_config = ctx.model_config
         if isinstance(data, Image.Image):
             image_processor = self._get_hf_image_processor(model_config)
@@ -35,10 +118,15 @@ class ImagePlugin(MultiModalPlugin):
                 raise RuntimeError("No HuggingFace processor is available"
                                    "to process the image object")
             try:
-                return image_processor.preprocess(data, return_tensors="pt") \
-                    .to(model_config.dtype).data
+                batch_data = image_processor \
+                    .preprocess(data, return_tensors="pt") \
+                    .data
             except Exception:
                 logger.error("Failed to process image (%s)", data)
                 raise
 
-        raise TypeError(f"Invalid type for 'image': {type(data)}")
+            return MultiModalInputs(batch_data)
+        elif isinstance(data, torch.Tensor):
+            raise NotImplementedError("Embeddings input is not supported yet")
+
+        raise TypeError(f"Invalid image type: {type(data)}")
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index a09a80f89..f17b04149 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,18 +1,17 @@
 import functools
-from typing import Optional, Sequence, Type, TypeVar
+from typing import Dict, Optional, Sequence
 
-from torch import nn
+import torch
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
 
-from .base import MultiModalDataDict, MultiModalInputMapper, MultiModalPlugin
+from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalInputs,
+                   MultiModalPlugin)
 from .image import ImagePlugin
 
 logger = init_logger(__name__)
 
-N = TypeVar("N", bound=Type[nn.Module])
-
 
 class MultiModalRegistry:
     """
@@ -61,7 +60,7 @@ class MultiModalRegistry:
         return self.register_input_mapper("image", mapper)
 
     def _process_input(self, key: str, value: object,
-                       model_config: ModelConfig):
+                       model_config: ModelConfig) -> MultiModalInputs:
         plugin = self._plugins.get(key)
         if plugin:
             return plugin.map_input(model_config, value)
@@ -93,16 +92,28 @@ class MultiModalRegistry:
         """
         return self.register_input_mapper("image", mapper)
 
-    def map_input(self, model_config: ModelConfig, data: MultiModalDataDict):
+    def map_input(self, model_config: ModelConfig,
+                  data: MultiModalDataDict) -> MultiModalInputs:
         """
         Apply an input mapper to the data passed to the model.
         
         See :meth:`MultiModalPlugin.map_input` for more details.
         """
-        result_list = [
-            self._process_input(k, v, model_config) for k, v in data.items()
-        ]
-        return {k: v for d in result_list for k, v in d.items()}
+        merged_dict: Dict[str, torch.Tensor] = {}
+
+        for data_key, data_value in data.items():
+            input_dict = self._process_input(data_key, data_value,
+                                             model_config)
+
+            for input_key, input_tensor in input_dict.items():
+                if input_key in merged_dict:
+                    raise ValueError(f"The input mappers (keys={set(data)}) "
+                                     f"resulted in a conflicting keyword "
+                                     f"argument to `forward()`: {input_key}")
+
+                merged_dict[input_key] = input_tensor
+
+        return MultiModalInputs(merged_dict)
 
     def create_input_mapper(self, model_config: ModelConfig):
         """
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 321b51e5a..e55b8bbfd 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -4,11 +4,56 @@ from typing import Optional, Union
 from urllib.parse import urlparse
 
 import aiohttp
+import requests
 from PIL import Image
 
-from vllm.config import ModelConfig
 from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT
 from vllm.multimodal.base import MultiModalDataDict
+from vllm.version import __version__ as VLLM_VERSION
+
+
+def _validate_remote_url(url: str, *, name: str):
+    parsed_url = urlparse(url)
+    if parsed_url.scheme not in ["http", "https"]:
+        raise ValueError(f"Invalid '{name}': A valid '{name}' "
+                         "must have scheme 'http' or 'https'.")
+
+
+def _get_request_headers():
+    return {"User-Agent": f"vLLM/{VLLM_VERSION}"}
+
+
+def _load_image_from_bytes(b: bytes):
+    image = Image.open(BytesIO(b))
+    image.load()
+    return image
+
+
+def _load_image_from_data_url(image_url: str):
+    # Only split once and assume the second part is the base64 encoded image
+    _, image_base64 = image_url.split(",", 1)
+    return load_image_from_base64(image_base64)
+
+
+def fetch_image(image_url: str) -> Image.Image:
+    """Load PIL image from a url or base64 encoded openai GPT4V format"""
+    if image_url.startswith('http'):
+        _validate_remote_url(image_url, name="image_url")
+
+        headers = _get_request_headers()
+
+        with requests.get(url=image_url, headers=headers) as response:
+            response.raise_for_status()
+            image_raw = response.content
+        image = _load_image_from_bytes(image_raw)
+
+    elif image_url.startswith('data:image'):
+        image = _load_image_from_data_url(image_url)
+    else:
+        raise ValueError("Invalid 'image_url': A valid 'image_url' must start "
+                         "with either 'data:image' or 'http'.")
+
+    return image
 
 
 class ImageFetchAiohttp:
@@ -29,34 +74,31 @@ class ImageFetchAiohttp:
         """Load PIL image from a url or base64 encoded openai GPT4V format"""
 
         if image_url.startswith('http'):
-            parsed_url = urlparse(image_url)
-            if parsed_url.scheme not in ["http", "https"]:
-                raise ValueError("Invalid 'image_url': A valid 'image_url' "
-                                 "must have scheme 'http' or 'https'.")
-            # Avoid circular import
-            from vllm import __version__ as VLLM_VERSION
+            _validate_remote_url(image_url, name="image_url")
 
             client = cls.get_aiohttp_client()
-            headers = {"User-Agent": f"vLLM/{VLLM_VERSION}"}
+            headers = _get_request_headers()
 
             async with client.get(url=image_url, headers=headers) as response:
                 response.raise_for_status()
                 image_raw = await response.read()
-            image = Image.open(BytesIO(image_raw))
+            image = _load_image_from_bytes(image_raw)
 
-        # Only split once and assume the second part is the base64 encoded image
         elif image_url.startswith('data:image'):
-            image = load_image_from_base64(image_url.split(',', 1)[1])
-
+            image = _load_image_from_data_url(image_url)
         else:
             raise ValueError(
                 "Invalid 'image_url': A valid 'image_url' must start "
                 "with either 'data:image' or 'http'.")
 
-        image.load()
         return image
 
 
+async def async_get_and_parse_image(image_url: str) -> MultiModalDataDict:
+    image = await ImageFetchAiohttp.fetch_image(image_url)
+    return {"image": image}
+
+
 def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str:
     """Encode a pillow image to base64 format."""
 
@@ -69,26 +111,11 @@ def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str:
 
 def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
     """Load image from base64 format."""
-    return Image.open(BytesIO(base64.b64decode(image)))
+    return _load_image_from_bytes(base64.b64decode(image))
 
 
-# TODO(ywang96): move this to a model registry for preprocessing vision
-# language prompts based on the model type.
-def get_full_image_text_prompt(image_prompt: str, text_prompt: str,
-                               config: ModelConfig) -> str:
-    """Combine image and text prompts for vision language model depending on
-    the model architecture."""
-
-    if config.hf_config.model_type in ("llava", "llava_next"):
-        full_prompt = f"{image_prompt}\n{text_prompt}"
-    elif config.hf_config.model_type == 'phi3_v':
-        full_prompt = f"{image_prompt}<s>\n{text_prompt}"
-    else:
-        raise ValueError(
-            f"Unsupported model type: {config.hf_config.model_type}")
-    return full_prompt
-
-
-async def async_get_and_parse_image(image_url: str) -> MultiModalDataDict:
-    image = await ImageFetchAiohttp.fetch_image(image_url)
-    return {"image": image}
+def rescale_image_size(image: Image.Image, size_factor: float) -> Image.Image:
+    """Rescale the dimensions of an image by a constant factor."""
+    new_width = int(image.width * size_factor)
+    new_height = int(image.height * size_factor)
+    return image.resize((new_width, new_height))
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 7e08586cd..d200115aa 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -457,7 +457,7 @@ class SequenceGroup:
         return next(iter(self.seqs_dict.values())).prompt_token_ids
 
     @property
-    def multi_modal_data(self) -> Optional["MultiModalDataDict"]:
+    def multi_modal_data(self) -> "MultiModalDataDict":
         # All sequences in the group should have the same multi-modal data.
         # We use the multi-modal data of an arbitrary sequence.
         return next(iter(self.seqs_dict.values())).multi_modal_data
diff --git a/vllm/transformers_utils/image_processor.py b/vllm/transformers_utils/image_processor.py
index 354dcb526..c7d9eabd0 100644
--- a/vllm/transformers_utils/image_processor.py
+++ b/vllm/transformers_utils/image_processor.py
@@ -1,9 +1,4 @@
-from transformers import AutoImageProcessor
-from transformers.image_processing_utils import BaseImageProcessor
-
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
+from typing import cast
 
 
 def get_image_processor(
@@ -11,10 +6,15 @@ def get_image_processor(
     *args,
     trust_remote_code: bool = False,
     **kwargs,
-) -> BaseImageProcessor:
+):
     """Gets an image processor for the given model name via HuggingFace."""
+    # don't put this import at the top level
+    # it will call torch.cuda.device_count()
+    from transformers import AutoImageProcessor
+    from transformers.image_processing_utils import BaseImageProcessor
+
     try:
-        processor: BaseImageProcessor = AutoImageProcessor.from_pretrained(
+        processor = AutoImageProcessor.from_pretrained(
             processor_name,
             *args,
             trust_remote_code=trust_remote_code,
@@ -34,4 +34,4 @@ def get_image_processor(
         else:
             raise e
 
-    return processor
+    return cast(BaseImageProcessor, processor)
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index fd6c2b854..d8397ac22 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -1,6 +1,6 @@
-from collections import defaultdict
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
+from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple,
+                    Type, Union)
 
 import torch
 from torch import nn
@@ -12,7 +12,8 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
-from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
+                             MultiModalInputs)
 from vllm.sequence import (IntermediateTensors, SamplerOutput,
                            SequenceGroupMetadata)
 from vllm.utils import make_tensor_with_pad
@@ -40,7 +41,7 @@ class CPUModelInput(ModelRunnerInputBase):
     input_positions: Optional[torch.Tensor] = None
     attn_metadata: Optional["AttentionMetadata"] = None
     sampling_metadata: Optional["SamplingMetadata"] = None
-    multi_modal_kwargs: Optional[Dict[str, torch.Tensor]] = None
+    multi_modal_kwargs: Optional[Mapping[str, BatchedTensors]] = None
 
     def as_broadcastable_tensor_dict(
             self) -> Dict[str, Union[int, torch.Tensor]]:
@@ -132,15 +133,14 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
     def _prepare_prompt(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], Dict[
-            str, torch.Tensor]]:
+    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int],
+               Mapping[str, BatchedTensors]]:
         assert len(seq_group_metadata_list) > 0
         input_tokens: List[int] = []
         input_positions: List[int] = []
         slot_mapping: List[int] = []
         seq_lens: List[int] = []
-        multi_modal_kwargs_list: Dict[str,
-                                      List[torch.Tensor]] = defaultdict(list)
+        multi_modal_inputs_list: List[MultiModalInputs] = []
 
         for seq_group_metadata in seq_group_metadata_list:
             assert seq_group_metadata.is_prompt
@@ -162,10 +162,9 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
             input_positions.extend(list(range(computed_len, seq_len)))
 
             mm_data = seq_group_metadata.multi_modal_data
-            if mm_data is not None:
+            if mm_data:
                 mm_kwargs = self.multi_modal_input_mapper(mm_data)
-                for k, v in mm_kwargs.items():
-                    multi_modal_kwargs_list[k].append(v)
+                multi_modal_inputs_list.append(mm_kwargs)
 
             # Compute the slot mapping.
             block_table = seq_group_metadata.block_tables[seq_id]
@@ -189,11 +188,6 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
                 slot = block_number * self.block_size + block_offset
                 slot_mapping.append(slot)
 
-        multi_modal_kwargs = {
-            k: torch.cat(v, dim=0).to(self.device)
-            for k, v in multi_modal_kwargs_list.items()
-        }
-
         num_prompt_tokens = len(input_tokens)
 
         input_tokens = torch.tensor(input_tokens,
@@ -217,6 +211,10 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
             block_tables=torch.tensor([]),
             slot_mapping=slot_mapping,
         )
+
+        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
+                                                    device=self.device)
+
         return (input_tokens, input_positions, attn_metadata, seq_lens,
                 multi_modal_kwargs)
 
@@ -367,10 +365,8 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
             "positions": model_input.input_positions,
             "kv_caches": kv_caches,
             "attn_metadata": model_input.attn_metadata,
+            **(model_input.multi_modal_kwargs or {}),
         }
-        if (self.vision_language_config
-                and model_input.multi_modal_kwargs is not None):
-            execute_model_kwargs.update(model_input.multi_modal_kwargs)
 
         hidden_states = model_executable(**execute_model_kwargs)
 
diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py
index 0e1bb1bfe..d3a2643cb 100644
--- a/vllm/worker/embedding_model_runner.py
+++ b/vllm/worker/embedding_model_runner.py
@@ -92,10 +92,9 @@ class EmbeddingModelRunner(
             "positions": model_input.input_positions,
             "kv_caches": kv_caches,
             "attn_metadata": model_input.attn_metadata,
+            **(model_input.multi_modal_kwargs or {}),
         }
-        if self.vision_language_config:
-            multi_modal_kwargs = model_input.multi_modal_kwargs or {}
-            execute_model_kwargs.update({"image_input": multi_modal_kwargs})
+
         hidden_states = model_executable(**execute_model_kwargs)
 
         # Only perform pooling in the driver worker.
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index bd3028147..530c631d5 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -3,8 +3,8 @@ import gc
 import time
 import warnings
 from collections import defaultdict
-from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Type,
-                    TypeVar, Union)
+from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Set,
+                    Tuple, Type, TypeVar, Union)
 
 import numpy as np
 import torch
@@ -37,7 +37,8 @@ from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 from vllm.model_executor.models.interfaces import supports_lora
-from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
+                             MultiModalInputs)
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (IntermediateTensors, SamplerOutput,
                            SequenceGroupMetadata)
@@ -83,7 +84,7 @@ class ModelInputForGPU(ModelRunnerInputBase):
     lora_mapping: Optional["LoRAMapping"] = None
     lora_requests: Optional[Set[LoRARequest]] = None
     attn_metadata: Optional["AttentionMetadata"] = None
-    multi_modal_kwargs: Optional[Dict[str, torch.Tensor]] = None
+    multi_modal_kwargs: Optional[Mapping[str, BatchedTensors]] = None
     request_ids_to_seq_ids: Optional[Dict[str, List[int]]] = None
     finished_requests_ids: Optional[List[str]] = None
     virtual_engine: int = 0
@@ -356,8 +357,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         context_lens: List[int] = []
         query_lens: List[int] = []
         block_tables: List[List[int]] = []
-        multi_modal_kwargs_list: Dict[str,
-                                      List[torch.Tensor]] = defaultdict(list)
+        multi_modal_inputs_list: List[MultiModalInputs] = []
         request_ids_to_seq_ids: Dict[str, List[int]] = defaultdict(list)
         decode_only = True
         num_prefills = 0
@@ -528,8 +528,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                 if mm_data:
                     # Process multi-modal data
                     mm_kwargs = self.multi_modal_input_mapper(mm_data)
-                    for k, v in mm_kwargs.items():
-                        multi_modal_kwargs_list[k].append(v)
+                    multi_modal_inputs_list.append(mm_kwargs)
 
                 is_profile_run = _is_block_tables_empty(
                     seq_group_metadata.block_tables)
@@ -746,10 +745,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         else:
             lora_mapping = None
 
-        multi_modal_kwargs = {
-            k: torch.cat(v, dim=0).to(self.device)
-            for k, v in multi_modal_kwargs_list.items()
-        }
+        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
+                                                    device=self.device)
         request_ids_to_seq_ids = {
             seq_group_metadata.request_id:
             list(seq_group_metadata.seq_data.keys())
@@ -821,7 +818,11 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
 
             seq_data, dummy_multi_modal_data = INPUT_REGISTRY \
                 .dummy_data_for_profiling(model_config, seq_len)
-            assert len(seq_data.prompt_token_ids) == seq_len
+
+            # Having more tokens is over-conservative but otherwise fine
+            assert len(seq_data.prompt_token_ids) >= seq_len, (
+                f"Expected at least {seq_len} dummy tokens for profiling, "
+                f"but got: {len(seq_data.prompt_token_ids)}")
 
             seq = SequenceGroupMetadata(
                 request_id=str(group_id),
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index 8b96966be..423f44085 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple,
+                    Union)
 
 import torch
 from torch import nn
@@ -9,6 +10,8 @@ from vllm.config import (DeviceConfig, ModelConfig, ParallelConfig,
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader.neuron import get_neuron_model
+from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
+                             MultiModalInputs)
 from vllm.sequence import (IntermediateTensors, SamplerOutput,
                            SequenceGroupMetadata)
 from vllm.utils import is_pin_memory_available, make_tensor_with_pad
@@ -29,6 +32,7 @@ class ModelInputForNeuron(ModelRunnerInputBase):
     input_positions: Optional[torch.Tensor] = None
     input_block_ids: Optional[torch.Tensor] = None
     sampling_metadata: Optional["SamplingMetadata"] = None
+    multi_modal_kwargs: Optional[Mapping[str, BatchedTensors]] = None
 
     def as_broadcastable_tensor_dict(
             self) -> Dict[str, Union[int, torch.Tensor]]:
@@ -65,6 +69,10 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
         self.device = self.device_config.device
         self.pin_memory = is_pin_memory_available()
 
+        # Multi-modal data support
+        self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
+            .create_input_mapper(self.model_config)
+
         # Lazy initialization.
         self.model: nn.Module  # initialize after load_model.
 
@@ -76,13 +84,15 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
     def _prepare_prompt(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[int]]:
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[int], Mapping[
+            str, BatchedTensors]]:
         assert len(seq_group_metadata_list) > 0
         input_tokens: List[List[int]] = []
         input_positions: List[List[int]] = []
         input_block_ids: List[int] = []
 
         seq_lens: List[int] = []
+        multi_modal_inputs_list: List[MultiModalInputs] = []
         for seq_group_metadata in seq_group_metadata_list:
             assert seq_group_metadata.is_prompt
             seq_ids = list(seq_group_metadata.seq_data.keys())
@@ -102,6 +112,12 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
             assert len(block_table) == 1
             input_block_ids.append(block_table[0])
 
+            mm_data = seq_group_metadata.multi_modal_data
+            if mm_data:
+                # Process multi-modal data
+                mm_kwargs = self.multi_modal_input_mapper(mm_data)
+                multi_modal_inputs_list.append(mm_kwargs)
+
         max_seq_len = max(seq_lens)
         assert max_seq_len > 0
         input_tokens = make_tensor_with_pad(input_tokens,
@@ -118,7 +134,11 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
                                        dtype=torch.long,
                                        device=self.device)
 
-        return input_tokens, input_positions, input_block_ids, seq_lens
+        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
+                                                    device=self.device)
+
+        return (input_tokens, input_positions, input_block_ids, seq_lens,
+                multi_modal_kwargs)
 
     def _prepare_decode(
         self,
@@ -184,8 +204,9 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
         is_prompt = seq_group_metadata_list[0].is_prompt
         # Prepare input tensors.
         if is_prompt:
-            (input_tokens, input_positions, input_block_ids,
-             seq_lens) = self._prepare_prompt(seq_group_metadata_list)
+            (input_tokens, input_positions, input_block_ids, seq_lens,
+             multi_modal_kwargs
+             ) = self._prepare_prompt(seq_group_metadata_list)
         else:
             (input_tokens, input_positions,
              input_block_ids) = self._prepare_decode(seq_group_metadata_list)
@@ -203,7 +224,8 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
         return ModelInputForNeuron(input_tokens=input_tokens,
                                    input_positions=input_positions,
                                    input_block_ids=input_block_ids,
-                                   sampling_metadata=sampling_metadata)
+                                   sampling_metadata=sampling_metadata,
+                                   multi_modal_kwargs=multi_modal_kwargs)
 
     @torch.inference_mode()
     def execute_model(
@@ -221,6 +243,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
             input_ids=model_input.input_tokens,
             positions=model_input.input_positions,
             input_block_ids=model_input.input_block_ids,
+            **(model_input.multi_modal_kwargs or {}),
         )
 
         # Compute the logits.
diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py
index 336eaf814..f06404888 100644
--- a/vllm/worker/openvino_model_runner.py
+++ b/vllm/worker/openvino_model_runner.py
@@ -1,4 +1,4 @@
-from typing import List, NamedTuple, Optional, Tuple
+from typing import List, Mapping, NamedTuple, Optional, Tuple
 
 import openvino as ov
 import torch
@@ -12,6 +12,8 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader.openvino import get_model
+from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
+                             MultiModalInputs)
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 
 logger = init_logger(__name__)
@@ -23,7 +25,7 @@ class ModelInput(NamedTuple):
     attn_metadata: Optional[OpenVINOAttentionMetadata]
     seq_lens: List[int]
     query_lens: List[int]
-    multi_modal_input: Optional[torch.Tensor]
+    multi_modal_kwargs: Mapping[str, BatchedTensors]
 
     @classmethod
     def empty(cls, device):
@@ -32,7 +34,7 @@ class ModelInput(NamedTuple):
                           attn_metadata=None,
                           seq_lens=[],
                           query_lens=[],
-                          multi_modal_input=None)
+                          multi_modal_kwargs={})
 
 
 class OpenVINOModelRunner:
@@ -78,6 +80,10 @@ class OpenVINOModelRunner:
             self.block_size,
         )
 
+        # Multi-modal data support
+        self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
+            .create_input_mapper(self.model_config)
+
         # Lazy initialization.
         self.model: nn.Module  # Set after init_Model
 
@@ -108,6 +114,8 @@ class OpenVINOModelRunner:
         seq_lens: List[int] = []
         past_lens: List[int] = []
         query_lens: List[int] = []
+        multi_modal_inputs_list: List[MultiModalInputs] = []
+
         subsequence_begins: List[int] = []
         block_indices: List[int] = []
         block_indices_begins: List[int] = []
@@ -160,6 +168,11 @@ class OpenVINOModelRunner:
                                     and self.sliding_window is None
                                     and is_prompt)
 
+                mm_data = seq_group_metadata.multi_modal_data
+                if mm_data:
+                    mm_kwargs = self.multi_modal_input_mapper(mm_data)
+                    multi_modal_inputs_list.append(mm_kwargs)
+
                 block_table = seq_group_metadata.block_tables[seq_id]
                 # TODO(sang): Combine chunked prefill and prefix caching by
                 # only allowing multiple of block_size chunk size.
@@ -251,22 +264,24 @@ class OpenVINOModelRunner:
             block_indices_begins=block_indices_begins_tensor,
             max_context_len=max_context_len_tensor,
         )
+
+        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
+                                                    device=self.device)
+
         return ModelInput(
             input_tokens,
             input_positions,
             attn_metadata,
             seq_lens,
             query_lens,
-            None,
+            multi_modal_kwargs=multi_modal_kwargs,
         )
 
     def prepare_input_tensors(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
     ) -> Tuple[torch.Tensor, torch.Tensor, OpenVINOAttentionMetadata,
-               SamplingMetadata, Optional[torch.Tensor], ]:
-        multi_modal_input = None
-
+               SamplingMetadata, Mapping[str, BatchedTensors]]:
         # Prepare input tensors.
         (
             input_tokens,
@@ -274,7 +289,7 @@ class OpenVINOModelRunner:
             attn_metadata,
             seq_lens,
             query_lens,
-            multi_modal_input,
+            multi_modal_kwargs,
         ) = self._prepare_model_input(seq_group_metadata_list)
 
         sampling_metadata = SamplingMetadata.prepare(
@@ -290,7 +305,7 @@ class OpenVINOModelRunner:
             input_positions,
             attn_metadata,
             sampling_metadata,
-            multi_modal_input,
+            multi_modal_kwargs,
         )
 
     @torch.inference_mode()
@@ -304,7 +319,7 @@ class OpenVINOModelRunner:
             input_positions,
             attn_metadata,
             sampling_metadata,
-            multi_modal_input,
+            multi_modal_kwargs,
         ) = self.prepare_input_tensors(seq_group_metadata_list)
 
         model_executable = self.model
@@ -313,9 +328,8 @@ class OpenVINOModelRunner:
             "positions": input_positions,
             "kv_caches": kv_caches,
             "attn_metadata": attn_metadata,
+            **(multi_modal_kwargs or {}),
         }
-        if self.vision_language_config:
-            execute_model_kwargs.update({"image_input": multi_modal_input})
 
         hidden_states = model_executable(**execute_model_kwargs)
 
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index dd08536ef..4ea8e62cc 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -1,5 +1,5 @@
 import time
-from typing import List, Optional, Tuple
+from typing import List, Mapping, Optional, Tuple
 
 import numpy as np
 import torch
@@ -12,6 +12,8 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
+                             MultiModalInputs)
 from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
                            SamplerOutput, SequenceGroupMetadata,
                            SequenceOutput)
@@ -66,6 +68,10 @@ class TPUModelRunner:
             False,
         )
 
+        # Multi-modal data support
+        self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
+            .create_input_mapper(self.model_config)
+
     def load_model(self) -> None:
         self.device = self.device_config.device
 
@@ -193,12 +199,14 @@ class TPUModelRunner:
     def _prepare_prompt(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
-    ):
+    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, torch.Tensor,
+               Mapping[str, BatchedTensors]]:
         assert len(seq_group_metadata_list) > 0
         input_tokens: List[List[int]] = []
         input_positions: List[List[int]] = []
         prompt_lens: List[int] = []
         slot_mapping: List[List[int]] = []
+        multi_modal_inputs_list: List[MultiModalInputs] = []
 
         for seq_group_metadata in seq_group_metadata_list:
             assert seq_group_metadata.is_prompt
@@ -224,6 +232,11 @@ class TPUModelRunner:
                 slot = block_number * self.block_size + block_offset
                 slot_mapping[-1].append(slot)
 
+            mm_data = seq_group_metadata.multi_modal_data
+            if mm_data:
+                mm_kwargs = self.multi_modal_input_mapper(mm_data)
+                multi_modal_inputs_list.append(mm_kwargs)
+
         assert len(prompt_lens) > 0
         num_prefills = len(prompt_lens)
         num_prefill_tokens = sum(prompt_lens)
@@ -261,17 +274,24 @@ class TPUModelRunner:
             block_tables=None,
             context_lens=None,
         )
-        return input_tokens, input_positions, attn_metadata, prompt_lens
+
+        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
+                                                    device=self.device)
+
+        return (input_tokens, input_positions, attn_metadata, prompt_lens,
+                multi_modal_kwargs)
 
     def _prepare_decode(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
-    ):
+    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, torch.Tensor,
+               Mapping[str, BatchedTensors]]:
         assert len(seq_group_metadata_list) > 0
         input_tokens: List[List[int]] = []
         input_positions: List[List[int]] = []
         slot_mapping: List[List[int]] = []
         context_lens: List[int] = []
+        multi_modal_inputs_list: List[MultiModalInputs] = []
 
         batch_idx = 0
         for seq_group_metadata in seq_group_metadata_list:
@@ -297,6 +317,11 @@ class TPUModelRunner:
                 slot = block_number * self.block_size + block_offset
                 slot_mapping.append([slot])
 
+            mm_data = seq_group_metadata.multi_modal_data
+            if mm_data:
+                mm_kwargs = self.multi_modal_input_mapper(mm_data)
+                multi_modal_inputs_list.append(mm_kwargs)
+
         batch_size = _get_padded_batch_size(batch_idx)
         num_paddings = batch_size - batch_idx
         input_tokens = input_tokens + [[0]] * num_paddings
@@ -330,7 +355,12 @@ class TPUModelRunner:
             block_tables=block_tables,
             context_lens=context_lens,
         )
-        return input_tokens, input_positions, attn_metadata, input_lens
+
+        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
+                                                    device=self.device)
+
+        return (input_tokens, input_positions, attn_metadata, input_lens,
+                multi_modal_kwargs)
 
     def _prepare_sample(
         self,
@@ -483,6 +513,7 @@ class ModelWrapper(nn.Module):
         kv_caches: List[Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]],
         attn_metadata: AttentionMetadata,
         input_lens: torch.Tensor,
+        multi_modal_kwargs: Optional[Mapping[str, BatchedTensors]],
         t: torch.Tensor,
         p: torch.Tensor,
         num_samples: int,
@@ -496,6 +527,8 @@ class ModelWrapper(nn.Module):
                 memory profiling at initialization.
             attn_metadata: The Pallas attention metadata.
             input_lens: The actual input lengths of shape [batch_size].
+            multi_modal_kwargs: Keyword arguments from multi-modal data to
+                pass to the model.
             t: The sampling temperature of shape [batch_size].
             p: The top-p probability of shape [batch_size].
         """
@@ -540,6 +573,7 @@ class ModelWrapper(nn.Module):
             position_ids,
             kv_caches,
             attn_metadata,
+            **(multi_modal_kwargs or {}),
         )
         hidden_states = hidden_states.flatten(0, 1)
         logits = self.model.compute_logits(hidden_states, sampling_metadata)
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index e652f1b10..f4fc42328 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
+from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple,
+                    Type, Union)
 
 import torch
 import torch.nn as nn
@@ -9,10 +10,13 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, ParallelConfig, SchedulerConfig,
                          VisionLanguageConfig)
 from vllm.distributed import broadcast_tensor_dict
+from vllm.inputs import INPUT_REGISTRY
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model
+from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
+                             MultiModalInputs)
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import (IntermediateTensors, SamplerOutput, SequenceData,
+from vllm.sequence import (IntermediateTensors, SamplerOutput,
                            SequenceGroupMetadata)
 from vllm.utils import CudaMemoryProfiler, make_tensor_with_pad
 from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata
@@ -44,7 +48,7 @@ class ModelInputForXPU(ModelRunnerInputBase):
     input_positions: Optional[torch.Tensor] = None
     attn_metadata: Optional["AttentionMetadata"] = None
     sampling_metadata: Optional["SamplingMetadata"] = None
-    multi_modal_input: Optional[Dict[str, torch.Tensor]] = None
+    multi_modal_kwargs: Optional[Mapping[str, BatchedTensors]] = None
 
     def as_broadcastable_tensor_dict(
             self) -> Dict[str, Union[int, torch.Tensor]]:
@@ -116,6 +120,10 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
             self.block_size,
         )
 
+        # Multi-modal data support
+        self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
+            .create_input_mapper(self.model_config)
+
         # Lazy initialization.
         self.model: nn.Module  # Set after init_Model
 
@@ -156,12 +164,26 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
         # To exercise the worst scenario for GPU memory consumption,
         # the number of seqs (batch_size) is chosen to maximize the number
         # of images processed.
+        model_config = self.model_config
+        vlm_config = self.vision_language_config
+
+        if vlm_config:
+            max_num_seqs = min(
+                max_num_seqs,
+                int(max_num_batched_tokens / vlm_config.image_feature_size))
+
         for group_id in range(max_num_seqs):
             seq_len = (max_num_batched_tokens // max_num_seqs +
                        (group_id < max_num_batched_tokens % max_num_seqs))
 
-            seq_data = SequenceData([0] * seq_len)
-            dummy_multi_modal_data = None
+            seq_data, dummy_multi_modal_data = INPUT_REGISTRY \
+                .dummy_data_for_profiling(model_config, seq_len)
+
+            # Having more tokens is over-conservative but otherwise fine
+            assert len(seq_data.prompt_token_ids) >= seq_len, (
+                f"Expected at least {seq_len} dummy tokens for profiling, "
+                f"but got: {len(seq_data.prompt_token_ids)}")
+
             seq = SequenceGroupMetadata(
                 request_id=str(group_id),
                 is_prompt=True,
@@ -194,7 +216,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
             virtual_engine: int = 0,
             finished_requests_ids: Optional[List[str]] = None
     ) -> ModelInputForXPU:
-        multi_modal_input = None
+        multi_modal_kwargs = None
         if self.is_driver_worker:
             # NOTE: We assume that all sequences in the group are all prompts or
             # all decodes.
@@ -202,7 +224,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
             # Prepare input tensors.
             if is_prompt:
                 (input_tokens, input_positions, attn_metadata, seq_lens,
-                 multi_modal_input
+                 multi_modal_kwargs
                  ) = self._prepare_prompt(seq_group_metadata_list)
             else:
                 (input_tokens, input_positions,
@@ -223,6 +245,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
                 "input_positions": input_positions,
                 "selected_token_indices":
                 sampling_metadata.selected_token_indices,
+                "multi_modal_kwargs": multi_modal_kwargs,
             }
             metadata_dict.update(attn_metadata.asdict_zerocopy())
             broadcast_tensor_dict(metadata_dict, src=0)
@@ -232,6 +255,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
             input_positions = metadata_dict.pop("input_positions")
             selected_token_indices = metadata_dict.pop(
                 "selected_token_indices")
+            multi_modal_kwargs = metadata_dict.pop("multi_modal_kwargs")
             attn_metadata = self.attn_backend.make_metadata(**metadata_dict)
             sampling_metadata = SamplingMetadata(
                 seq_groups=None,
@@ -244,7 +268,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
                                 input_positions=input_positions,
                                 attn_metadata=attn_metadata,
                                 sampling_metadata=sampling_metadata,
-                                multi_modal_input=multi_modal_input)
+                                multi_modal_kwargs=multi_modal_kwargs)
 
     def _prepare_decode(
         self,
@@ -350,10 +374,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
             "positions": model_input.input_positions,
             "kv_caches": kv_caches,
             "attn_metadata": model_input.attn_metadata,
+            **(model_input.multi_modal_kwargs or {}),
         }
-        if self.vision_language_config:
-            execute_model_kwargs.update(
-                {"image_input": model_input.multi_modal_input})
 
         hidden_states = model_executable(**execute_model_kwargs)
 
@@ -376,13 +398,13 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
     ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int],
-               Optional[torch.Tensor]]:
+               Mapping[str, BatchedTensors]]:
         assert len(seq_group_metadata_list) > 0
         input_tokens: List[int] = []
         input_positions: List[int] = []
         slot_mapping: List[int] = []
         seq_lens: List[int] = []
-        multi_modal_input_list: List[torch.Tensor] = []
+        multi_modal_inputs_list: List[MultiModalInputs] = []
 
         for seq_group_metadata in seq_group_metadata_list:
             assert seq_group_metadata.is_prompt
@@ -403,9 +425,10 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
             # is always the first token in the sequence.
             input_positions.extend(list(range(computed_len, seq_len)))
 
-            if seq_group_metadata.multi_modal_data:
-                multi_modal_input_list.append(
-                    seq_group_metadata.multi_modal_data.data)
+            mm_data = seq_group_metadata.multi_modal_data
+            if mm_data:
+                mm_kwargs = self.multi_modal_input_mapper(mm_data)
+                multi_modal_inputs_list.append(mm_kwargs)
 
             if seq_group_metadata.block_tables is None:
                 # During memory profiling, the block tables are not initialized
@@ -435,15 +458,6 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
                 slot = block_number * self.block_size + block_offset
                 slot_mapping.append(slot)
 
-        if multi_modal_input_list:
-            assert self.vision_language_config, (
-                "Multi-modal inputs are only supported by "
-                "vision language models.")
-            multi_modal_input = torch.cat(multi_modal_input_list,
-                                          dim=0).to(self.device)
-        else:
-            multi_modal_input = None
-
         num_prompt_tokens = len(input_tokens)
 
         input_tokens = torch.tensor(input_tokens,
@@ -475,5 +489,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
             num_decode_tokens=0,
             block_tables=torch.tensor([], device=self.device, dtype=torch.int),
         )
+
+        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
+                                                    device=self.device)
+
         return (input_tokens, input_positions, attn_metadata, seq_lens,
-                multi_modal_input)
+                multi_modal_kwargs)
-- 
GitLab


From d18bab3587e6804aaa74f93fb1d55ab5766f75a3 Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Wed, 3 Jul 2024 13:31:25 +0900
Subject: [PATCH 246/376] [CI] Fix base url doesn't strip "/" (#6087)

---
 tests/entrypoints/openai/test_completion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index 4fe925495..81f5254d9 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -606,7 +606,7 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
     [MODEL_NAME],
 )
 async def test_tokenize(client: openai.AsyncOpenAI, model_name: str):
-    base_url = str(client.base_url)[:-3]
+    base_url = str(client.base_url)[:-3].strip("/")
     tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")
 
     for add_special in [False, True]:
-- 
GitLab


From d830656a9722bfc719426ce6bdd13b3d9d456304 Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Tue, 2 Jul 2024 23:09:40 -0700
Subject: [PATCH 247/376] [BugFix] Avoid unnecessary Ray import warnings
 (#6079)

---
 vllm/config.py                  |  9 +++++++--
 vllm/engine/async_llm_engine.py |  5 +++++
 vllm/executor/ray_utils.py      | 23 ++++++++++++++++-------
 3 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index de8e119c9..24f536c04 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -682,11 +682,13 @@ class ParallelConfig:
 
             from vllm.executor import ray_utils
             backend = "mp"
-            ray_found = ray_utils.ray is not None
+            ray_found = ray_utils.ray_is_available()
             if cuda_device_count_stateless() < self.world_size:
                 if not ray_found:
                     raise ValueError("Unable to load Ray which is "
-                                     "required for multi-node inference")
+                                     "required for multi-node inference, "
+                                     "please install Ray with `pip install "
+                                     "ray`.") from ray_utils.ray_import_err
                 backend = "ray"
             elif ray_found:
                 if self.placement_group:
@@ -718,6 +720,9 @@ class ParallelConfig:
             raise ValueError(
                 "Unrecognized distributed executor backend. Supported values "
                 "are 'ray' or 'mp'.")
+        if self.distributed_executor_backend == "ray":
+            from vllm.executor import ray_utils
+            ray_utils.assert_ray_available()
         if not self.disable_custom_all_reduce and self.world_size > 1:
             if is_hip():
                 self.disable_custom_all_reduce = True
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 13b4635cb..33e40c7b3 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -380,6 +380,11 @@ class AsyncLLMEngine:
         """Creates an async LLM engine from the engine arguments."""
         # Create the engine configs.
         engine_config = engine_args.create_engine_config()
+
+        if engine_args.engine_use_ray:
+            from vllm.executor import ray_utils
+            ray_utils.assert_ray_available()
+
         distributed_executor_backend = (
             engine_config.parallel_config.distributed_executor_backend)
 
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 495fddd17..242d6c136 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -42,14 +42,26 @@ try:
             output = pickle.dumps(output)
             return output
 
+    ray_import_err = None
+
 except ImportError as e:
-    logger.warning(
-        "Failed to import Ray with %r. For multi-node inference, "
-        "please install Ray with `pip install ray`.", e)
     ray = None  # type: ignore
+    ray_import_err = e
     RayWorkerWrapper = None  # type: ignore
 
 
+def ray_is_available() -> bool:
+    """Returns True if Ray is available."""
+    return ray is not None
+
+
+def assert_ray_available():
+    """Raise an exception if Ray is not available."""
+    if ray is None:
+        raise ValueError("Failed to import Ray, please install Ray with "
+                         "`pip install ray`.") from ray_import_err
+
+
 def initialize_ray_cluster(
     parallel_config: ParallelConfig,
     ray_address: Optional[str] = None,
@@ -65,10 +77,7 @@ def initialize_ray_cluster(
         ray_address: The address of the Ray cluster. If None, uses
             the default Ray cluster address.
     """
-    if ray is None:
-        raise ImportError(
-            "Ray is not installed. Please install Ray to use multi-node "
-            "serving.")
+    assert_ray_available()
 
     # Connect to a ray cluster.
     if is_hip() or is_xpu():
-- 
GitLab


From f666207161e5ecc2cd6ebab93c5a62cf44f30641 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 2 Jul 2024 23:37:29 -0700
Subject: [PATCH 248/376] [misc][distributed] error on invalid state (#6092)

---
 vllm/executor/multiproc_gpu_executor.py |  3 +++
 vllm/executor/ray_gpu_executor.py       |  5 ++++-
 vllm/utils.py                           | 22 ++++++++++++++++++++++
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py
index 5bfeac0cf..ae5062bd6 100644
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/multiproc_gpu_executor.py
@@ -10,6 +10,7 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.utils import (cuda_device_count_stateless,
+                        error_on_invalid_device_count_status,
                         get_distributed_init_method, get_open_port,
                         get_vllm_instance_id, make_async,
                         update_environment_variables)
@@ -39,6 +40,8 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
         assert world_size <= cuda_device_count_stateless(), (
             "please set tensor_parallel_size to less than max local gpu count")
 
+        error_on_invalid_device_count_status()
+
         # Multiprocessing-based executor does not support multi-node setting.
         # Since it only works for single node, we can use the loopback address
         # 127.0.0.1 for communication.
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index e742d11bb..e0b9441a9 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -11,7 +11,8 @@ from vllm.executor.distributed_gpu_executor import (  # yapf: disable
 from vllm.executor.ray_utils import RayWorkerWrapper, ray
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+from vllm.utils import (error_on_invalid_device_count_status,
+                        get_distributed_init_method, get_ip, get_open_port,
                         get_vllm_instance_id, make_async)
 
 if ray is not None:
@@ -175,6 +176,8 @@ class RayGPUExecutor(DistributedGPUExecutor):
         distributed_init_method = get_distributed_init_method(
             driver_ip, get_open_port())
 
+        error_on_invalid_device_count_status()
+
         # Initialize the actual workers inside worker wrapper.
         init_worker_all_kwargs = [
             self._get_worker_kwargs(
diff --git a/vllm/utils.py b/vllm/utils.py
index 763b0b91c..854decc29 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1,5 +1,6 @@
 import argparse
 import asyncio
+import contextlib
 import datetime
 import enum
 import gc
@@ -816,6 +817,27 @@ def cuda_device_count_stateless() -> int:
     return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
 
 
+def error_on_invalid_device_count_status():
+    cache_entries = 0
+    with contextlib.suppress(Exception):
+        # future pytorch will fix the issue, device_count will not be cached
+        # at that time, `.cache_info().currsize` will error out
+        cache_entries = torch.cuda.device_count.cache_info().currsize
+    if cache_entries != 0:
+        # the function is already called, and the result is cached
+        remembered = torch.cuda.device_count()
+        current = cuda_device_count_stateless()
+        if remembered > current:
+            raise RuntimeError(
+                "The number of CUDA devices has changed since the first "
+                "call to torch.cuda.device_count(). This is not allowed "
+                "and may result in undefined behavior. Please check out "
+                "https://github.com/vllm-project/vllm/issues/6056 to "
+                "find the first call to torch.cuda.device_count() "
+                "and defer it until the engine is up. Or you can set "
+                "CUDA_VISIBLE_DEVICES to the GPUs you want to use.")
+
+
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
 # all the related functions work on real physical device ids.
-- 
GitLab


From 3a86b54fb00bde01da2680bf3cfd989b6b21511c Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Tue, 2 Jul 2024 23:41:23 -0700
Subject: [PATCH 249/376] [VLM][Frontend] Proper Image Prompt Formatting from
 OpenAI API (#6091)

Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 vllm/entrypoints/openai/serving_chat.py | 37 ++++++++++++++++++-------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 57ad7bdd3..06c82d5e8 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -127,6 +127,16 @@ class OpenAIServingChat(OpenAIServing):
 
         return self.tokenizer.decode(image_token_id)
 
+    # TODO: Let user specify how to insert image tokens into prompt
+    # (similar to chat template)
+    def _get_full_image_text_prompt(self, image_token_str: str,
+                                    text_prompt: str) -> str:
+        """Combine image and text prompts for vision language model"""
+
+        # NOTE: For now we assume all model architectures use the same
+        # image + text prompt format. This may change in the future.
+        return f"{image_token_str}\n{text_prompt}"
+
     def _parse_chat_message_content_parts(
         self,
         role: str,
@@ -146,15 +156,6 @@ class OpenAIServingChat(OpenAIServing):
                         "Multiple 'image_url' input is currently not supported."
                     )
 
-                image_token_str = self.image_token_str
-                if image_token_str is not None:
-                    if any(image_token_str in text for text in texts):
-                        logger.warning(
-                            "Detected image token string in the text prompt. "
-                            "Skipping prompt formatting.")
-                    else:
-                        texts.append(image_token_str)
-
                 image_url = cast(ChatCompletionContentPartImageParam,
                                  part)["image_url"]
 
@@ -169,6 +170,20 @@ class OpenAIServingChat(OpenAIServing):
                 raise NotImplementedError(f"Unknown part type: {part_type}")
 
         text_prompt = "\n".join(texts)
+
+        if mm_futures:
+            image_token_str = self.image_token_str
+            if image_token_str is not None:
+                if image_token_str in text_prompt:
+                    logger.warning(
+                        "Detected image token string in the text prompt. "
+                        "Skipping prompt formatting.")
+                else:
+                    text_prompt = self._get_full_image_text_prompt(
+                        image_token_str=image_token_str,
+                        text_prompt=text_prompt,
+                    )
+
         messages = [ConversationMessage(role=role, content=text_prompt)]
 
         return ChatMessageParseResult(messages=messages, mm_futures=mm_futures)
@@ -238,7 +253,9 @@ class OpenAIServingChat(OpenAIServing):
         try:
             if len(mm_futures):
                 # since we support only single mm data currently
-                assert len(mm_futures) == 1
+                assert len(
+                    mm_futures
+                ) == 1, "Multiple 'image_url' input is currently not supported."
                 mm_data = await mm_futures[0]
         except Exception as e:
             logger.error("Error in loading multi-modal data: %s", e)
-- 
GitLab


From f1c78138aa28e58eeaafa4791788fe6ceddf1dd8 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Wed, 3 Jul 2024 00:13:56 -0700
Subject: [PATCH 250/376] [Doc] Fix Mock Import (#6094)

---
 docs/source/conf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index af1f22b23..7c5bb8f79 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -96,6 +96,7 @@ autodoc_mock_imports = [
     'triton',
     "tqdm",
     "tensorizer",
+    "pynvml",
 ]
 
 for mock_target in autodoc_mock_imports:
-- 
GitLab


From 7cd2ebb0251fd1fd0eec5c93dac674603a22eddd Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Wed, 3 Jul 2024 00:32:35 -0700
Subject: [PATCH 251/376] [Bugfix] Fix `compute_logits` in Jamba (#6093)

---
 vllm/model_executor/models/jamba.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index c485d3779..bf330c777 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -876,7 +876,7 @@ class JambaForCausalLM(nn.Module):
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
-- 
GitLab


From 47f0954af0a5aefd0db19875f6bdcbe933d055a9 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Wed, 3 Jul 2024 13:38:00 -0400
Subject: [PATCH 252/376] [Kernel] Expand FP8 support to Ampere GPUs using FP8
 Marlin (#5975)

---
 CMakeLists.txt                                |    1 +
 csrc/ops.h                                    |    5 +
 csrc/quantization/fp8/fp8_marlin.cu           | 1308 +++++++++++++++++
 csrc/torch_bindings.cpp                       |    4 +
 docs/source/quantization/fp8.rst              |    3 +-
 .../quantization/supported_hardware.rst       |    2 +-
 tests/kernels/test_marlin_gemm.py             |   88 +-
 tests/quantization/test_fp8.py                |   19 +-
 vllm/_custom_ops.py                           |    9 +
 .../model_executor/layers/quantization/fp8.py |  164 ++-
 .../layers/quantization/utils/marlin_utils.py |   28 +-
 11 files changed, 1587 insertions(+), 44 deletions(-)
 create mode 100644 csrc/quantization/fp8/fp8_marlin.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ede9192cd..31f7a9738 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -171,6 +171,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
+    "csrc/quantization/fp8/fp8_marlin.cu"
     "csrc/custom_all_reduce.cu"
     "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
     "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
diff --git a/csrc/ops.h b/csrc/ops.h
index 8a92afdc8..fb1099e4f 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -93,6 +93,11 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
                                  int64_t size_k, int64_t size_n,
                                  int64_t num_bits);
 
+torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
+                              torch::Tensor& b_scales, torch::Tensor& workspace,
+                              int64_t num_bits, int64_t size_m, int64_t size_n,
+                              int64_t size_k);
+
 bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
 
 void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
diff --git a/csrc/quantization/fp8/fp8_marlin.cu b/csrc/quantization/fp8/fp8_marlin.cu
new file mode 100644
index 000000000..51ff07198
--- /dev/null
+++ b/csrc/quantization/fp8/fp8_marlin.cu
@@ -0,0 +1,1308 @@
+/*
+ * Modified by Neural Magic
+ * Copyright (C) Marlin.2024 Elias Frantar
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Adapted from https://github.com/IST-DASLab/marlin
+ */
+
+#include "../gptq_marlin/gptq_marlin.cuh"
+#include "../gptq_marlin/gptq_marlin_dtypes.cuh"
+
+using namespace gptq_marlin;
+
+#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)               \
+  static_assert(std::is_same<scalar_t, half>::value ||          \
+                    std::is_same<scalar_t, nv_bfloat16>::value, \
+                "only float16 and bfloat16 is supported");
+
+template <typename T>
+inline std::string str(T x) {
+  return std::to_string(x);
+}
+
+namespace fp8_marlin {
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+
+template <typename scalar_t,          // compute dtype, half or nv_float16
+          const int num_bits,         // number of bits used for weights
+          const int threads,          // number of threads in a threadblock
+          const int thread_m_blocks,  // number of 16x16 blocks in the m
+                                      // dimension (batchsize) of the
+                                      // threadblock
+          const int thread_n_blocks,  // same for n dimension (output)
+          const int thread_k_blocks,  // same for k dimension (reduction)
+          const int stages,  // number of stages for the async global->shared
+                             // fetch pipeline
+          const int group_blocks = -1  // number of consecutive 16x16 blocks
+                                       // with a separate quantization scale
+          >
+__global__ void Marlin(
+    const int4* __restrict__ A,  // fp16 input matrix of shape mxk
+    const int4* __restrict__ B,  // 4bit quantized weight matrix of shape kxn
+    int4* __restrict__ C,        // fp16 output buffer of shape mxn
+    const int4* __restrict__ scales_ptr,  // fp16 quantization scales of shape
+                                          // (k/groupsize)xn
+    int num_groups,  // number of scale groups per output channel
+    int prob_m,      // batch dimension m
+    int prob_n,      // output dimension n
+    int prob_k,      // reduction dimension k
+    int* locks       // extra global storage for barrier synchronization
+) {}
+
+}  // namespace fp8_marlin
+
+torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
+                              torch::Tensor& b_scales, torch::Tensor& workspace,
+                              int64_t num_bits, int64_t size_m, int64_t size_n,
+                              int64_t size_k) {
+  TORCH_CHECK_NOT_IMPLEMENTED(false,
+                              "marlin_gemm(..) requires CUDA_ARCH >= 8.0");
+  return torch::empty({1, 1});
+}
+
+#else
+
+// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
+// output/accumulation.
+template <typename scalar_t>
+__device__ inline void mma(const typename ScalarType<scalar_t>::FragA& a_frag,
+                           const typename ScalarType<scalar_t>::FragB& frag_b,
+                           typename ScalarType<scalar_t>::FragC& frag_c) {
+  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
+  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
+  float* c = reinterpret_cast<float*>(&frag_c);
+  if constexpr (std::is_same<scalar_t, half>::value) {
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
+        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+          "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+  } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+          "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+  } else {
+    STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
+  }
+}
+
+// Instruction for loading a full 16x16 matrix fragment of operand A from shared
+// memory, directly in tensor core layout.
+template <typename scalar_t>
+__device__ inline void ldsm4(typename ScalarType<scalar_t>::FragA& frag_a,
+                             const void* smem_ptr) {
+  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n"
+               : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
+               : "r"(smem));
+}
+
+// Fast FP8ToFp16/FP8ToBf16: Efficiently dequantize 8bit fp8_e4m3 values to fp16
+// bf16 Reference:
+// - FP16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85
+// - BF16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L125-L175
+template <typename scalar_t>
+__device__ inline typename ScalarType<scalar_t>::FragB dequant_8bit(int q) {
+  STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
+}
+
+template <>
+__device__ inline typename ScalarType<half>::FragB dequant_8bit<half>(int q) {
+  // Constants for FP8 (E4M3) and FP16 formats
+  constexpr int FP8_EXPONENT = 4, FP8_MANTISSA = 3, FP16_EXPONENT = 5;
+  constexpr int RIGHT_SHIFT = FP16_EXPONENT - FP8_EXPONENT;
+
+  // Calculate MASK for extracting mantissa and exponent
+  constexpr int MASK1 = 0x80000000;
+  constexpr int MASK2 = MASK1 >> (FP8_EXPONENT + FP8_MANTISSA);
+  constexpr int MASK3 = MASK2 & 0x7fffffff;
+  constexpr int MASK = MASK3 | (MASK3 >> 16);
+  // Final MASK value: 0x7F007F00
+
+  // Extract and shift FP8 values to FP16 format
+  int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+  int Out2 = ((q << 8) & 0x80008000) | (((q << 8) & MASK) >> RIGHT_SHIFT);
+
+  // Construct and apply exponent bias
+  constexpr int BIAS_OFFSET =
+      (1 << (FP16_EXPONENT - 1)) - (1 << (FP8_EXPONENT - 1));
+  const half2 bias_reg = __float2half2_rn(float(1 << BIAS_OFFSET));
+
+  // Convert to half2 and apply bias
+  typename ScalarType<half>::FragB frag_b;
+  // Note: reverse indexing is intentional because weights are permuted
+  frag_b[1] = __hmul2(*reinterpret_cast<const half2*>(&Out1), bias_reg);
+  frag_b[0] = __hmul2(*reinterpret_cast<const half2*>(&Out2), bias_reg);
+  return frag_b;
+}
+
+template <>
+__device__ inline typename ScalarType<nv_bfloat16>::FragB
+dequant_8bit<nv_bfloat16>(int q) {
+  // Constants for FP8 (E4M3) and BF16 formats
+  constexpr int FP8_EXPONENT = 4, FP8_MANTISSA = 3, BF16_EXPONENT = 8;
+  constexpr int RIGHT_SHIFT = BF16_EXPONENT - FP8_EXPONENT;
+
+  // Calculate MASK for extracting mantissa and exponent
+  constexpr int MASK1 = 0x80000000;
+  constexpr int MASK2 = MASK1 >> (FP8_EXPONENT + FP8_MANTISSA);
+  constexpr int MASK3 = MASK2 & 0x7fffffff;
+  constexpr int MASK = MASK3 | (MASK3 >> 16);
+  // Final MASK value: 0x7F007F00
+
+  // Extract and shift FP8 values to BF16 format
+  int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+  int Out2 = ((q << 8) & 0x80008000) | (((q << 8) & MASK) >> RIGHT_SHIFT);
+
+  // Construct and apply exponent bias
+  constexpr int BIAS_OFFSET =
+      (1 << (BF16_EXPONENT - 1)) - (1 << (FP8_EXPONENT - 1));
+  // Add 127 (float exponent bias) to BIAS_OFFSET and shift to float exponent
+  // position
+  constexpr uint32_t BIAS = (BIAS_OFFSET + 127) << 23;
+  const nv_bfloat162 bias_reg =
+      __float2bfloat162_rn(*reinterpret_cast<const float*>(&BIAS));
+
+  // Convert to bfloat162 and apply bias
+  typename ScalarType<nv_bfloat16>::FragB frag_b;
+  // Note: reverse indexing is intentional because weights are permuted
+  frag_b[1] = __hmul2(*reinterpret_cast<const nv_bfloat162*>(&Out1), bias_reg);
+  frag_b[0] = __hmul2(*reinterpret_cast<const nv_bfloat162*>(&Out2), bias_reg);
+  return frag_b;
+}
+
+// Multiply dequantized values by the corresponding quantization scale; used
+// only for grouped quantization.
+template <typename scalar_t>
+__device__ inline void scale(typename ScalarType<scalar_t>::FragB& frag_b,
+                             typename ScalarType<scalar_t>::FragS& frag_s,
+                             int i) {
+  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
+  scalar_t2 s =
+      ScalarType<scalar_t>::num2num2(reinterpret_cast<scalar_t*>(&frag_s)[i]);
+  frag_b[0] = __hmul2(frag_b[0], s);
+  frag_b[1] = __hmul2(frag_b[1], s);
+}
+
+// Given 2 floats multiply by 2 scales (halves)
+template <typename scalar_t>
+__device__ inline void scale_float(float* c,
+                                   typename ScalarType<scalar_t>::FragS& s) {
+  scalar_t* s_ptr = reinterpret_cast<scalar_t*>(&s);
+  c[0] = __fmul_rn(c[0], ScalarType<scalar_t>::num2float(s_ptr[0]));
+  c[1] = __fmul_rn(c[1], ScalarType<scalar_t>::num2float(s_ptr[1]));
+}
+
+// Wait until barrier reaches `count`, then lock for current threadblock.
+__device__ inline void barrier_acquire(int* lock, int count) {
+  if (threadIdx.x == 0) {
+    int state = -1;
+    do
+      // Guarantee that subsequent writes by this threadblock will be visible
+      // globally.
+      asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n"
+                   : "=r"(state)
+                   : "l"(lock));
+    while (state != count);
+  }
+  __syncthreads();
+}
+
+// Release barrier and increment visitation count.
+__device__ inline void barrier_release(int* lock, bool reset = false) {
+  __syncthreads();
+  if (threadIdx.x == 0) {
+    if (reset) {
+      lock[0] = 0;
+      return;
+    }
+    int val = 1;
+    // Make sure that all writes since acquiring this barrier are visible
+    // globally, while releasing the barrier.
+    asm volatile("fence.acq_rel.gpu;\n");
+    asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n"
+                 :
+                 : "l"(lock), "r"(val));
+  }
+}
+
+template <typename scalar_t,          // compute dtype, half or nv_float16
+          const int num_bits,         // number of bits used for weights
+          const int threads,          // number of threads in a threadblock
+          const int thread_m_blocks,  // number of 16x16 blocks in the m
+                                      // dimension (batchsize) of the
+                                      // threadblock
+          const int thread_n_blocks,  // same for n dimension (output)
+          const int thread_k_blocks,  // same for k dimension (reduction)
+          const int stages,  // number of stages for the async global->shared
+                             // fetch pipeline
+          const int group_blocks = -1  // number of consecutive 16x16 blocks
+                                       // with a separate quantization scale
+          >
+__global__ void Marlin(
+    const int4* __restrict__ A,  // fp16 input matrix of shape mxk
+    const int4* __restrict__ B,  // 4bit quantized weight matrix of shape kxn
+    int4* __restrict__ C,        // fp16 output buffer of shape mxn
+    const int4* __restrict__ scales_ptr,  // fp16 quantization scales of shape
+                                          // (k/groupsize)xn
+    int num_groups,  // number of scale groups per output channel
+    int prob_m,      // batch dimension m
+    int prob_n,      // output dimension n
+    int prob_k,      // reduction dimension k
+    int* locks       // extra global storage for barrier synchronization
+) {
+  // Each threadblock processes one "stripe" of the B matrix with (roughly) the
+  // same size, which might involve multiple column "slices" (of width 16 *
+  // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM
+  // example:
+  //   0 1 3
+  //   0 2 3
+  //   1 2 4
+  // While this kind of partitioning makes things somewhat more complicated, it
+  // ensures good utilization of all SMs for many kinds of shape and GPU
+  // configurations, while requiring as few slow global cross-threadblock
+  // reductions as possible.
+  using Dtype = ScalarType<scalar_t>;
+  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
+  using FragA = typename ScalarType<scalar_t>::FragA;
+  using FragB = typename ScalarType<scalar_t>::FragB;
+  using FragC = typename ScalarType<scalar_t>::FragC;
+  using FragS = typename ScalarType<scalar_t>::FragS;
+
+  constexpr int pack_factor = 32 / num_bits;
+
+  // For larger GEMMs we run multiple batchsize 64 versions in parallel for a
+  // better partitioning with less reductions
+  int parallel = 1;
+  if (prob_m > 16 * thread_m_blocks) {
+    parallel = prob_m / (16 * thread_m_blocks);
+    prob_m = 16 * thread_m_blocks;
+  }
+
+  int k_tiles = prob_k / 16 / thread_k_blocks;
+  int n_tiles = prob_n / 16 / thread_n_blocks;
+  int iters = div_ceil(k_tiles * n_tiles * parallel, gridDim.x);
+
+  int slice_row = (iters * blockIdx.x) % k_tiles;
+  int slice_col_par = (iters * blockIdx.x) / k_tiles;
+  int slice_col = slice_col_par;
+  int slice_iters;  // number of threadblock tiles in the current slice
+  int slice_count =
+      0;          // total number of active threadblocks in the current slice
+  int slice_idx;  // index of threadblock in current slice; numbered bottom to
+                  // top
+
+  // We can easily implement parallel problem execution by just remapping
+  // indices and advancing global pointers
+  if (slice_col_par >= n_tiles) {
+    A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8;
+    C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8;
+    locks += (slice_col_par / n_tiles) * n_tiles;
+    slice_col = slice_col_par % n_tiles;
+  }
+
+  // Compute all information about the current slice which is required for
+  // synchronization.
+  auto init_slice = [&]() {
+    slice_iters =
+        iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row);
+    if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0;
+    if (slice_iters == 0) return;
+    if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row;
+    slice_count = 1;
+    slice_idx = 0;
+    int col_first = iters * div_ceil(k_tiles * slice_col_par, iters);
+    if (col_first <= k_tiles * (slice_col_par + 1)) {
+      int col_off = col_first - k_tiles * slice_col_par;
+      slice_count = div_ceil(k_tiles - col_off, iters);
+      if (col_off > 0) slice_count++;
+      int delta_first = iters * blockIdx.x - col_first;
+      if (delta_first < 0 || (col_off == 0 && delta_first == 0))
+        slice_idx = slice_count - 1;
+      else {
+        slice_idx = slice_count - 1 - delta_first / iters;
+        if (col_off > 0) slice_idx--;
+      }
+    }
+    if (slice_col == n_tiles) {
+      A += 16 * thread_m_blocks * prob_k / 8;
+      C += 16 * thread_m_blocks * prob_n / 8;
+      locks += n_tiles;
+      slice_col = 0;
+    }
+  };
+  init_slice();
+
+  // A sizes/strides
+
+  // stride of the A matrix in global memory
+  int a_gl_stride = prob_k / 8;
+  // stride of an A matrix tile in shared memory
+  constexpr int a_sh_stride = 16 * thread_k_blocks / 8;
+  // delta between subsequent A tiles in global memory
+  constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8;
+  // between subsequent accesses within a tile
+  int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o);
+  // between shared memory writes
+  constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o);
+  // between shared memory tile reads
+  constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4));
+  // within a shared memory tile
+  constexpr int a_sh_rd_delta_i = a_sh_stride * 16;
+  // overall size of a tile
+  constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks);
+  // number of shared write iterations for a tile
+  constexpr int a_sh_wr_iters = div_ceil(a_sh_stage, a_sh_wr_delta);
+
+  // B sizes/strides
+  int b_gl_stride = 16 * prob_n / (pack_factor * 4);
+  constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4;
+  constexpr int b_thread_vecs = num_bits == 4 ? 1 : 2;
+  constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs;
+
+  int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks;
+  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads);
+  constexpr int b_sh_wr_delta = threads * b_thread_vecs;
+  constexpr int b_sh_rd_delta = threads * b_thread_vecs;
+  constexpr int b_sh_stage = b_sh_stride * thread_k_blocks;
+  constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta;
+
+  // Scale sizes/strides without act_order
+  int s_gl_stride = prob_n / 8;
+  constexpr int s_sh_stride = 16 * thread_n_blocks / 8;
+
+  // Scale size/strides with act_order
+  constexpr int tb_k = 16 * thread_k_blocks;
+  constexpr int g_idx_stage = 0;
+  // constexpr int act_s_row_stride      = 1;
+  // int           act_s_col_stride      = act_s_row_stride * num_groups;
+  int act_s_col_stride = 1;
+  int act_s_col_warp_stride = act_s_col_stride * 8;
+  int tb_n_warps = thread_n_blocks / 4;
+  int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps;
+
+  // Global A read index of current thread.
+  int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
+                (threadIdx.x % a_gl_rd_delta_o);
+  a_gl_rd += a_gl_rd_delta_o * slice_row;
+  // Shared write index of current thread.
+  int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) +
+                (threadIdx.x % a_gl_rd_delta_o);
+  // Shared read index.
+  int a_sh_rd =
+      a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16;
+  a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4));
+
+  int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) +
+                (threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
+  b_gl_rd += b_sh_stride * slice_col;
+  b_gl_rd += b_gl_rd_delta_o * slice_row;
+  int b_sh_wr = threadIdx.x * b_thread_vecs;
+  int b_sh_rd = threadIdx.x * b_thread_vecs;
+
+  // For act_order
+  int slice_k_start = tb_k * slice_row;
+  int slice_k_start_shared_fetch = slice_k_start;
+  int slice_n_offset = act_s_col_tb_stride * slice_col;
+
+  // No act_order
+  int s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
+  int s_sh_wr = threadIdx.x;
+  bool s_sh_wr_pred = threadIdx.x < s_sh_stride;
+
+  // We scale a `half2` tile in row-major layout for column-wise quantization.
+  int s_sh_rd =
+      8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) % 4;
+
+  // Precompute which thread should not read memory in which iterations; this is
+  // needed if there are more threads than required for a certain tilesize or
+  // when the batchsize is not a multiple of 16.
+  bool a_sh_wr_pred[a_sh_wr_iters];
+  #pragma unroll
+  for (int i = 0; i < a_sh_wr_iters; i++)
+    a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m;
+
+  // To ensure that writing and reading A tiles to/from shared memory, the
+  // latter in fragment format, is fully bank conflict free, we need to use a
+  // rather fancy XOR-based layout. The key here is that neither reads nor
+  // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the
+  // same shared memory banks. Further, it seems (based on NSight-Compute) that
+  // each warp must also write a consecutive memory segment?
+  auto transform_a = [&](int i) {
+    int row = i / a_gl_rd_delta_o;
+    return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row;
+  };
+  // Since the computation of this remapping is non-trivial and, due to our main
+  // loop unrolls, all shared memory accesses are static, we simply precompute
+  // both transformed reads and writes.
+  int a_sh_wr_trans[a_sh_wr_iters];
+  #pragma unroll
+  for (int i = 0; i < a_sh_wr_iters; i++)
+    a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr);
+  int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks];
+  #pragma unroll
+  for (int i = 0; i < b_sh_wr_iters; i++) {
+  #pragma unroll
+    for (int j = 0; j < thread_m_blocks; j++)
+      a_sh_rd_trans[i][j] =
+          transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd);
+  }
+
+  // Since B-accesses have non-constant stride they have to be computed at
+  // runtime; we break dependencies between subsequent accesses with a tile by
+  // maintining multiple pointers (we have enough registers), a tiny
+  // optimization.
+  const int4* B_ptr[b_sh_wr_iters];
+  #pragma unroll
+  for (int i = 0; i < b_sh_wr_iters; i++)
+    B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd;
+
+  extern __shared__ int4 sh[];
+  // Shared memory storage for global fetch pipelines.
+  int4* sh_a = sh;
+  int4* sh_b = sh_a + (stages * a_sh_stage);
+  int4* sh_g_idx = sh_b + (stages * b_sh_stage);
+  int4* sh_s = sh_g_idx + (stages * g_idx_stage);
+
+  // Register storage for double buffer of shared memory reads.
+  FragA frag_a[2][thread_m_blocks];
+  I4 frag_b_quant[2][b_thread_vecs];
+  FragC frag_c[thread_m_blocks][4][2];
+  FragS frag_s[2][4];
+
+  // Zero accumulators.
+  auto zero_accums = [&]() {
+  #pragma unroll
+    for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++)
+      reinterpret_cast<float*>(frag_c)[i] = 0;
+  };
+
+  int sh_first_group_id = -1;
+  int sh_num_groups = -1;
+  constexpr int sh_max_num_groups = 32;
+
+  auto fetch_scales_to_shared = [&](bool is_async, int first_group_id,
+                                    int last_group_id) {
+    sh_first_group_id = first_group_id;
+    sh_num_groups = last_group_id - first_group_id + 1;
+
+    if (sh_num_groups < sh_max_num_groups) {
+      sh_num_groups = sh_max_num_groups;
+    }
+
+    if (sh_first_group_id + sh_num_groups > num_groups) {
+      sh_num_groups = num_groups - sh_first_group_id;
+    }
+
+    int row_offset = first_group_id * s_gl_stride;
+
+    if (is_async) {
+      for (int i = 0; i < sh_num_groups; i++) {
+        if (threadIdx.x < s_sh_stride) {
+          cp_async4_pred(&sh_s[(i * s_sh_stride) + threadIdx.x],
+                         &scales_ptr[row_offset + (i * s_gl_stride) +
+                                     slice_n_offset + threadIdx.x]);
+        }
+      }
+    } else {
+      for (int i = 0; i < sh_num_groups; i++) {
+        if (threadIdx.x < s_sh_stride) {
+          sh_s[(i * s_sh_stride) + threadIdx.x] =
+              scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset +
+                         threadIdx.x];
+        }
+      }
+    }
+  };
+  // Asynchronously fetch the next A, B and s tile from global to the next
+  // shared memory pipeline location.
+  auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) {
+    if (pred) {
+      int4* sh_a_stage = sh_a + a_sh_stage * pipe;
+  #pragma unroll
+      for (int i = 0; i < a_sh_wr_iters; i++) {
+        cp_async4_pred(
+            &sh_a_stage[a_sh_wr_trans[i]],
+            &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off],
+            a_sh_wr_pred[i]);
+      }
+      int4* sh_b_stage = sh_b + b_sh_stage * pipe;
+  #pragma unroll
+      for (int i = 0; i < b_sh_wr_iters; i++) {
+  #pragma unroll
+        for (int j = 0; j < b_thread_vecs; j++) {
+          cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j);
+        }
+
+        B_ptr[i] += b_gl_rd_delta_o;
+      }
+    }
+    // Insert a fence even when we are winding down the pipeline to ensure that
+    // waiting is also correct at this point.
+    cp_async_fence();
+  };
+
+  // Wait until the next thread tile has been loaded to shared memory.
+  auto wait_for_stage = [&]() {
+    // We only have `stages - 2` active fetches since we are double buffering
+    // and can only issue the next fetch when it is guaranteed that the previous
+    // shared memory load is fully complete (as it may otherwise be
+    // overwritten).
+    cp_async_wait<stages - 2>();
+    __syncthreads();
+  };
+
+  // Load the next sub-tile from the current location in the shared memory pipe
+  // into the current register buffer.
+  auto fetch_to_registers = [&](int k, int pipe) {
+    int4* sh_a_stage = sh_a + a_sh_stage * pipe;
+  #pragma unroll
+    for (int i = 0; i < thread_m_blocks; i++)
+      ldsm4<scalar_t>(frag_a[k % 2][i],
+                      &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]);
+    int4* sh_b_stage = sh_b + b_sh_stage * pipe;
+
+  #pragma unroll
+    for (int i = 0; i < b_thread_vecs; i++) {
+      frag_b_quant[k % 2][i] = *reinterpret_cast<I4*>(
+          &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]);
+    }
+  };
+
+  bool is_same_group[stages];
+  int same_group_id[stages];
+
+  auto init_same_group = [&](int pipe) {
+    is_same_group[pipe] = false;
+    same_group_id[pipe] = 0;
+    return;
+  };
+
+  // Execute the actual tensor core matmul of a sub-tile.
+  auto matmul = [&](int k) {
+  // We have the m dimension as the inner loop in order to encourage overlapping
+  // dequantization and matmul operations.
+  #pragma unroll
+    for (int j = 0; j < 4; j++) {
+      FragB frag_b0;
+      FragB frag_b1;
+
+      int* frag_b_quant_ptr = reinterpret_cast<int*>(frag_b_quant[k % 2]);
+      int b_quant_0 = frag_b_quant_ptr[j * 2 + 0];
+      int b_quant_1 = frag_b_quant_ptr[j * 2 + 1];
+
+      frag_b0 = dequant_8bit<scalar_t>(b_quant_0);
+      frag_b1 = dequant_8bit<scalar_t>(b_quant_1);
+
+  #pragma unroll
+      for (int i = 0; i < thread_m_blocks; i++) {
+        mma<scalar_t>(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]);
+        mma<scalar_t>(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]);
+      }
+    }
+  };
+
+  // Since we slice across the k dimension of a tile in order to increase the
+  // number of warps while keeping the n dimension of a tile reasonable, we have
+  // multiple warps that accumulate their partial sums of the same output
+  // location; which we have to reduce over in the end. We do in shared memory.
+  auto thread_block_reduce = [&]() {
+    constexpr int red_off = threads / b_sh_stride_threads / 2;
+    if (red_off >= 1) {
+      int red_idx = threadIdx.x / b_sh_stride_threads;
+      constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
+      constexpr int red_sh_delta = b_sh_stride_threads;
+      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
+                      (threadIdx.x % b_sh_stride_threads);
+
+      // Parallel logarithmic shared memory reduction. We make sure to avoid any
+      // unnecessary read or write iterations, e.g., for two warps we write only
+      // once by warp 1 and read only once by warp 0.
+
+  #pragma unroll
+      for (int m_block = 0; m_block < thread_m_blocks; m_block++) {
+  #pragma unroll
+        for (int i = red_off; i > 0; i /= 2) {
+          if (i <= red_idx && red_idx < 2 * i) {
+  #pragma unroll
+            for (int j = 0; j < 4 * 2; j++) {
+              int red_sh_wr =
+                  red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
+              if (i < red_off) {
+                float* c_rd =
+                    reinterpret_cast<float*>(&sh[red_sh_delta * j + red_sh_rd]);
+                float* c_wr = reinterpret_cast<float*>(&sh[red_sh_wr]);
+  #pragma unroll
+                for (int k = 0; k < 4; k++)
+                  reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + j][k] +=
+                      c_rd[k] + c_wr[k];
+              }
+              sh[red_sh_wr] =
+                  reinterpret_cast<int4*>(&frag_c)[4 * 2 * m_block + j];
+            }
+          }
+          __syncthreads();
+        }
+        if (red_idx == 0) {
+  #pragma unroll
+          for (int i = 0; i < 4 * 2; i++) {
+            float* c_rd =
+                reinterpret_cast<float*>(&sh[red_sh_delta * i + red_sh_rd]);
+  #pragma unroll
+            for (int j = 0; j < 4; j++)
+              reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + i][j] +=
+                  c_rd[j];
+          }
+        }
+        __syncthreads();
+      }
+    }
+  };
+
+  // Since multiple threadblocks may process parts of the same column slice, we
+  // finally have to globally reduce over the results. As the striped
+  // partitioning minimizes the number of such reductions and our outputs are
+  // usually rather small, we perform this reduction serially in L2 cache.
+  auto global_reduce = [&](bool first = false, bool last = false) {
+    // We are very careful here to reduce directly in the output buffer to
+    // maximize L2 cache utilization in this step. To do this, we write out
+    // results in FP16 (but still reduce with FP32 compute).
+    constexpr int active_threads = 32 * thread_n_blocks / 4;
+    if (threadIdx.x < active_threads) {
+      int c_gl_stride = prob_n / 8;
+      int c_gl_wr_delta_o = 8 * c_gl_stride;
+      int c_gl_wr_delta_i = 4 * (active_threads / 32);
+      int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) +
+                    4 * (threadIdx.x / 32) + threadIdx.x % 4;
+      c_gl_wr += (2 * thread_n_blocks) * slice_col;
+      constexpr int c_sh_wr_delta = active_threads;
+      int c_sh_wr = threadIdx.x;
+
+      int row = (threadIdx.x % 32) / 4;
+
+      if (!first) {
+  // Interestingly, doing direct global accesses here really seems to mess up
+  // the compiler and lead to slowdowns, hence we also use async-copies even
+  // though these fetches are not actually asynchronous.
+  #pragma unroll
+        for (int i = 0; i < thread_m_blocks * 4; i++) {
+          cp_async4_pred(
+              &sh[c_sh_wr + c_sh_wr_delta * i],
+              &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) +
+                 c_gl_wr_delta_i * (i % 2)],
+              i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m);
+        }
+        cp_async_fence();
+        cp_async_wait<0>();
+      }
+
+  #pragma unroll
+      for (int i = 0; i < thread_m_blocks * 4; i++) {
+        if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) {
+          if (!first) {
+            int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta];
+  #pragma unroll
+            for (int j = 0; j < 2 * 4; j++) {
+              reinterpret_cast<float*>(
+                  &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] +=
+                  Dtype::num2float(reinterpret_cast<scalar_t*>(&c_red)[j]);
+            }
+          }
+          if (!last) {
+            int4 c;
+  #pragma unroll
+            for (int j = 0; j < 2 * 4; j++) {
+              reinterpret_cast<scalar_t*>(&c)[j] =
+                  Dtype::float2num(reinterpret_cast<float*>(
+                      &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]);
+            }
+            C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] =
+                c;
+          }
+        }
+      }
+    }
+  };
+
+  // Write out the reduce final result in the correct layout. We only actually
+  // reshuffle matrix fragments in this step, the reduction above is performed
+  // in fragment layout.
+  auto write_result = [&]() {
+    int c_gl_stride = prob_n / 8;
+    constexpr int c_sh_stride = 2 * thread_n_blocks + 1;
+    int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks));
+    constexpr int c_sh_rd_delta =
+        c_sh_stride * (threads / (2 * thread_n_blocks));
+
+    int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) +
+                  (threadIdx.x % (2 * thread_n_blocks));
+    c_gl_wr += (2 * thread_n_blocks) * slice_col;
+    int c_sh_wr =
+        (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4;
+    c_sh_wr += 32 * (threadIdx.x / 32);
+    int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) +
+                  (threadIdx.x % (2 * thread_n_blocks));
+
+    int c_gl_wr_end = c_gl_stride * prob_m;
+
+    // We first reorder in shared memory to guarantee the most efficient final
+    // global write patterns
+    auto write = [&](int idx, float c0, float c1, FragS& s) {
+      scalar_t2 res =
+          Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1));
+
+      ((scalar_t2*)sh)[idx] = res;
+    };
+
+    if (threadIdx.x / 32 < thread_n_blocks / 4) {
+  #pragma unroll
+      for (int i = 0; i < thread_m_blocks; i++) {
+  #pragma unroll
+        for (int j = 0; j < 4; j++) {
+          int wr = c_sh_wr + 8 * j;
+          write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0],
+                frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]);
+          write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2],
+                frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]);
+          write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0],
+                frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]);
+          write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2],
+                frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]);
+        }
+        c_sh_wr += 16 * (4 * c_sh_stride);
+      }
+    }
+    __syncthreads();
+
+  #pragma unroll
+    for (int i = 0;
+         i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks));
+         i++) {
+      if (c_gl_wr < c_gl_wr_end) {
+        C[c_gl_wr] = sh[c_sh_rd];
+        c_gl_wr += c_gl_wr_delta;
+        c_sh_rd += c_sh_rd_delta;
+      }
+    }
+  };
+
+  // Start global fetch and register load pipelines.
+  auto start_pipes = [&]() {
+
+  #pragma unroll
+    for (int i = 0; i < stages - 1; i++) {
+      fetch_to_shared(i, i, i < slice_iters);
+    }
+
+    zero_accums();
+    wait_for_stage();
+    init_same_group(0);
+    fetch_to_registers(0, 0);
+    a_gl_rd += a_gl_rd_delta_o * (stages - 1);
+    slice_k_start_shared_fetch += tb_k * (stages - 1);
+  };
+  if (slice_iters) {
+    start_pipes();
+  }
+
+  // Main loop.
+  while (slice_iters) {
+    // We unroll over both the global fetch and the register load pipeline to
+    // ensure all shared memory accesses are static. Note that both pipelines
+    // have even length meaning that the next iteration will always start at
+    // index 0.
+
+  #pragma unroll
+    for (int pipe = 0; pipe < stages;) {
+  #pragma unroll
+      for (int k = 0; k < b_sh_wr_iters; k++) {
+        fetch_to_registers(k + 1, pipe % stages);
+        if (k == b_sh_wr_iters - 2) {
+          fetch_to_shared((pipe + stages - 1) % stages, pipe,
+                          slice_iters >= stages);
+          pipe++;
+          wait_for_stage();
+          init_same_group(pipe % stages);
+        }
+        matmul(k);
+      }
+      slice_iters--;
+      if (slice_iters == 0) {
+        break;
+      }
+    }
+
+    a_gl_rd += a_gl_rd_delta_o * stages;
+    slice_k_start += tb_k * stages;
+    slice_k_start_shared_fetch += tb_k * stages;
+
+    // Process results and, if necessary, proceed to the next column slice.
+    // While this pattern may not be the most readable, other ways of writing
+    // the loop seemed to noticeably worse performance after compilation.
+    if (slice_iters == 0) {
+      cp_async_wait<0>();
+      bool last = slice_idx == slice_count - 1;
+      // For per-column scales, we only fetch them here in the final step before
+      // write-out
+      if (s_sh_wr_pred) {
+        cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
+      }
+      cp_async_fence();
+
+      thread_block_reduce();
+
+      cp_async_wait<0>();
+      __syncthreads();
+      if (threadIdx.x / 32 < thread_n_blocks / 4) {
+        reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
+        reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
+      }
+
+      // For 8-bit channelwise, we apply the scale before the global reduction
+      // that converts the fp32 results to fp16 (so that we avoid possible
+      // overflow in fp16)
+      if (threadIdx.x / 32 < thread_n_blocks / 4) {
+  #pragma unroll
+        for (int i = 0; i < thread_m_blocks; i++) {
+  #pragma unroll
+          for (int j = 0; j < 4; j++) {
+            scale_float<scalar_t>(reinterpret_cast<float*>(&frag_c[i][j][0][0]),
+                                  frag_s[j / 2][2 * (j % 2) + 0]);
+            scale_float<scalar_t>(reinterpret_cast<float*>(&frag_c[i][j][0][2]),
+                                  frag_s[j / 2][2 * (j % 2) + 0]);
+
+            scale_float<scalar_t>(reinterpret_cast<float*>(&frag_c[i][j][1][0]),
+                                  frag_s[j / 2][2 * (j % 2) + 1]);
+            scale_float<scalar_t>(reinterpret_cast<float*>(&frag_c[i][j][1][2]),
+                                  frag_s[j / 2][2 * (j % 2) + 1]);
+          }
+        }
+      }
+
+      if (slice_count > 1) {  // only globally reduce if there is more than one
+                              // block in a slice
+        barrier_acquire(&locks[slice_col], slice_idx);
+        global_reduce(slice_idx == 0, last);
+        barrier_release(&locks[slice_col], last);
+      }
+      if (last)  // only the last block in a slice actually writes the result
+        write_result();
+      slice_row = 0;
+      slice_col_par++;
+      slice_col++;
+      init_slice();
+      if (slice_iters) {
+        a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
+                  (threadIdx.x % a_gl_rd_delta_o);
+  #pragma unroll
+        for (int i = 0; i < b_sh_wr_iters; i++)
+          B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles;
+        if (slice_col == 0) {
+  #pragma unroll
+          for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride;
+        }
+
+        // Update slice k/n for scales loading
+        s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
+
+        start_pipes();
+      }
+    }
+  }
+}
+
+  #define __CALL_IF(NUM_BITS, THREAD_M_BLOCKS, THREAD_N_BLOCKS,                \
+                    THREAD_K_BLOCKS, GROUP_BLOCKS, NUM_THREADS)                \
+    else if (num_bits == NUM_BITS && thread_m_blocks == THREAD_M_BLOCKS &&     \
+             thread_n_blocks == THREAD_N_BLOCKS &&                             \
+             thread_k_blocks == THREAD_K_BLOCKS &&                             \
+             group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) {     \
+      cudaFuncSetAttribute(                                                    \
+          Marlin<scalar_t, NUM_BITS, NUM_THREADS, THREAD_M_BLOCKS,             \
+                 THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, GROUP_BLOCKS>, \
+          cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);        \
+      Marlin<scalar_t, NUM_BITS, NUM_THREADS, THREAD_M_BLOCKS,                 \
+             THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, GROUP_BLOCKS>      \
+          <<<blocks, NUM_THREADS, max_shared_mem, stream>>>(                   \
+              A_ptr, B_ptr, C_ptr, s_ptr, num_groups, prob_m, prob_n, prob_k,  \
+              locks);                                                          \
+    }
+
+typedef struct {
+  int thread_k;
+  int thread_n;
+  int num_threads;
+} thread_config_t;
+
+typedef struct {
+  int max_m_blocks;
+  thread_config_t tb_cfg;
+} exec_config_t;
+
+thread_config_t small_batch_thread_configs[] = {
+    // Ordered by priority
+
+    // thread_k, thread_n, num_threads
+    {128, 128, 256},
+    {64, 128, 128},
+    {128, 64, 128},
+};
+
+thread_config_t large_batch_thread_configs[] = {
+    // Ordered by priority
+
+    // thread_k, thread_n, num_threads
+    {64, 256, 256},
+    {64, 128, 128},
+    {128, 64, 128},
+
+};
+
+int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
+                          int prob_n, int prob_k, int num_bits,
+                          int group_size) {
+  int tb_n = th_config.thread_n;
+
+  // Get max scale groups per thread-block
+  // Fixed for channelwise
+  int tb_groups = 1;
+  int tb_scales = tb_groups * tb_n * 2;
+
+  return tb_scales * pipe_stages;
+}
+
+bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks,
+                         int prob_m, int prob_n, int prob_k, int num_bits,
+                         int scales_cache_size, int max_shared_mem) {
+  int pack_factor = 32 / num_bits;
+
+  // Get B size
+  int tb_k = th_config.thread_k;
+  int tb_n = th_config.thread_n;
+
+  int b_size = (tb_k * tb_n / pack_factor) * 4;
+
+  // Get A size
+  int m_blocks = div_ceil(prob_m, 16);
+  int tb_max_m = 16;
+
+  while (true) {
+    if (m_blocks >= max_m_blocks) {
+      tb_max_m *= max_m_blocks;
+      break;
+    }
+
+    max_m_blocks--;
+    if (max_m_blocks == 0) {
+      TORCH_CHECK(false, "Unexpected m_blocks = ", m_blocks);
+    }
+  }
+
+  int a_size = (tb_max_m * tb_k) * 2;
+
+  float pipe_size = (a_size + b_size) * pipe_stages;
+
+  TORCH_CHECK(max_shared_mem / 2 > scales_cache_size);  // Sanity
+
+  return pipe_size < 0.95f * (max_shared_mem - scales_cache_size);
+}
+
+bool is_valid_config(thread_config_t const& th_config, int max_m_blocks,
+                     int prob_m, int prob_n, int prob_k, int num_bits,
+                     int group_size, int max_shared_mem) {
+  // Sanity
+  if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
+      th_config.num_threads == -1) {
+    return false;
+  }
+
+  // Verify K/N are divisible by thread K/N
+  if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) {
+    return false;
+  }
+
+  // Verify min for thread K/N
+  if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) {
+    return false;
+  }
+
+  // num_threads must be at least 128 (= 4 warps)
+  if (th_config.num_threads < 128) {
+    return false;
+  }
+
+  //  Determine cache for scales
+  int scales_cache_size = get_scales_cache_size(th_config, prob_m, prob_n,
+                                                prob_k, num_bits, group_size);
+
+  // Check that pipeline fits into cache
+  if (!is_valid_cache_size(th_config, max_m_blocks, prob_m, prob_n, prob_k,
+                           num_bits, scales_cache_size, max_shared_mem)) {
+    return false;
+  }
+
+  return true;
+}
+
+exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k,
+                                      int num_bits, int group_size,
+                                      int max_shared_mem) {
+  int max_m_blocks = 4;
+  while (max_m_blocks > 0) {
+    if (prob_m <= 16) {
+      for (auto th_config : small_batch_thread_configs) {
+        if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n, prob_k,
+                            num_bits, group_size, max_shared_mem)) {
+          return exec_config_t{max_m_blocks, th_config};
+        }
+      }
+    } else {
+      for (auto th_config : large_batch_thread_configs) {
+        if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n, prob_k,
+                            num_bits, group_size, max_shared_mem)) {
+          return exec_config_t{max_m_blocks, th_config};
+        }
+      }
+    }
+
+    max_m_blocks--;  // Process less M blocks per invocation to reduce cache
+                     // usage
+  }
+
+  return exec_config_t{0, {-1, -1, -1}};
+}
+
+  #define CALL_IF(NUM_BITS, N_BLOCKS, K_BLOCKS, NUM_THREADS)    \
+    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \
+    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \
+    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \
+    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS)
+
+template <typename scalar_t>
+void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s, int prob_m,
+                     int prob_n, int prob_k, void* workspace, int num_bits,
+                     int num_groups, int group_size, int dev,
+                     cudaStream_t stream, int thread_k, int thread_n, int sms,
+                     int max_par) {
+  TORCH_CHECK(num_bits == 8, "num_bits must be 8. Got = ", num_bits);
+  TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
+              ", ", prob_n, ", ", prob_k, "]");
+
+  int tot_m = prob_m;
+  int tot_m_blocks = div_ceil(tot_m, 16);
+  int pad = 16 * tot_m_blocks - tot_m;
+
+  if (sms == -1) {
+    cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev);
+  }
+
+  int max_shared_mem = 0;
+  cudaDeviceGetAttribute(&max_shared_mem,
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  TORCH_CHECK(max_shared_mem > 0);
+
+  // Set thread config
+  exec_config_t exec_cfg;
+  if (thread_k != -1 && thread_n != -1) {
+    // User-defined config
+    exec_cfg =
+        exec_config_t{4, thread_config_t{thread_k, thread_n, default_threads}};
+  } else {
+    // Auto config
+    exec_cfg = determine_thread_config(prob_m, prob_n, prob_k, num_bits,
+                                       group_size, max_shared_mem);
+  }
+
+  TORCH_CHECK(
+      exec_cfg.max_m_blocks > 0 &&
+          is_valid_config(exec_cfg.tb_cfg, exec_cfg.max_m_blocks, prob_m,
+                          prob_n, prob_k, num_bits, group_size, max_shared_mem),
+      "Invalid thread config: max_m_blocks = ", exec_cfg.max_m_blocks,
+      ", thread_k = ", exec_cfg.tb_cfg.thread_k,
+      ", thread_n = ", exec_cfg.tb_cfg.thread_n,
+      ", num_threads = ", exec_cfg.tb_cfg.num_threads, " for MKN = [", prob_m,
+      ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits,
+      ", group_size = ", group_size, ", max_shared_mem = ", max_shared_mem);
+
+  int num_threads = exec_cfg.tb_cfg.num_threads;
+  thread_k = exec_cfg.tb_cfg.thread_k;
+  thread_n = exec_cfg.tb_cfg.thread_n;
+
+  int thread_k_blocks = thread_k / 16;
+  int thread_n_blocks = thread_n / 16;
+
+  int blocks = sms;
+
+  TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n,
+              " is not divisible by thread_n = ", thread_n);
+  TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k,
+              " is not divisible by thread_k = ", thread_k);
+
+  int group_blocks = -1;
+
+  const int4* A_ptr = (const int4*)A;
+  const int4* B_ptr = (const int4*)B;
+  int4* C_ptr = (int4*)C;
+  const int4* s_ptr = (const int4*)s;
+
+  int* locks = (int*)workspace;
+
+  // Main loop
+  for (int i = 0; i < tot_m_blocks; i += exec_cfg.max_m_blocks) {
+    int thread_m_blocks = tot_m_blocks - i;
+    prob_m = tot_m - 16 * i;
+    int par = 1;
+    if (thread_m_blocks > exec_cfg.max_m_blocks) {
+      // Note that parallel > 1 currently only works for inputs without any
+      // padding
+      par = (16 * thread_m_blocks - pad) / (16 * exec_cfg.max_m_blocks);
+      if (par > max_par) par = max_par;
+      prob_m = (16 * exec_cfg.max_m_blocks) * par;
+      i += exec_cfg.max_m_blocks * (par - 1);
+      thread_m_blocks = exec_cfg.max_m_blocks;
+    }
+
+    // Define kernel configurations
+    if (false) {
+    }
+    CALL_IF(8, 32, 2, 256)
+    CALL_IF(8, 16, 4, 256)
+    CALL_IF(8, 8, 8, 256)
+    CALL_IF(8, 8, 4, 128)
+    CALL_IF(8, 4, 8, 128)
+    else {
+      TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " +
+                             str(prob_n) + ", " + str(prob_k) + "]" +
+                             ", num_groups = " + str(num_groups) +
+                             ", group_size = " + str(group_size) +
+                             ", thread_m_blocks = " + str(thread_m_blocks) +
+                             ", thread_n_blocks = " + str(thread_n_blocks) +
+                             ", thread_k_blocks = " + str(thread_k_blocks));
+    }
+
+    A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par;
+    C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par;
+  }
+}
+
+}  // namespace fp8_marlin
+
+torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
+                              torch::Tensor& b_scales, torch::Tensor& workspace,
+                              int64_t num_bits, int64_t size_m, int64_t size_n,
+                              int64_t size_k) {
+  // Verify num_bits
+  TORCH_CHECK(num_bits == 8, "num_bits must be 8. Got = ", num_bits);
+  int pack_factor = 32 / num_bits;
+
+  // Verify A
+  TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0),
+              ", size_m = ", size_m);
+  TORCH_CHECK(a.size(1) == size_k, "Shape mismatch: a.size(1) = ", a.size(1),
+              ", size_k = ", size_k);
+
+  // Verify B
+  TORCH_CHECK(size_k % gptq_marlin::tile_size == 0, "size_k = ", size_k,
+              " is not divisible by tile_size = ", gptq_marlin::tile_size);
+  TORCH_CHECK((size_k / gptq_marlin::tile_size) == b_q_weight.size(0),
+              "Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
+              ", size_k = ", size_k, ", tile_size = ", gptq_marlin::tile_size);
+  TORCH_CHECK(b_q_weight.size(1) % gptq_marlin::tile_size == 0,
+              "b_q_weight.size(1) = ", b_q_weight.size(1),
+              " is not divisible by tile_size = ", gptq_marlin::tile_size);
+  int actual_size_n =
+      (b_q_weight.size(1) / gptq_marlin::tile_size) * pack_factor;
+  TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n,
+              ", actual_size_n = ", actual_size_n);
+
+  // Verify device and strides
+  TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
+  TORCH_CHECK(a.is_contiguous(), "A is not contiguous");
+
+  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
+  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
+
+  TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
+  TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");
+
+  // Alloc buffers
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
+  auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
+  torch::Tensor c = torch::empty({size_m, size_n}, options);
+
+  // thread_k: `k` size of a thread_tile in `weights` (can usually be left as
+  // auto -1)
+  int thread_k = -1;
+  // thread_n: `n` size of a thread_tile in `weights` (can usually be left as
+  // auto -1)
+  int thread_n = -1;
+  // sms: number of SMs to use for the kernel (can usually be left as auto -1)
+  int sms = -1;
+
+  // Detect groupsize and act_order
+  int num_groups = -1;
+  int group_size = -1;
+
+  int b_rank = b_scales.sizes().size();
+  TORCH_CHECK(b_rank == 2, "b_scales rank = ", b_rank, " is not 2");
+  TORCH_CHECK(b_scales.size(1) == size_n, "b_scales dim 1 = ", b_scales.size(1),
+              " is not size_n = ", size_n);
+  // Channelwise only for FP8
+  TORCH_CHECK(b_scales.size(0) == 1)
+  num_groups = b_scales.size(0);
+
+  // Verify workspace size
+  TORCH_CHECK(
+      size_n % gptq_marlin::min_thread_n == 0, "size_n = ", size_n,
+      ", is not divisible by min_thread_n = ", gptq_marlin::min_thread_n);
+  int min_workspace_size =
+      (size_n / gptq_marlin::min_thread_n) * gptq_marlin::max_par;
+  TORCH_CHECK(workspace.numel() >= min_workspace_size,
+              "workspace.numel = ", workspace.numel(),
+              " is below min_workspace_size = ", min_workspace_size);
+
+  int dev = a.get_device();
+  if (a.scalar_type() == at::ScalarType::Half) {
+    fp8_marlin::marlin_mm_f16i4<half>(
+        a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(),
+        b_scales.data_ptr<at::Half>(), size_m, size_n, size_k,
+        workspace.data_ptr(), num_bits, num_groups, group_size, dev,
+        at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
+        gptq_marlin::max_par);
+  } else if (a.scalar_type() == at::ScalarType::BFloat16) {
+    fp8_marlin::marlin_mm_f16i4<nv_bfloat16>(
+        a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
+        c.data_ptr<at::BFloat16>(), b_scales.data_ptr<at::BFloat16>(), size_m,
+        size_n, size_k, workspace.data_ptr(), num_bits, num_groups, group_size,
+        dev, at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
+        gptq_marlin::max_par);
+  } else {
+    TORCH_CHECK(false, "fp8_marlin_gemm only supports bfloat16 and float16");
+  }
+
+  return c;
+}
+
+#endif
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index faf29e1f1..18331a674 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -137,6 +137,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def("gptq_marlin_repack", &gptq_marlin_repack);
   ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);
 
+  // fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
+  ops.def("fp8_marlin_gemm", &fp8_marlin_gemm);
+  ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm);
+
   // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
   // quantization.
   ops.def(
diff --git a/docs/source/quantization/fp8.rst b/docs/source/quantization/fp8.rst
index 09f313664..7f796fc3a 100644
--- a/docs/source/quantization/fp8.rst
+++ b/docs/source/quantization/fp8.rst
@@ -4,7 +4,8 @@ FP8
 ==================
 
 vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. 
-Currently, only Hopper and Ada Lovelace GPUs are supported. 
+Currently, only Hopper and Ada Lovelace GPUs are officially supported for W8A8. 
+Ampere GPUs are supported for W8A16 (weight-only FP8) utilizing Marlin kernels.
 Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy.
 
 Please visit the HF collection of `quantized FP8 checkpoints of popular LLMs ready to use with vLLM <https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127>`_.
diff --git a/docs/source/quantization/supported_hardware.rst b/docs/source/quantization/supported_hardware.rst
index df445e00a..ecc330d86 100644
--- a/docs/source/quantization/supported_hardware.rst
+++ b/docs/source/quantization/supported_hardware.rst
@@ -11,7 +11,7 @@ Implementation  Volta   Turing   Ampere   Ada    Hopper  AMD GPU  Intel GPU  x86
 AQLM            ✅      ✅       ✅       ✅     ✅      ❌        ❌         ❌       ❌              ❌
 AWQ             ❌      ✅       ✅       ✅     ✅      ❌        ❌         ❌       ❌              ❌
 DeepSpeedFP     ✅      ✅       ✅       ✅     ✅      ❌        ❌         ❌       ❌              ❌
-FP8             ❌      ❌       ❌       ✅     ✅      ❌        ❌         ❌       ❌              ❌
+FP8             ❌      ❌       ✅       ✅     ✅      ❌        ❌         ❌       ❌              ❌
 Marlin          ❌      ❌       ✅       ✅     ✅      ❌        ❌         ❌       ❌              ❌
 GPTQ            ✅      ✅       ✅       ✅     ✅      ❌        ❌         ❌       ❌              ❌
 SqueezeLLM      ✅      ✅       ✅       ✅     ✅      ❌        ❌         ❌       ❌              ❌
diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py
index 1f8d94bad..92ddcb209 100644
--- a/tests/kernels/test_marlin_gemm.py
+++ b/tests/kernels/test_marlin_gemm.py
@@ -8,7 +8,8 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.gptq_marlin import (
     GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
-    GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
+    GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS,
+    marlin_permute_scales)
 from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
     GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
     GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
@@ -16,7 +17,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_perms import (
     marlin_perm)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     MarlinWorkspace, compute_max_diff, is_marlin_supported, marlin_24_quantize,
-    marlin_quantize, marlin_weights)
+    marlin_quantize, marlin_weights, pack_fp8_to_int32)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     gptq_pack, quantize_weights, sort_weights)
 
@@ -38,9 +39,11 @@ MNK_FACTORS = [
     (67, 13, 11),
 ]
 
+DTYPES = [torch.float16, torch.bfloat16]
 
-def rand_data(shape):
-    return torch.randn(shape, dtype=torch.half, device="cuda")
+
+def rand_data(shape, dtype=torch.float16):
+    return torch.randn(shape, dtype=dtype, device="cuda")
 
 
 @pytest.mark.skipif(not is_marlin_supported(),
@@ -217,3 +220,80 @@ def test_marlin_24_gemm(k_chunk, n_chunk, num_bits, group_size, mnk_factors):
     print("max_diff = {}".format(max_diff))
 
     assert max_diff < 0.04
+
+
+@pytest.mark.skipif(not is_marlin_supported(),
+                    reason="Marlin is not supported on this GPU type.")
+@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
+@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
+@pytest.mark.parametrize("num_bits", [8])
+@pytest.mark.parametrize("group_size", [-1])
+@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
+@pytest.mark.parametrize("dtype", DTYPES)
+def test_fp8_marlin_gemm(
+    k_chunk,
+    n_chunk,
+    num_bits,
+    group_size,
+    mnk_factors,
+    dtype,
+):
+    m_factor, n_factor, k_factor = mnk_factors
+
+    size_m = m_factor
+    size_k = k_chunk * k_factor
+    size_n = n_chunk * n_factor
+
+    print(f"MNK = {size_m} {size_n} {size_k}")
+    print(f"groupsize = {group_size}")
+
+    a_input = rand_data((size_m, size_k), dtype=dtype)
+    b_weight = rand_data((size_k, size_n), dtype=dtype)
+
+    # WEIGHTS
+    fp8_weight, weight_scale = ops.scaled_fp8_quant(b_weight, scale=None)
+    # Repack weights to gptq format (packed int32 elements)
+    packed_gptq_qweight = pack_fp8_to_int32(fp8_weight)
+    # Repack weights to marlin format
+    marlin_qweight = ops.gptq_marlin_repack(
+        b_q_weight=packed_gptq_qweight,
+        perm=torch.empty(0, dtype=torch.int, device="cuda"),
+        size_k=size_k,
+        size_n=size_n,
+        num_bits=8,
+    )
+
+    # WEIGHT SCALES
+    # Currently Marlin doesn't support per-tensor scales, so we
+    # expand it to channelwise
+    scales = weight_scale.repeat(1, size_n).to(a_input.dtype).to("cuda")
+    # Permute scales
+    marlin_scales = marlin_permute_scales(
+        s=scales,
+        size_k=size_k,
+        size_n=size_n,
+        group_size=-1,
+        num_bits=8,
+    )
+
+    workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
+                                GPTQ_MARLIN_MAX_PARALLEL)
+
+    output = ops.fp8_marlin_gemm(
+        a=a_input,
+        b_q_weight=marlin_qweight,
+        b_scales=marlin_scales,
+        workspace=workspace.scratch,
+        num_bits=num_bits,
+        size_m=a_input.shape[0],
+        size_n=b_weight.shape[1],
+        size_k=a_input.shape[1],
+    )
+    output_ref = torch.matmul(a_input, b_weight)
+
+    torch.cuda.synchronize()
+
+    max_diff = compute_max_diff(output, output_ref)
+    print("max_diff = {}".format(max_diff))
+
+    assert max_diff < 0.04
diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index 4d76ae707..0ed91cbb4 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -6,7 +6,7 @@ import pytest
 import torch
 
 from tests.quantization.utils import is_quant_method_supported
-from vllm._custom_ops import scaled_fp8_quant
+from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
 
 MODELS = [
@@ -35,7 +35,16 @@ def test_load_fp16_model(vllm_runner) -> None:
         model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
         fc1 = model.model.decoder.layers[0].fc1
         assert isinstance(fc1.quant_method, Fp8LinearMethod)
-        assert fc1.weight.dtype == torch.float8_e4m3fn
+
+        capability = torch.cuda.get_device_capability()
+        capability = capability[0] * 10 + capability[1]
+        if capability >= 89:
+            # For GPUs with hardware support, we keep weights in fp8
+            assert fc1.weight.dtype == torch.float8_e4m3fn
+        else:
+            # For GPUs without hardware support, we pack the fp8 weights
+            # for weight-only quantization using Marlin kernels
+            assert fc1.weight.dtype == torch.int32
 
 
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
@@ -63,7 +72,7 @@ def test_scaled_fp8_quant(dtype) -> None:
     x = (torch.randn(size=(11, 11), device="cuda") * 13).to(dtype)
 
     # Dynamic quantization
-    ref_y, inv_scale = scaled_fp8_quant(x, None)
+    ref_y, inv_scale = ops.scaled_fp8_quant(x, None)
     ref_y = per_tensor_dequantize(ref_y, inv_scale, dtype)
 
     # Reference dynamic quantizaton
@@ -71,11 +80,11 @@ def test_scaled_fp8_quant(dtype) -> None:
     assert torch.allclose(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
 
     # Static quantization
-    y, _ = scaled_fp8_quant(x, inv_scale)
+    y, _ = ops.scaled_fp8_quant(x, inv_scale)
     assert torch.allclose(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
 
     # Padding
-    y, _ = scaled_fp8_quant(x, inv_scale, batch_dim_padding=17)
+    y, _ = ops.scaled_fp8_quant(x, inv_scale, batch_dim_padding=17)
     assert y.shape[0] == 17
     assert torch.allclose(
         ref_y,
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 479ea08e4..03308d040 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -271,6 +271,15 @@ def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
                                          size_k, is_k_full)
 
 
+# fp8 marlin
+def fp8_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
+                    b_scales: torch.Tensor, workspace: torch.Tensor,
+                    num_bits: int, size_m: int, size_n: int,
+                    size_k: int) -> torch.Tensor:
+    return torch.ops._C.fp8_marlin_gemm(a, b_q_weight, b_scales, workspace,
+                                        num_bits, size_m, size_n, size_k)
+
+
 # fp8
 def scaled_fp8_quant(
     input: torch.Tensor,
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 6d942fa61..544774891 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -11,6 +11,11 @@ from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
+from vllm.model_executor.layers.quantization.gptq_marlin import (
+    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, GPTQMarlinState,
+    marlin_permute_scales)
+from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+    pack_fp8_to_int32)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.utils import print_warning_once
@@ -54,7 +59,7 @@ class Fp8Config(QuantizationConfig):
 
     @classmethod
     def get_min_capability(cls) -> int:
-        return 89
+        return 80
 
     @classmethod
     def get_config_filenames(cls) -> List[str]:
@@ -106,6 +111,12 @@ class Fp8LinearMethod(LinearMethodBase):
         self.quant_config = quant_config
         self.cutlass_fp8_supported = cutlass_fp8_supported()
 
+        # For GPUs that lack FP8 hardware support, we can leverage the Marlin
+        # kernel for fast weight-only FP8 quantization
+        capability = current_platform.get_device_capability()
+        capability = capability[0] * 10 + capability[1]
+        self.use_marlin = capability < 89
+
     def _create_scale_param(
         self,
         scale_name: str,
@@ -139,6 +150,10 @@ class Fp8LinearMethod(LinearMethodBase):
         layer.process_after_load = True
         layer.logical_widths = output_partition_sizes
 
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        layer.orig_dtype = params_dtype
+
         # WEIGHT
         weight_dtype = (torch.float8_e4m3fn
                         if self.quant_config.is_checkpoint_fp8_serialized else
@@ -172,6 +187,65 @@ class Fp8LinearMethod(LinearMethodBase):
                     output_partition_sizes=output_partition_sizes,
                     **extra_weight_attrs)
 
+        # For GPUs without FP8 hardware support, we use Marlin for fast
+        # fused dequantization
+        if self.use_marlin:
+            layer.marlin_state = GPTQMarlinState.REPACK
+
+    def prepare_layer_for_marlin(self, layer: Module) -> None:
+        print_warning_once(
+            "Your GPU does not have native support for FP8 computation but "
+            "FP8 quantization is being used. Weight-only FP8 compression will "
+            "be used leveraging the Marlin kernel. This may degrade "
+            "performance for compute-heavy workloads.")
+
+        part_size_n = layer.output_size_per_partition
+        part_size_k = layer.input_size_per_partition
+
+        assert layer.marlin_state == GPTQMarlinState.REPACK
+        layer.marlin_state = GPTQMarlinState.READY
+
+        device = layer.weight.device
+
+        # WEIGHTS
+        # Repack weights to gptq format (packed int32 elements)
+        packed_gptq_qweight = pack_fp8_to_int32(layer.weight)
+
+        # Repack weights to marlin format
+        marlin_qweight = ops.gptq_marlin_repack(
+            b_q_weight=packed_gptq_qweight,
+            perm=torch.empty(0, dtype=torch.int, device=device),
+            size_k=part_size_k,
+            size_n=part_size_n,
+            num_bits=8,
+        )
+        layer.weight = Parameter(marlin_qweight, requires_grad=False)
+
+        # WEIGHT SCALES
+        # Currently Marlin doesn't support per-tensor scales, so we
+        # expand it to channelwise
+        scales = layer.weight_scale.repeat(1, part_size_n).to(
+            layer.orig_dtype).to(device)
+        # Permute scales
+        marlin_scales = marlin_permute_scales(
+            s=scales,
+            size_k=part_size_k,
+            size_n=part_size_n,
+            group_size=-1,
+            num_bits=8,
+        )
+        layer.weight_scale = Parameter(marlin_scales, requires_grad=False)
+
+        # Allocate marlin workspace
+        max_workspace_size = (
+            part_size_n // GPTQ_MARLIN_MIN_THREAD_N) * GPTQ_MARLIN_MAX_PARALLEL
+        workspace = torch.zeros(max_workspace_size,
+                                dtype=torch.int,
+                                device=device,
+                                requires_grad=False)
+
+        layer.workspace = workspace
+
     def process_weights_after_loading(self, layer: Module) -> None:
         if (not hasattr(layer, "process_after_load")
                 or not layer.process_after_load):
@@ -185,6 +259,8 @@ class Fp8LinearMethod(LinearMethodBase):
             layer.weight_scale = Parameter(weight_scale, requires_grad=False)
             layer.logical_widths = None
             layer.input_scale = None
+            if self.use_marlin:
+                self.prepare_layer_for_marlin(layer)
             return
 
         # If checkpoint is fp8, requantize the separately quantized logical
@@ -233,44 +309,72 @@ class Fp8LinearMethod(LinearMethodBase):
                 raise ValueError(
                     f"Unknown scheme {self.quant_config.activation_scheme}")
 
+            if self.use_marlin:
+                self.prepare_layer_for_marlin(layer)
+
     def apply(self,
               layer: torch.nn.Module,
               x: torch.Tensor,
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
 
-        # ops.scaled_fp8_quant supports both dynamic and static quant.
-        #   If dynamic, layer.input_scale is None and x_scale computed from x.
-        #   If static, layer.input_scale is scalar and x_scale is input_scale.
+        if self.use_marlin:
+            # For GPUs that lack FP8 hardware support, we can leverage the
+            # Marlin kernel for fast weight-only FP8 quantization
+
+            reshaped_x = x.reshape(-1, x.shape[-1])
+            out_shape = x.shape[:-1] + (layer.output_size_per_partition, )
+
+            output = ops.fp8_marlin_gemm(
+                a=reshaped_x,
+                b_q_weight=layer.weight,
+                b_scales=layer.weight_scale,
+                workspace=layer.workspace,
+                num_bits=8,
+                size_m=reshaped_x.shape[0],
+                size_n=layer.output_size_per_partition,
+                size_k=layer.input_size_per_partition,
+            )
 
-        if bias is None and self.cutlass_fp8_supported:
-            qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale)
+            if bias is not None:
+                output.add_(bias)  # In-place add
 
-            # Fused GEMM_DQ
-            output = ops.cutlass_scaled_mm(
-                qinput,
-                layer.weight,
-                out_dtype=x.dtype,
-                scale_a=x_scale,
-                scale_b=layer.weight_scale,
-            )
+            return output.reshape(out_shape)
 
         else:
-            qinput, x_scale = ops.scaled_fp8_quant(x,
-                                                   layer.input_scale,
-                                                   batch_dim_padding=17)
-
-            # Fused GEMM_DQ -- note we padded the input above because
-            # torch._scaled_mm is more performant for matrices with
-            # batch dimension > 16. Note that this could change
-            # in the future.
-            output, _ = torch._scaled_mm(
-                qinput,
-                layer.weight,
-                out_dtype=x.dtype,
-                scale_a=x_scale,
-                scale_b=layer.weight_scale,
-                bias=bias,
-            )
+
+            # ops.scaled_fp8_quant supports both dynamic and static quant.
+            # If dynamic, layer.input_scale is None and x_scale computed from x
+            # If static, layer.input_scale is scalar and x_scale is input_scale
+
+            if bias is None and self.cutlass_fp8_supported:
+                qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale)
+
+                # Fused GEMM_DQ
+                output = ops.cutlass_scaled_mm(
+                    qinput,
+                    layer.weight,
+                    out_dtype=x.dtype,
+                    scale_a=x_scale,
+                    scale_b=layer.weight_scale,
+                )
+
+            else:
+                qinput, x_scale = ops.scaled_fp8_quant(x,
+                                                       layer.input_scale,
+                                                       batch_dim_padding=17)
+
+                # Fused GEMM_DQ -- note we padded the input above because
+                # torch._scaled_mm is more performant for matrices with
+                # batch dimension > 16. Note that this could change
+                # in the future.
+                output, _ = torch._scaled_mm(
+                    qinput,
+                    layer.weight,
+                    out_dtype=x.dtype,
+                    scale_a=x_scale,
+                    scale_b=layer.weight_scale,
+                    bias=bias,
+                )
 
         return torch.narrow(output, 0, 0, x.shape[0])
 
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index ecd29a80e..66ce19592 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -14,13 +14,12 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
     get_pack_factor, quantize_weights, sort_weights)
 from vllm.platforms import current_platform
 
-__cuda_arch = current_platform.get_device_capability()
-
 MARLIN_TILE = 16
 
 
 def is_marlin_supported():
-    return __cuda_arch[0] >= 8
+    capability = current_platform.get_device_capability()
+    return capability[0] >= 8
 
 
 def marlin_permute_weights(q_w, size_k, size_n, perm, tile=MARLIN_TILE):
@@ -223,3 +222,26 @@ class MarlinWorkspace:
         self.scratch = torch.zeros(max_workspace_size,
                                    dtype=torch.int,
                                    device="cuda")
+
+
+def pack_fp8_to_int32(fp8_tensor: torch.Tensor) -> torch.Tensor:
+    """
+    Repack FP8 weights to gptq format (packed int32 elements)
+    """
+    assert fp8_tensor.dtype == torch.float8_e4m3fn
+    assert fp8_tensor.shape[0] % 4 == 0
+
+    # Reshape to prepare for packing
+    reshaped = fp8_tensor.reshape(-1, 4, *fp8_tensor.shape[1:])
+
+    # Convert fp8 to uint8 (byte) representation
+    byte_tensor = reshaped.view(torch.uint8)
+
+    # Pack 4 uint8 values into one int32
+    packed = (byte_tensor[:, 0].to(torch.int32) |
+              (byte_tensor[:, 1].to(torch.int32) << 8) |
+              (byte_tensor[:, 2].to(torch.int32) << 16) |
+              (byte_tensor[:, 3].to(torch.int32) << 24))
+
+    return packed.view(fp8_tensor.shape[0] // 4,
+                       *fp8_tensor.shape[1:]).contiguous()
-- 
GitLab


From 3c6325f0fcfc46e573a107c9435abba6b6a617e8 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 3 Jul 2024 14:41:32 -0700
Subject: [PATCH 253/376] [core][distributed] custom allreduce when pp size > 1
 (#6117)

---
 vllm/config.py                     | 16 +++++-----------
 vllm/distributed/parallel_state.py | 16 ++++++++++++----
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 24f536c04..1eb5e1045 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -723,17 +723,11 @@ class ParallelConfig:
         if self.distributed_executor_backend == "ray":
             from vllm.executor import ray_utils
             ray_utils.assert_ray_available()
-        if not self.disable_custom_all_reduce and self.world_size > 1:
-            if is_hip():
-                self.disable_custom_all_reduce = True
-                logger.info(
-                    "Disabled the custom all-reduce kernel because it is not "
-                    "supported on AMD GPUs.")
-            elif self.pipeline_parallel_size > 1:
-                self.disable_custom_all_reduce = True
-                logger.info(
-                    "Disabled the custom all-reduce kernel because it is not "
-                    "supported with pipeline parallelism.")
+        if is_hip():
+            self.disable_custom_all_reduce = True
+            logger.info(
+                "Disabled the custom all-reduce kernel because it is not "
+                "supported on AMD GPUs.")
         if self.ray_workers_use_nsight and (
                 not self.distributed_executor_backend == "ray"):
             raise ValueError("Unable to use nsight profiling unless workers "
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index faf9177ad..66ffe6e8a 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -719,14 +719,19 @@ def init_world_group(ranks: List[int], local_rank: int,
     )
 
 
-def init_model_parallel_group(group_ranks: List[List[int]], local_rank: int,
-                              backend: str) -> GroupCoordinator:
+def init_model_parallel_group(
+        group_ranks: List[List[int]],
+        local_rank: int,
+        backend: str,
+        use_custom_allreduce: Optional[bool] = None) -> GroupCoordinator:
+    if use_custom_allreduce is None:
+        use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
     return GroupCoordinator(
         group_ranks=group_ranks,
         local_rank=local_rank,
         torch_distributed_backend=backend,
         use_pynccl=True,
-        use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE,
+        use_custom_allreduce=use_custom_allreduce,
     )
 
 
@@ -888,8 +893,11 @@ def initialize_model_parallel(
     for i in range(num_pipeline_model_parallel_groups):
         ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
         group_ranks.append(ranks)
+    # pipeline parallel does not need custom allreduce
     _PP = init_model_parallel_group(group_ranks,
-                                    get_world_group().local_rank, backend)
+                                    get_world_group().local_rank,
+                                    backend,
+                                    use_custom_allreduce=False)
 
 
 def ensure_model_parallel_initialized(
-- 
GitLab


From d9e98f42e434d1f1a0f8ceed363047060aca6262 Mon Sep 17 00:00:00 2001
From: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com>
Date: Wed, 3 Jul 2024 15:14:16 -0700
Subject: [PATCH 254/376] [vlm] Remove vision language config. (#6089)

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
---
 .../dev/multimodal/multimodal_index.rst       |  5 ++
 docs/source/models/vlm.rst                    | 78 +++++++++----------
 examples/llava_example.py                     |  7 +-
 examples/llava_next_example.py                |  8 +-
 examples/openai_vision_api_client.py          |  3 -
 examples/phi3v_example.py                     |  6 +-
 .../distributed/test_multimodal_broadcast.py  |  6 +-
 tests/entrypoints/openai/test_vision.py       |  6 --
 tests/models/test_llava.py                    | 60 ++++----------
 tests/models/test_llava_next.py               | 54 ++++---------
 tests/models/test_phi3v.py                    | 54 ++++---------
 vllm/config.py                                | 38 ++-------
 vllm/engine/arg_utils.py                      | 59 ++------------
 vllm/engine/llm_engine.py                     | 16 ++--
 vllm/entrypoints/llm.py                       |  5 ++
 vllm/entrypoints/openai/serving_chat.py       | 21 ++---
 vllm/executor/cpu_executor.py                 |  2 +-
 vllm/executor/executor_base.py                | 13 ++--
 vllm/executor/gpu_executor.py                 |  2 +-
 vllm/executor/openvino_executor.py            |  2 +-
 vllm/executor/ray_xpu_executor.py             | 10 +--
 vllm/executor/tpu_executor.py                 |  2 +-
 vllm/executor/xpu_executor.py                 |  8 +-
 vllm/inputs/registry.py                       |  4 +-
 vllm/model_executor/model_loader/__init__.py  |  8 +-
 vllm/model_executor/model_loader/loader.py    | 48 ++++++------
 vllm/model_executor/models/interfaces.py      |  6 +-
 vllm/model_executor/models/llava.py           | 21 +++--
 vllm/model_executor/models/llava_next.py      | 53 +++++++++++--
 vllm/model_executor/models/phi3v.py           | 62 +++++++++++----
 vllm/multimodal/registry.py                   |  7 ++
 vllm/spec_decode/draft_model_runner.py        |  8 +-
 vllm/worker/cpu_model_runner.py               | 25 +++---
 vllm/worker/cpu_worker.py                     | 10 +--
 vllm/worker/embedding_model_runner.py         |  8 +-
 vllm/worker/model_runner.py                   | 28 ++++---
 vllm/worker/openvino_model_runner.py          |  8 +-
 vllm/worker/openvino_worker.py                | 10 +--
 vllm/worker/tpu_model_runner.py               |  8 +-
 vllm/worker/tpu_worker.py                     |  8 +-
 vllm/worker/worker.py                         | 13 ++--
 vllm/worker/xpu_model_runner.py               | 25 +++---
 vllm/worker/xpu_worker.py                     | 13 ++--
 43 files changed, 372 insertions(+), 466 deletions(-)

diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst
index d01f39284..c2d1b771e 100644
--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
@@ -10,8 +10,13 @@ vLLM provides experimental support for multi-modal models through the :mod:`vllm
 :class:`vllm.inputs.PromptStrictInputs` accepts an additional attribute ``multi_modal_data``
 which allows you to pass in multi-modal input alongside text and token prompts.
 
+.. note::
+   ``multi_modal_data`` can accept keys and values beyond the builtin ones, as long as a customized plugin is registered through 
+    :class:`vllm.multimodal.MULTIMODAL_REGISTRY`.
+
 By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model, please follow :ref:`the guide for adding a new multimodal model. <adding_a_new_multimodal_model>`.
 
+
 # TODO: Add more instructions on how to do that once embeddings is in.
 
 Guides
diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index f8c61018a..f9e5dbea1 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -8,18 +8,6 @@ vLLM provides experimental support for Vision Language Models (VLMs). This docum
 .. important::
     We are actively iterating on VLM support. Expect breaking changes to VLM usage and development in upcoming releases without prior deprecation.
 
-Engine Arguments
-----------------
-
-The following :ref:`engine arguments <engine_args>` are specific to VLMs:
-
-.. argparse::
-    :module: vllm.engine.arg_utils
-    :func: _vlm_engine_args_parser
-    :prog: -m vllm.entrypoints.openai.api_server
-    :nodefaultconst:
-
-.. important::
     Currently, the support for vision language models on vLLM has the following limitations:
 
     * Only single image input is supported per text prompt.
@@ -33,20 +21,17 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
 
 .. code-block:: python
 
-    llm = LLM(
-        model="llava-hf/llava-1.5-7b-hf",
-        image_token_id=32000,
-        image_input_shape="1,3,336,336",
-        image_feature_size=576,
-    )
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
 
 .. important::
-    Currently, you have to specify ``image_feature_size`` to support memory profiling.
-    To avoid OOM during runtime, you should set this to the maximum value supported by the model.
-    The calculation of feature size is specific to the model. For more details, please refer to
-    the function :code:`get_<model_name>_image_feature_size` inside the corresponding model file.
+    We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow
+    the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified, and internally we will construct data structures for
+    every model to perform profiling with.
 
-    We will remove most of the vision-specific arguments in a future release as they can be inferred from the HuggingFace configuration.
+    This work is still ongoing. In the meantime, we internally hardcode ``image_feature_size = 3000`` through 
+    :meth:`MULTIMODAL_REGISTRY.get_num_input_tokens <vllm.multimodal.MultiModalRegistry.get_num_input_tokens>` 
+    for every model to be conservative in terms of GPU memory consumption. This hardcoded value will be replaced 
+    with a more accurate profiling strategy in the future.
 
 
 To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
@@ -54,19 +39,15 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptS
 * ``prompt``: The prompt should follow the format that is documented on HuggingFace.
 * ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`. 
 
-.. note::
-
-   ``multi_modal_data`` can accept keys and values beyond the builtin ones, as long as a customized plugin is registered through
-    :class:`vllm.multimodal.MULTIMODAL_REGISTRY`.
-
 .. code-block:: python
 
     # Refer to the HuggingFace repo for the correct format to use
     prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
 
     # Load the image using PIL.Image
-    image = ...
-
+    image = PIL.Image.open(...)
+    
+    # Single prompt inference
     outputs = llm.generate({
         "prompt": prompt,
         "multi_modal_data": {"image": image},
@@ -75,6 +56,26 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptS
     for o in outputs:
         generated_text = o.outputs[0].text
         print(generated_text)
+    
+    # Batch inference
+    image_1 = PIL.Image.open(...)
+    image_2 = PIL.Image.open(...)
+    outputs = llm.generate(
+        [
+            {
+                "prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
+                "multi_modal_data": {"image": image_1},
+            },
+            {
+                "prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
+                "multi_modal_data": {"image": image_2},
+            }
+        ]
+    )
+
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
 
 A code example can be found in `examples/llava_example.py <https://github.com/vllm-project/vllm/blob/main/examples/llava_example.py>`_.
 
@@ -99,18 +100,17 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with
 
     python -m vllm.entrypoints.openai.api_server \
         --model llava-hf/llava-1.5-7b-hf \
-        --image-token-id 32000 \
-        --image-input-shape 1,3,336,336 \
-        --image-feature-size 576 \
         --chat-template template_llava.jinja
 
 .. important::
-    Currently, you have to specify ``image_feature_size`` to support memory profiling.
-    To avoid OOM during runtime, you should set this to the maximum value supported by the model.
-    The calculation of feature size is specific to the model. For more details, please refer to
-    the function :code:`get_<model_name>_image_feature_size` inside the corresponding model file.
-
-    We will remove most of the vision-specific arguments in a future release as they can be inferred from the HuggingFace configuration.
+    We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow
+    the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified, and internally we will construct data structures for
+    every model to perform profiling with.
+
+    This work is still ongoing. In the meantime, we internally hardcode ``image_feature_size = 3000`` through 
+    :meth:`MULTIMODAL_REGISTRY.get_num_input_tokens <vllm.multimodal.MultiModalRegistry.get_num_input_tokens>` 
+    for every model to be conservative in terms of GPU memory consumption. This hardcoded value will be replaced 
+    with a more accurate profiling strategy in the future.
 
 To consume the server, you can use the OpenAI client like in the example below:
 
diff --git a/examples/llava_example.py b/examples/llava_example.py
index f5cb2a661..382d153cf 100644
--- a/examples/llava_example.py
+++ b/examples/llava_example.py
@@ -10,12 +10,7 @@ from vllm import LLM
 
 
 def run_llava():
-    llm = LLM(
-        model="llava-hf/llava-1.5-7b-hf",
-        image_token_id=32000,
-        image_input_shape="1,3,336,336",
-        image_feature_size=576,
-    )
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
 
     prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
 
diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py
index 20d4791ff..fd53a6def 100644
--- a/examples/llava_next_example.py
+++ b/examples/llava_next_example.py
@@ -7,13 +7,7 @@ from vllm import LLM, SamplingParams
 
 
 def run_llava_next():
-    llm = LLM(
-        model="llava-hf/llava-v1.6-mistral-7b-hf",
-        image_token_id=32000,
-        image_input_shape="1,3,336,336",
-        # Use the maximum possible value for memory profiling
-        image_feature_size=2928,
-    )
+    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=4096)
 
     prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
     url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py
index fcda1345f..d4d9738a1 100644
--- a/examples/openai_vision_api_client.py
+++ b/examples/openai_vision_api_client.py
@@ -3,9 +3,6 @@
 Launch the vLLM server with the following command:
 python -m vllm.entrypoints.openai.api_server \
     --model llava-hf/llava-1.5-7b-hf \
-    --image-token-id 32000 \
-    --image-input-shape 1,3,336,336 \
-    --image-feature-size 576 \
     --chat-template template_llava.jinja
 """
 import base64
diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py
index 0aabfee6a..b605d4c6d 100644
--- a/examples/phi3v_example.py
+++ b/examples/phi3v_example.py
@@ -14,15 +14,13 @@ def run_phi3v():
 
     # Note: The default setting of max_num_seqs (256) and
     # max_model_len (128k) for this model may cause OOM.
+    # You may lower either to run this example on lower-end GPUs.
+
     # In this example, we override max_num_seqs to 5 while
     # keeping the original context length of 128k.
     llm = LLM(
         model=model_path,
         trust_remote_code=True,
-        image_token_id=32044,
-        image_input_shape="1,3,1008,1344",
-        # Use the maximum possible value for memory profiling
-        image_feature_size=2653,
         max_num_seqs=5,
     )
 
diff --git a/tests/distributed/test_multimodal_broadcast.py b/tests/distributed/test_multimodal_broadcast.py
index 1d143a852..8e0e8ecd6 100644
--- a/tests/distributed/test_multimodal_broadcast.py
+++ b/tests/distributed/test_multimodal_broadcast.py
@@ -20,9 +20,9 @@ from vllm.utils import cuda_device_count_stateless
 model = os.environ["TEST_DIST_MODEL"]
 
 if model.startswith("llava-hf/llava"):
-    from ..models.test_llava import model_and_vl_config, run_test
+    from ..models.test_llava import models, run_test
 elif model.startswith("microsoft/Phi-3-vision"):
-    from ..models.test_phi3v import model_and_vl_config, run_test
+    from ..models.test_phi3v import models, run_test
 else:
     raise NotImplementedError(f"Unsupported model: {model}")
 
@@ -44,7 +44,7 @@ def test_models(hf_runner, vllm_runner, image_assets,
         hf_runner,
         vllm_runner,
         image_assets,
-        model_and_config=model_and_vl_config[0],
+        model=models[0],
         size_factors=[1.0],
         dtype=dtype,
         max_tokens=max_tokens,
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index 7200b94f8..b86971760 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -39,12 +39,6 @@ def server(ray_ctx):
         "--max-model-len",
         "4096",
         "--enforce-eager",
-        "--image-token-id",
-        "32000",
-        "--image-input-shape",
-        "1,3,336,336",
-        "--image-feature-size",
-        "576",
         "--chat-template",
         str(LLAVA_CHAT_TEMPLATE),
     ])
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index 2f4b85bc1..2c0a8d4ff 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -3,7 +3,6 @@ from typing import List, Optional, Tuple, Type
 import pytest
 from transformers import AutoTokenizer
 
-from vllm.config import VisionLanguageConfig
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 
@@ -21,49 +20,27 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "USER: <image>\nWhat's in this image?\nASSISTANT:",
 })
 
+IMAGE_TOKEN_ID = 32000
 
-def iter_llava_configs(model_name: str):
-    image_hw_to_feature_size = {
-        (336, 336): 576,
-    }
-
-    for (h, w), f in image_hw_to_feature_size.items():
-        input_shape = (1, 3, h, w)
-        yield (model_name,
-               VisionLanguageConfig(image_feature_size=f,
-                                    image_token_id=32000,
-                                    image_input_shape=input_shape))
-
-
-model_and_vl_config = [
-    *iter_llava_configs("llava-hf/llava-1.5-7b-hf"),
-]
+models = ["llava-hf/llava-1.5-7b-hf"]
 
 
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
                                          Optional[SampleLogprobs]],
-                      vlm_config: VisionLanguageConfig, model_id: str):
-    """Sanitize vllm output to be comparable with hf output.
-    The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
-    x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
-    It also reduces `output_str` from "<image><image>bla" to "bla".
-    """
+                      model: str):
+    """Sanitize vllm output to be comparable with hf output."""
     output_ids, output_str, out_logprobs = vllm_output
-    image_token_id = vlm_config.image_token_id
 
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    image_token_str = tokenizer.decode(image_token_id)
+    tokenizer = AutoTokenizer.from_pretrained(model)
     eos_token_id = tokenizer.eos_token_id
 
     hf_output_ids = [
         token_id for idx, token_id in enumerate(output_ids)
-        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
+        if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID
     ]
 
-    hf_output_str = output_str \
-        .replace(image_token_str * vlm_config.image_feature_size, "")
-    assert hf_output_str[0] == " "
-    hf_output_str = hf_output_str[1:]
+    assert output_str[0] == " "
+    hf_output_str = output_str[1:]
     if hf_output_ids[-1] == eos_token_id:
         hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
 
@@ -74,7 +51,7 @@ def run_test(
     hf_runner: Type[HfRunner],
     vllm_runner: Type[VllmRunner],
     image_assets: _ImageAssets,
-    model_and_config: Tuple[str, VisionLanguageConfig],
+    model: str,
     *,
     size_factors: List[float],
     dtype: str,
@@ -92,7 +69,6 @@ def run_test(
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
-    model_id, vlm_config = model_and_config
     images = [asset.pil_image for asset in image_assets]
 
     inputs_per_image = [(
@@ -106,12 +82,11 @@ def run_test(
     # will hurt multiprocessing backend with fork method (the default method).
 
     # max_model_len should be greater than image_feature_size
-    with vllm_runner(model_id,
+    with vllm_runner(model,
                      dtype=dtype,
                      tensor_parallel_size=tensor_parallel_size,
                      distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True,
-                     **vlm_config.as_cli_args_dict()) as vllm_model:
+                     enforce_eager=True) as vllm_model:
         vllm_outputs_per_image = [
             vllm_model.generate_greedy_logprobs(prompts,
                                                 max_tokens,
@@ -120,7 +95,7 @@ def run_test(
             for prompts, images in inputs_per_image
         ]
 
-    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
+    with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model:
         hf_outputs_per_image = [
             hf_model.generate_greedy_logprobs_limit(prompts,
                                                     max_tokens,
@@ -136,7 +111,7 @@ def run_test(
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=[
-                vllm_to_hf_output(vllm_output, vlm_config, model_id)
+                vllm_to_hf_output(vllm_output, model)
                 for vllm_output in vllm_outputs
             ],
             name_0="hf",
@@ -144,7 +119,7 @@ def run_test(
         )
 
 
-@pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
     "size_factors",
     [
@@ -161,14 +136,13 @@ def run_test(
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                size_factors, dtype: str, max_tokens: int,
-                num_logprobs: int) -> None:
+def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
+                dtype: str, max_tokens: int, num_logprobs: int) -> None:
     run_test(
         hf_runner,
         vllm_runner,
         image_assets,
-        model_and_config,
+        model,
         size_factors=size_factors,
         dtype=dtype,
         max_tokens=max_tokens,
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 8817f41a6..bf911b5c6 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -4,7 +4,6 @@ from typing import List, Optional, Tuple
 import pytest
 from transformers import AutoTokenizer
 
-from vllm.config import VisionLanguageConfig
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 
@@ -27,46 +26,22 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     f"{_PREFACE} USER: <image>\nWhat's in this image? ASSISTANT:",
 })
 
-
-def iter_llava_next_configs(model_name: str):
-    # Need to use the max possible feature size for profile_run
-    image_hw_to_feature_size = {
-        (336, 336): 2928,
-    }
-
-    for (h, w), f in image_hw_to_feature_size.items():
-        input_shape = (1, 3, h, w)
-        yield (model_name,
-               VisionLanguageConfig(
-                   image_feature_size=f,
-                   image_token_id=32000,
-                   image_input_shape=input_shape,
-               ))
-
-
-model_and_vl_config = [
-    *iter_llava_next_configs("llava-hf/llava-v1.6-vicuna-7b-hf"),
-]
+IMAGE_TOKEN_ID = 32000
 
 
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
                                          Optional[SampleLogprobs]],
-                      vlm_config: VisionLanguageConfig, model_id: str):
-    """Sanitize vllm output to be comparable with hf output.
-    The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
-    x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
-    It also reduces `output_str` from "<image><image>bla" to "bla".
-    """
+                      model: str):
+    """Sanitize vllm output to be comparable with hf output."""
     output_ids, output_str, out_logprobs = vllm_output
-    image_token_id = vlm_config.image_token_id
 
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    image_token_str = tokenizer.decode(image_token_id)
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    image_token_str = tokenizer.decode(IMAGE_TOKEN_ID)
     eos_token_id = tokenizer.eos_token_id
 
     hf_output_ids = [
         token_id for idx, token_id in enumerate(output_ids)
-        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
+        if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID
     ]
 
     hf_output_str = re.sub(fr"({image_token_str})+", "", output_str)
@@ -78,7 +53,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
     return hf_output_ids, hf_output_str, out_logprobs
 
 
-@pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize("model", ["llava-hf/llava-v1.6-vicuna-7b-hf"])
 @pytest.mark.parametrize(
     "size_factors",
     [
@@ -95,9 +70,8 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                size_factors, dtype: str, max_tokens: int,
-                num_logprobs: int) -> None:
+def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
+                dtype, max_tokens, num_logprobs) -> None:
     """Inference result should be the same between hf and vllm.
 
     All the image fixtures for the test is under tests/images.
@@ -107,7 +81,6 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
-    model_id, vlm_config = model_and_config
     images = [asset.pil_image for asset in image_assets]
 
     inputs_per_image = [(
@@ -116,11 +89,10 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
 
     # max_model_len should be greater than image_feature_size
-    with vllm_runner(model_id,
+    with vllm_runner(model,
                      dtype=dtype,
                      max_model_len=4096,
-                     enforce_eager=True,
-                     **vlm_config.as_cli_args_dict()) as vllm_model:
+                     enforce_eager=True) as vllm_model:
         vllm_outputs_per_image = [
             vllm_model.generate_greedy_logprobs(prompts,
                                                 max_tokens,
@@ -129,7 +101,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
             for prompts, images in inputs_per_image
         ]
 
-    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
+    with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model:
         hf_outputs_per_image = [
             hf_model.generate_greedy_logprobs_limit(prompts,
                                                     max_tokens,
@@ -145,7 +117,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=[
-                vllm_to_hf_output(vllm_output, vlm_config, model_id)
+                vllm_to_hf_output(vllm_output, model)
                 for vllm_output in vllm_outputs
             ],
             name_0="hf",
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index f144f9755..cb32a047a 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -4,7 +4,6 @@ from typing import List, Optional, Tuple, Type
 import pytest
 from transformers import AutoTokenizer
 
-from vllm.config import VisionLanguageConfig
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 from vllm.utils import is_cpu
@@ -23,35 +22,14 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "<|user|>\n<|image_1|>\nWhat's in this image?<|end|>\n<|assistant|>\n",
 })
 
-
-def iter_phi3v_configs(model_name: str):
-    # Need to use the max possible feature size for profile_run
-    image_hw_to_feature_size = {
-        (1008, 1344): 2653,
-    }
-
-    for (h, w), f in image_hw_to_feature_size.items():
-        input_shape = (1, 3, h, w)
-        yield (model_name,
-               VisionLanguageConfig(image_feature_size=f,
-                                    image_token_id=32044,
-                                    image_input_shape=input_shape))
-
-
-model_and_vl_config = [
-    *iter_phi3v_configs("microsoft/Phi-3-vision-128k-instruct"),
-]
+models = ["microsoft/Phi-3-vision-128k-instruct"]
 
 
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
                                          Optional[SampleLogprobs]],
-                      vlm_config: VisionLanguageConfig, model_id: str):
-    """Sanitize vllm output to be comparable with hf output.
-    The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
-    x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
-    It also reduces `output_str` from "<image><image>bla" to "bla".
-    """
-    output_ids, output_str, out_logprobs = vllm_output
+                      model: str):
+    """Sanitize vllm output to be comparable with hf output."""
+    _, output_str, out_logprobs = vllm_output
 
     output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
     assert output_str_without_image[0] == " "
@@ -60,7 +38,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
     hf_output_str = output_str_without_image.replace("<|user|>", "") \
         .replace("<|end|>\n<|assistant|>", " ")
 
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    tokenizer = AutoTokenizer.from_pretrained(model)
     hf_output_ids = tokenizer.encode(output_str_without_image)
     assert hf_output_ids[0] == 1
     hf_output_ids = hf_output_ids[1:]
@@ -77,7 +55,7 @@ def run_test(
     hf_runner: Type[HfRunner],
     vllm_runner: Type[VllmRunner],
     image_assets: _ImageAssets,
-    model_and_config: Tuple[str, VisionLanguageConfig],
+    model: str,
     *,
     size_factors: List[float],
     dtype: str,
@@ -95,7 +73,6 @@ def run_test(
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
-    model_id, vlm_config = model_and_config
     images = [asset.pil_image for asset in image_assets]
 
     inputs_per_image = [(
@@ -109,13 +86,13 @@ def run_test(
     # will hurt multiprocessing backend with fork method (the default method).
 
     # max_model_len should be greater than image_feature_size
-    with vllm_runner(model_id,
+    with vllm_runner(model,
                      max_model_len=4096,
+                     max_num_seqs=1,
                      dtype=dtype,
                      tensor_parallel_size=tensor_parallel_size,
                      distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True,
-                     **vlm_config.as_cli_args_dict()) as vllm_model:
+                     enforce_eager=True) as vllm_model:
         vllm_outputs_per_image = [
             vllm_model.generate_greedy_logprobs(prompts,
                                                 max_tokens,
@@ -126,7 +103,7 @@ def run_test(
 
     # use eager mode for hf runner, since phi3_v didn't work with flash_attn
     hf_model_kwargs = {"_attn_implementation": "eager"}
-    with hf_runner(model_id, dtype=dtype,
+    with hf_runner(model, dtype=dtype,
                    model_kwargs=hf_model_kwargs) as hf_model:
         eos_token_id = hf_model.processor.tokenizer.eos_token_id
         hf_outputs_per_image = [
@@ -143,7 +120,7 @@ def run_test(
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=[
-                vllm_to_hf_output(vllm_output, vlm_config, model_id)
+                vllm_to_hf_output(vllm_output, model)
                 for vllm_output in vllm_outputs
             ],
             name_0="hf",
@@ -153,7 +130,7 @@ def run_test(
 
 # Since we use _attn_implementation="eager" for hf_runner, there is more
 # significant numerical difference. The basic `logprobs=5` fails to pass.
-@pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
     "size_factors",
     [
@@ -170,14 +147,13 @@ def run_test(
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
-def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                size_factors, dtype: str, max_tokens: int,
-                num_logprobs: int) -> None:
+def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
+                dtype: str, max_tokens: int, num_logprobs: int) -> None:
     run_test(
         hf_runner,
         vllm_runner,
         image_assets,
-        model_and_config,
+        model,
         size_factors=size_factors,
         dtype=dtype,
         max_tokens=max_tokens,
diff --git a/vllm/config.py b/vllm/config.py
index 1eb5e1045..0004622ce 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1,8 +1,7 @@
 import enum
 import json
 from dataclasses import dataclass, field, fields
-from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Tuple,
-                    Union)
+from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Union
 
 import torch
 from transformers import PretrainedConfig
@@ -120,7 +119,7 @@ class ModelConfig:
         disable_sliding_window: bool = False,
         skip_tokenizer_init: bool = False,
         served_model_name: Optional[Union[str, List[str]]] = None,
-        multimodal_config: Optional["VisionLanguageConfig"] = None,
+        multimodal_config: Optional["MultiModalConfig"] = None,
     ) -> None:
         self.model = model
         self.tokenizer = tokenizer
@@ -1289,35 +1288,12 @@ class LoRAConfig:
             raise ValueError("LoRA is not supported with chunked prefill yet.")
 
 
-# TODO: To be replaced by MultiModalConfig.
 @dataclass
-class VisionLanguageConfig:
+class MultiModalConfig:
     """Configs the input data format and how models should run for
-    vision language models."""
-    # The input id corresponding to image token.
-    image_token_id: int
-    # Used for running `run_prefill_max_token`.
-    # For models that support varying resolution, this corresponds to
-    # worst case scenario (biggest supported resolution).
-    image_input_shape: tuple
-    image_feature_size: int
-
-    def as_cli_args_dict(self) -> Dict[str, Any]:
-        """Flatten vision language config to pure args.
-
-        Compatible with what llm entrypoint expects.
-        """
-        result: Dict[str, Any] = {}
-        for f in fields(self):
-            value = getattr(self, f.name)
-            if isinstance(value, enum.Enum):
-                result[f.name] = value.name.lower()
-            elif isinstance(value, tuple):
-                result[f.name] = ",".join([str(item) for item in value])
-            else:
-                result[f.name] = value
-
-        return result
+    multimodal models."""
+    # TODO: Add configs to init vision tower or not.
+    pass
 
 
 _STR_DTYPE_TO_TORCH_DTYPE = {
@@ -1541,7 +1517,7 @@ class EngineConfig:
     device_config: DeviceConfig
     load_config: LoadConfig
     lora_config: Optional[LoRAConfig]
-    vision_language_config: Optional[VisionLanguageConfig]
+    multimodal_config: Optional[MultiModalConfig]
     speculative_config: Optional[SpeculativeConfig]
     decoding_config: Optional[DecodingConfig]
     observability_config: Optional[ObservabilityConfig]
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 565b9e779..afa6892d4 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -6,11 +6,11 @@ from typing import List, Optional, Tuple, Union
 
 from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
                          EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
-                         ObservabilityConfig, ParallelConfig, SchedulerConfig,
-                         SpeculativeConfig, TokenizerPoolConfig,
-                         VisionLanguageConfig)
+                         MultiModalConfig, ObservabilityConfig, ParallelConfig,
+                         SchedulerConfig, SpeculativeConfig,
+                         TokenizerPoolConfig)
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-from vllm.utils import FlexibleArgumentParser, str_to_int_tuple
+from vllm.utils import FlexibleArgumentParser
 
 
 def nullable_str(val: str):
@@ -78,11 +78,6 @@ class EngineArgs:
     model_loader_extra_config: Optional[dict] = None
     preemption_mode: Optional[str] = None
 
-    # Related to Vision-language models such as llava
-    image_token_id: Optional[int] = None
-    image_input_shape: Optional[str] = None
-    image_feature_size: Optional[int] = None
-
     scheduler_delay_factor: float = 0.0
     enable_chunked_prefill: bool = False
 
@@ -106,27 +101,6 @@ class EngineArgs:
         if self.tokenizer is None:
             self.tokenizer = self.model
 
-    @staticmethod
-    def add_cli_args_for_vlm(
-            parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
-        parser.add_argument('--image-token-id',
-                            type=int,
-                            default=None,
-                            help=('Input id for image token.'))
-        parser.add_argument(
-            '--image-input-shape',
-            type=nullable_str,
-            default=None,
-            help=('The biggest image input shape (worst for memory footprint) '
-                  'given an input type. Only used for vLLM\'s profile_run.'))
-        parser.add_argument(
-            '--image-feature-size',
-            type=int,
-            default=None,
-            help=('The image feature size along the context dimension.'))
-
-        return parser
-
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         """Shared CLI arguments for vLLM engine."""
@@ -484,9 +458,6 @@ class EngineArgs:
                             ],
                             help='Device type for vLLM execution.')
 
-        # Related to Vision-language models such as llava
-        parser = EngineArgs.add_cli_args_for_vlm(parser)
-
         parser.add_argument(
             '--scheduler-delay-factor',
             type=float,
@@ -648,19 +619,7 @@ class EngineArgs:
             raise ValueError(
                 "BitsAndBytes load format and QLoRA adapter only support "
                 f"'bitsandbytes' quantization, but got {self.quantization}")
-        if self.image_token_id is not None:
-            if (not self.image_input_shape or not self.image_feature_size):
-                raise ValueError(
-                    'Specify `image_input_shape` and '
-                    '`image_feature_size` together with `image_token_id`.')
-
-            vision_language_config = VisionLanguageConfig(
-                image_token_id=self.image_token_id,
-                image_input_shape=str_to_int_tuple(self.image_input_shape),
-                image_feature_size=self.image_feature_size,
-            )
-        else:
-            vision_language_config = None
+        multimodal_config = MultiModalConfig()
 
         device_config = DeviceConfig(device=self.device)
         model_config = ModelConfig(
@@ -685,7 +644,7 @@ class EngineArgs:
             disable_sliding_window=self.disable_sliding_window,
             skip_tokenizer_init=self.skip_tokenizer_init,
             served_model_name=self.served_model_name,
-            multimodal_config=vision_language_config)
+            multimodal_config=multimodal_config)
         cache_config = CacheConfig(
             block_size=self.block_size,
             gpu_memory_utilization=self.gpu_memory_utilization,
@@ -787,7 +746,7 @@ class EngineArgs:
             scheduler_config=scheduler_config,
             device_config=device_config,
             lora_config=lora_config,
-            vision_language_config=vision_language_config,
+            multimodal_config=multimodal_config,
             speculative_config=speculative_config,
             load_config=load_config,
             decoding_config=decoding_config,
@@ -831,7 +790,3 @@ def _engine_args_parser():
 def _async_engine_args_parser():
     return AsyncEngineArgs.add_cli_args(FlexibleArgumentParser(),
                                         async_args_only=True)
-
-
-def _vlm_engine_args_parser():
-    return EngineArgs.add_cli_args_for_vlm(FlexibleArgumentParser())
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index a7428d010..de7604ece 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -7,9 +7,9 @@ from typing import Set, Type, TypeVar, Union
 from transformers import PreTrainedTokenizer
 
 from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig,
-                         LoRAConfig, ModelConfig, ObservabilityConfig,
-                         ParallelConfig, SchedulerConfig, SpeculativeConfig,
-                         VisionLanguageConfig)
+                         LoRAConfig, ModelConfig, MultiModalConfig,
+                         ObservabilityConfig, ParallelConfig, SchedulerConfig,
+                         SpeculativeConfig)
 from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler,
                                  SchedulerOutputs)
 from vllm.engine.arg_utils import EngineArgs
@@ -87,8 +87,8 @@ class LLMEngine:
         scheduler_config: The configuration related to the request scheduler.
         device_config: The configuration related to the device.
         lora_config (Optional): The configuration related to serving multi-LoRA.
-        vision_language_config (Optional): The configuration related to vision
-            language models.
+        multimodal_config (Optional): The configuration related to multimodal 
+            models.
         speculative_config (Optional): The configuration related to speculative
             decoding.
         executor_class: The model executor class for managing distributed
@@ -157,7 +157,7 @@ class LLMEngine:
         device_config: DeviceConfig,
         load_config: LoadConfig,
         lora_config: Optional[LoRAConfig],
-        vision_language_config: Optional[VisionLanguageConfig],
+        multimodal_config: Optional[MultiModalConfig],
         speculative_config: Optional[SpeculativeConfig],
         decoding_config: Optional[DecodingConfig],
         observability_config: Optional[ObservabilityConfig],
@@ -215,7 +215,7 @@ class LLMEngine:
         self.model_config = model_config
         self.cache_config = cache_config
         self.lora_config = lora_config
-        self.vision_language_config = vision_language_config
+        self.multimodal_config = multimodal_config
         self.parallel_config = parallel_config
         self.scheduler_config = scheduler_config
         self.device_config = device_config
@@ -247,7 +247,7 @@ class LLMEngine:
             scheduler_config=scheduler_config,
             device_config=device_config,
             lora_config=lora_config,
-            vision_language_config=vision_language_config,
+            multimodal_config=multimodal_config,
             speculative_config=speculative_config,
             load_config=load_config,
         )
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 9e9234931..e3e506d49 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -121,6 +121,11 @@ class LLM:
     ) -> None:
         if "disable_log_stats" not in kwargs:
             kwargs["disable_log_stats"] = True
+        removed_vision_keys = ("image_token_id", "image_feature_size",
+                               "image_input_shape", "image_input_type")
+        if any(k in kwargs for k in removed_vision_keys):
+            raise TypeError(
+                "There is no need to pass vision-related arguments anymore.")
         engine_args = EngineArgs(
             model=model,
             tokenizer=tokenizer,
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 06c82d5e8..415bdbbd7 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -109,23 +109,12 @@ class OpenAIServingChat(OpenAIServing):
                           "paligemma"):
             # These models do not use image tokens in the prompt
             return None
+        if model_type.startswith("llava"):
+            return self.tokenizer.decode(
+                self.model_config.hf_config.image_token_index)
 
-        # The default behaviour assumes that the image token is
-        # available to the tokenizer.
-        # (Suitable for LLaVA, Idefics2, DeepSeek-VL)
-        vlm_config = self.model_config.multimodal_config
-        if vlm_config is None:
-            raise ValueError(
-                "'image_url' input is not supported as the loaded "
-                "model is not multimodal.")
-
-        image_token_id = vlm_config.image_token_id
-        if vlm_config.image_token_id is None:
-            raise ValueError(
-                "'image_url' input is not supported as the loaded "
-                "model does not specify an image token.")
-
-        return self.tokenizer.decode(image_token_id)
+        else:
+            raise TypeError("Unknown model type: {model_type}")
 
     # TODO: Let user specify how to insert image tokens into prompt
     # (similar to chat template)
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index 6137cecd8..3b5621f70 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -46,7 +46,7 @@ class CPUExecutor(ExecutorBase):
             rank=0,
             distributed_init_method=distributed_init_method,
             lora_config=self.lora_config,
-            vision_language_config=self.vision_language_config,
+            multimodal_config=self.multimodal_config,
             kv_cache_dtype=self.cache_config.cache_dtype,
             is_driver_worker=True,
         )
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 9018c3295..2abb29c14 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -3,8 +3,8 @@ from abc import ABC, abstractmethod
 from typing import List, Optional, Set, Tuple
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
-                         SpeculativeConfig, VisionLanguageConfig)
+                         ModelConfig, MultiModalConfig, ParallelConfig,
+                         SchedulerConfig, SpeculativeConfig)
 from vllm.lora.request import LoRARequest
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 
@@ -26,7 +26,7 @@ class ExecutorBase(ABC):
         device_config: DeviceConfig,
         load_config: LoadConfig,
         lora_config: Optional[LoRAConfig],
-        vision_language_config: Optional[VisionLanguageConfig],
+        multimodal_config: Optional[MultiModalConfig],
         speculative_config: Optional[SpeculativeConfig],
     ) -> None:
         self.model_config = model_config
@@ -36,7 +36,7 @@ class ExecutorBase(ABC):
         self.parallel_config = parallel_config
         self.scheduler_config = scheduler_config
         self.device_config = device_config
-        self.vision_language_config = vision_language_config
+        self.multimodal_config = multimodal_config
         self.speculative_config = speculative_config
 
         self._init_executor()
@@ -120,7 +120,7 @@ class ExecutorAsyncBase(ExecutorBase):
         device_config: DeviceConfig,
         load_config: LoadConfig,
         lora_config: Optional[LoRAConfig],
-        vision_language_config: Optional[VisionLanguageConfig],
+        multimodal_config: Optional[MultiModalConfig],
         speculative_config: Optional[SpeculativeConfig],
     ) -> None:
         # This locks each pipeline parallel stage so multiple virtual engines
@@ -132,8 +132,7 @@ class ExecutorAsyncBase(ExecutorBase):
 
         super().__init__(model_config, cache_config, parallel_config,
                          scheduler_config, device_config, load_config,
-                         lora_config, vision_language_config,
-                         speculative_config)
+                         lora_config, multimodal_config, speculative_config)
 
     @abstractmethod
     async def execute_model_async(
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index c2910ccdc..7d3183a42 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -43,7 +43,7 @@ class GPUExecutor(ExecutorBase):
             rank=rank,
             distributed_init_method=distributed_init_method,
             lora_config=self.lora_config,
-            vision_language_config=self.vision_language_config,
+            multimodal_config=self.multimodal_config,
             speculative_config=self.speculative_config,
             is_driver_worker=(not self.parallel_config)
             or (rank % self.parallel_config.tensor_parallel_size == 0),
diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py
index 8af375371..697d698b4 100644
--- a/vllm/executor/openvino_executor.py
+++ b/vllm/executor/openvino_executor.py
@@ -47,7 +47,7 @@ class OpenVINOExecutor(ExecutorBase):
             rank=0,
             distributed_init_method=distributed_init_method,
             lora_config=self.lora_config,
-            vision_language_config=self.vision_language_config,
+            multimodal_config=self.multimodal_config,
             kv_cache_dtype=self.cache_config.cache_dtype,
             is_driver_worker=True,
         )
diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py
index dd7c82289..f02d49783 100644
--- a/vllm/executor/ray_xpu_executor.py
+++ b/vllm/executor/ray_xpu_executor.py
@@ -7,8 +7,8 @@ from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Set,
                     Tuple, Union)
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
-                         SpeculativeConfig, VisionLanguageConfig)
+                         ModelConfig, MultiModalConfig, ParallelConfig,
+                         SchedulerConfig, SpeculativeConfig)
 from vllm.executor.distributed_gpu_executor import (  # yapf: disable
     DistributedGPUExecutor, DistributedGPUExecutorAsync)
 from vllm.executor.ray_utils import RayWorkerWrapper, ray
@@ -43,7 +43,7 @@ class RayXPUExecutor(DistributedGPUExecutor):
         device_config: DeviceConfig,
         load_config: LoadConfig,
         lora_config: Optional[LoRAConfig],
-        vision_language_config: Optional[VisionLanguageConfig],
+        multimodal_config: Optional[MultiModalConfig],
         speculative_config: Optional[SpeculativeConfig],
     ) -> None:
         assert device_config.device_type == "xpu"
@@ -57,7 +57,7 @@ class RayXPUExecutor(DistributedGPUExecutor):
         self.parallel_config = parallel_config
         self.scheduler_config = scheduler_config
         self.device_config = device_config
-        self.vision_language_config = vision_language_config
+        self.multimodal_config = multimodal_config
 
         placement_group = self.parallel_config.placement_group
 
@@ -199,7 +199,7 @@ class RayXPUExecutor(DistributedGPUExecutor):
                     rank=rank,
                     distributed_init_method=distributed_init_method,
                     lora_config=self.lora_config,
-                    vision_language_config=self.vision_language_config,
+                    multimodal_config=self.multimodal_config,
                     is_driver_worker=rank == 0,
                 ))
         self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py
index 7fe5349c9..6627ee698 100644
--- a/vllm/executor/tpu_executor.py
+++ b/vllm/executor/tpu_executor.py
@@ -50,7 +50,7 @@ class TPUExecutor(ExecutorBase):
             local_rank=local_rank,
             rank=rank,
             distributed_init_method=distributed_init_method,
-            vision_language_config=self.vision_language_config,
+            multimodal_config=self.multimodal_config,
             is_driver_worker=rank == 0,
         )
 
diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py
index d37200bd0..29b246332 100644
--- a/vllm/executor/xpu_executor.py
+++ b/vllm/executor/xpu_executor.py
@@ -3,8 +3,8 @@ from typing import List, Optional
 import torch
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
-                         SpeculativeConfig, VisionLanguageConfig)
+                         ModelConfig, MultiModalConfig, ParallelConfig,
+                         SchedulerConfig, SpeculativeConfig)
 from vllm.executor.executor_base import ExecutorAsyncBase
 from vllm.executor.gpu_executor import GPUExecutor
 from vllm.logger import init_logger
@@ -26,7 +26,7 @@ class XPUExecutor(GPUExecutor):
         device_config: DeviceConfig,
         load_config: LoadConfig,
         lora_config: Optional[LoRAConfig],
-        vision_language_config: Optional[VisionLanguageConfig],
+        multimodal_config: Optional[MultiModalConfig],
         speculative_config: Optional[SpeculativeConfig],
     ) -> None:
         assert device_config.device_type == "xpu"
@@ -42,7 +42,7 @@ class XPUExecutor(GPUExecutor):
         self.parallel_config = parallel_config
         self.scheduler_config = scheduler_config
         self.device_config = device_config
-        self.vision_language_config = vision_language_config
+        self.multimodal_config = multimodal_config
         self.speculative_config = None
 
         # Instantiate the worker and load the model to GPU.
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 936909eb3..2c87e3d92 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -11,7 +11,7 @@ from vllm.logger import init_logger
 from .data import LLMInputs
 
 if TYPE_CHECKING:
-    from vllm.config import ModelConfig, VisionLanguageConfig
+    from vllm.config import ModelConfig, MultiModalConfig
     from vllm.multimodal import MultiModalDataDict
     from vllm.sequence import SequenceData
 
@@ -30,7 +30,7 @@ class InputContext:
     model_config: "ModelConfig"
     """The configuration of the model."""
 
-    def get_multimodal_config(self) -> "VisionLanguageConfig":
+    def get_multimodal_config(self) -> "MultiModalConfig":
         """
         Get the multimodal configuration of the model.
 
diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py
index e3e32d61a..d10107a7f 100644
--- a/vllm/model_executor/model_loader/__init__.py
+++ b/vllm/model_executor/model_loader/__init__.py
@@ -3,8 +3,8 @@ from typing import Optional
 from torch import nn
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
-                         VisionLanguageConfig)
+                         ModelConfig, MultiModalConfig, ParallelConfig,
+                         SchedulerConfig)
 from vllm.model_executor.model_loader.loader import (BaseModelLoader,
                                                      get_model_loader)
 from vllm.model_executor.model_loader.utils import (
@@ -15,13 +15,13 @@ def get_model(*, model_config: ModelConfig, load_config: LoadConfig,
               device_config: DeviceConfig, parallel_config: ParallelConfig,
               scheduler_config: SchedulerConfig,
               lora_config: Optional[LoRAConfig],
-              vision_language_config: Optional[VisionLanguageConfig],
+              multimodal_config: Optional[MultiModalConfig],
               cache_config: CacheConfig) -> nn.Module:
     loader = get_model_loader(load_config)
     return loader.load_model(model_config=model_config,
                              device_config=device_config,
                              lora_config=lora_config,
-                             vision_language_config=vision_language_config,
+                             multimodal_config=multimodal_config,
                              parallel_config=parallel_config,
                              scheduler_config=scheduler_config,
                              cache_config=cache_config)
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 6f4dcf4a0..605479650 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -16,8 +16,8 @@ from huggingface_hub import HfApi, hf_hub_download
 from torch import nn
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoadFormat,
-                         LoRAConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig, VisionLanguageConfig)
+                         LoRAConfig, ModelConfig, MultiModalConfig,
+                         ParallelConfig, SchedulerConfig)
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.base_config import (
@@ -68,7 +68,7 @@ def _get_quantization_config(
 def _get_model_initialization_kwargs(
     model_class: Type[nn.Module],
     lora_config: Optional[LoRAConfig],
-    vlm_config: Optional[VisionLanguageConfig],
+    multimodal_config: Optional[MultiModalConfig],
 ) -> Dict[str, Any]:
     """Get extra kwargs for model initialization."""
     extra_kwargs: Dict[str, Any] = {}
@@ -84,18 +84,18 @@ def _get_model_initialization_kwargs(
             "please open an issue on github.")
 
     if supports_vision(model_class):
-        if vlm_config is None:
+        if multimodal_config is None:
             raise ValueError("Provide vision related configurations "
                              "through LLM entrypoint or engine arguments.")
 
-        extra_kwargs["vlm_config"] = vlm_config
+        extra_kwargs["multimodal_config"] = multimodal_config
 
     return extra_kwargs
 
 
 def _initialize_model(model_config: ModelConfig, load_config: LoadConfig,
                       lora_config: Optional[LoRAConfig],
-                      vision_language_config: Optional[VisionLanguageConfig],
+                      multimodal_config: Optional[MultiModalConfig],
                       cache_config: CacheConfig) -> nn.Module:
     """Initialize a model with the given configurations."""
     model_class = get_model_architecture(model_config)[0]
@@ -105,7 +105,7 @@ def _initialize_model(model_config: ModelConfig, load_config: LoadConfig,
                        cache_config=cache_config,
                        quant_config=quant_config,
                        **_get_model_initialization_kwargs(
-                           model_class, lora_config, vision_language_config))
+                           model_class, lora_config, multimodal_config))
 
 
 class BaseModelLoader(ABC):
@@ -118,7 +118,7 @@ class BaseModelLoader(ABC):
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],
-                   vision_language_config: Optional[VisionLanguageConfig],
+                   multimodal_config: Optional[MultiModalConfig],
                    parallel_config: ParallelConfig,
                    scheduler_config: SchedulerConfig,
                    cache_config: CacheConfig) -> nn.Module:
@@ -258,14 +258,14 @@ class DefaultModelLoader(BaseModelLoader):
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],
-                   vision_language_config: Optional[VisionLanguageConfig],
+                   multimodal_config: Optional[MultiModalConfig],
                    parallel_config: ParallelConfig,
                    scheduler_config: SchedulerConfig,
                    cache_config: CacheConfig) -> nn.Module:
         with set_default_torch_dtype(model_config.dtype):
             with torch.device(device_config.device):
                 model = _initialize_model(model_config, self.load_config,
-                                          lora_config, vision_language_config,
+                                          lora_config, multimodal_config,
                                           cache_config)
             model.load_weights(
                 self._get_weights_iterator(model_config.model,
@@ -298,14 +298,14 @@ class DummyModelLoader(BaseModelLoader):
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],
-                   vision_language_config: Optional[VisionLanguageConfig],
+                   multimodal_config: Optional[MultiModalConfig],
                    parallel_config: ParallelConfig,
                    scheduler_config: SchedulerConfig,
                    cache_config: CacheConfig) -> nn.Module:
         with set_default_torch_dtype(model_config.dtype):
             with torch.device(device_config.device):
                 model = _initialize_model(model_config, self.load_config,
-                                          lora_config, vision_language_config,
+                                          lora_config, multimodal_config,
                                           cache_config)
             # NOTE(woosuk): For accurate performance evaluation, we assign
             # random values to the weights.
@@ -339,7 +339,7 @@ class TensorizerLoader(BaseModelLoader):
         model_config: ModelConfig,
         device_config: DeviceConfig,
         lora_config: Optional[LoRAConfig],
-        vision_language_config: Optional[VisionLanguageConfig],
+        multimodal_config: Optional[MultiModalConfig],
         cache_config: CacheConfig,
     ) -> nn.Module:
         """Load a serialized model with tensorizer to the CPU.
@@ -352,7 +352,7 @@ class TensorizerLoader(BaseModelLoader):
         with set_default_torch_dtype(model_config.dtype):
             with torch.device(device_config.device):
                 model = _initialize_model(model_config, self.load_config,
-                                          lora_config, vision_language_config,
+                                          lora_config, multimodal_config,
                                           cache_config)
 
             model.load_weights(self._get_weights_iterator())
@@ -363,7 +363,7 @@ class TensorizerLoader(BaseModelLoader):
         model_config: ModelConfig,
         device_config: DeviceConfig,
         lora_config: Optional[LoRAConfig],
-        vision_language_config: Optional[VisionLanguageConfig],
+        multimodal_config: Optional[MultiModalConfig],
         cache_config: CacheConfig,
     ) -> nn.Module:
         """Load a serialized model with tensorizer.
@@ -377,7 +377,7 @@ class TensorizerLoader(BaseModelLoader):
                 quant_config = _get_quantization_config(
                     model_config, self.load_config)
                 extra_kwargs = _get_model_initialization_kwargs(
-                    model_class, lora_config, vision_language_config)
+                    model_class, lora_config, multimodal_config)
                 extra_kwargs["quant_config"] = quant_config
                 extra_kwargs["cache_config"] = cache_config
 
@@ -392,7 +392,7 @@ class TensorizerLoader(BaseModelLoader):
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],
-                   vision_language_config: Optional[VisionLanguageConfig],
+                   multimodal_config: Optional[MultiModalConfig],
                    parallel_config: ParallelConfig,
                    scheduler_config: SchedulerConfig,
                    cache_config: CacheConfig) -> nn.Module:
@@ -406,12 +406,10 @@ class TensorizerLoader(BaseModelLoader):
 
         if is_vllm_tensorized(self.tensorizer_config):
             return self._load_model_serialized(model_config, device_config,
-                                               lora_config,
-                                               vision_language_config,
+                                               lora_config, multimodal_config,
                                                cache_config)
         return self._load_model_serialized_cpu(model_config, device_config,
-                                               lora_config,
-                                               vision_language_config,
+                                               lora_config, multimodal_config,
                                                cache_config)
 
     @staticmethod
@@ -494,7 +492,7 @@ class ShardedStateLoader(BaseModelLoader):
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],
-                   vision_language_config: Optional[VisionLanguageConfig],
+                   multimodal_config: Optional[MultiModalConfig],
                    parallel_config: ParallelConfig,
                    scheduler_config: SchedulerConfig,
                    cache_config: CacheConfig) -> nn.Module:
@@ -508,7 +506,7 @@ class ShardedStateLoader(BaseModelLoader):
         with set_default_torch_dtype(model_config.dtype):
             with torch.device(device_config.device):
                 model = _initialize_model(model_config, self.load_config,
-                                          lora_config, vision_language_config,
+                                          lora_config, multimodal_config,
                                           cache_config)
             rank = get_tensor_model_parallel_rank()
             pattern = os.path.join(
@@ -804,14 +802,14 @@ class BitsAndBytesModelLoader(BaseModelLoader):
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],
-                   vision_language_config: Optional[VisionLanguageConfig],
+                   multimodal_config: Optional[MultiModalConfig],
                    parallel_config: ParallelConfig,
                    scheduler_config: SchedulerConfig,
                    cache_config: CacheConfig) -> nn.Module:
         with set_default_torch_dtype(model_config.dtype):
             with torch.device(device_config.device):
                 model = _initialize_model(model_config, self.load_config,
-                                          lora_config, vision_language_config,
+                                          lora_config, multimodal_config,
                                           cache_config)
 
                 self._load_weights(model_config, model)
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index cb0fc154a..2697a6996 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -3,7 +3,7 @@ from typing import (ClassVar, Dict, List, Literal, Optional, Protocol, Type,
 
 from typing_extensions import TypeGuard
 
-from vllm.config import LoRAConfig, VisionLanguageConfig
+from vllm.config import LoRAConfig, MultiModalConfig
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -22,7 +22,7 @@ class SupportsVision(Protocol):
         MRO of your model class.
     """
 
-    def __init__(self, *, vlm_config: VisionLanguageConfig) -> None:
+    def __init__(self, *, multimodal_config: MultiModalConfig) -> None:
         ...
 
 
@@ -32,7 +32,7 @@ class SupportsVision(Protocol):
 class _SupportsVisionType(Protocol):
     supports_vision: Literal[True]
 
-    def __call__(self, *, vlm_config: VisionLanguageConfig) -> None:
+    def __call__(self, *, multimodal_config: MultiModalConfig) -> None:
         ...
 
 
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 2588d8b06..526b080bf 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -5,7 +5,7 @@ import torch.nn as nn
 from transformers import CLIPVisionConfig, LlavaConfig
 
 from vllm.attention import AttentionMetadata
-from vllm.config import CacheConfig, VisionLanguageConfig
+from vllm.config import CacheConfig, MultiModalConfig
 from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -108,13 +108,13 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
 
     def __init__(self,
                  config: LlavaConfig,
-                 vlm_config: VisionLanguageConfig,
+                 multimodal_config: MultiModalConfig,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None) -> None:
         super().__init__()
 
         self.config = config
-        self.vlm_config = vlm_config
+        self.multimodal_config = multimodal_config
 
         # TODO: Optionally initializes this for supporting embeddings.
         self.vision_tower = CLIPVisionModel(config.vision_config)
@@ -138,14 +138,13 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
         self.sampler = Sampler()
 
     def _validate_image_data(self, data: torch.Tensor) -> torch.Tensor:
-        if list(data.shape[1:]) != list(self.vlm_config.image_input_shape[1:]):
+        if list(data.shape)[1:] != [
+                3, self.config.vision_config.image_size,
+                self.config.vision_config.image_size
+        ]:
             raise ValueError(
-                f"The expected image tensor shape is batch dimension plus "
-                f"{self.vlm_config.image_input_shape[1:]}. "
-                f"You supplied {data.shape}. "
-                f"If you are using vLLM's entrypoint, make sure your "
-                f"supplied image input is consistent with "
-                f"image_input_shape in engine args.")
+                "The expected image tensor shape is batch dimension plus "
+                "channel, height and width.")
 
         return data
 
@@ -244,7 +243,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
 
             inputs_embeds = merge_vision_embeddings(
                 input_ids, inputs_embeds, vision_embeddings,
-                self.vlm_config.image_token_id)
+                self.config.image_token_index)
 
             input_ids = None
         else:
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 92604cdf3..4b03a5f9f 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -1,4 +1,4 @@
-from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
+from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union
 
 import torch
 import torch.nn as nn
@@ -9,7 +9,7 @@ from transformers.models.llava_next.modeling_llava_next import (
 from typing_extensions import NotRequired
 
 from vllm.attention import AttentionMetadata
-from vllm.config import CacheConfig, VisionLanguageConfig
+from vllm.config import CacheConfig, MultiModalConfig
 from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -204,13 +204,13 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
 
     def __init__(self,
                  config: LlavaNextConfig,
-                 vlm_config: VisionLanguageConfig,
+                 multimodal_config: MultiModalConfig,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None) -> None:
         super().__init__()
 
         self.config = config
-        self.vlm_config = vlm_config
+        self.multimodal_config = multimodal_config
 
         # TODO: Optionally initializes this for supporting embeddings.
         self.vision_tower = CLIPVisionModel(config=config.vision_config)
@@ -244,6 +244,47 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
 
         return data
 
+    def _validate_pixel_values(
+        self, data: Union[torch.Tensor, List[torch.Tensor]]
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+
+        def _validate_shape(data: torch.Tensor):
+
+            dim = data.dim()
+            height = width = self.config.vision_config.image_size
+            # All 4d image tensors have the same number of patches,
+            # so data is a 5d batch of these tensors
+            if dim == 5:
+                if list(data.shape)[2:] != [
+                        3, self.config.vision_config.image_size,
+                        self.config.vision_config.image_size
+                ]:
+                    raise ValueError(
+                        "Expected pixel value tensor in shape of: (batch size, "
+                        f"patch number, 3, {height}, {width}), got {data.shape}"
+                    )
+
+            # 4d image tensors have different number of patches,
+            # so data is each individual tensor.
+            elif dim == 4:
+                if list(data.shape)[1:] != [
+                        3, self.config.vision_config.image_size,
+                        self.config.vision_config.image_size
+                ]:
+                    raise ValueError(
+                        "Expected pixel value tensor in shape of: (patch "
+                        f"number, 3, {height}, {width}), got {data.shape}")
+            else:
+                raise ValueError(
+                    f"Invalid pixel value tensor of shape {data.shape}")
+
+        if isinstance(data, torch.Tensor):
+            _validate_shape(data)
+        else:
+            [_validate_shape(d) for d in data]
+
+        return data
+
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[LlavaNextImagePixelInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
@@ -262,7 +303,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
 
         return LlavaNextImagePixelInputs(
             type="pixel_values",
-            data=pixel_values,
+            data=self._validate_pixel_values(pixel_values),
             image_sizes=self._validate_image_sizes(image_sizes),
         )
 
@@ -454,7 +495,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
 
             inputs_embeds = merge_vision_embeddings(
                 input_ids, inputs_embeds, vision_embeddings,
-                self.vlm_config.image_token_id)
+                self.config.image_token_index)
 
             input_ids = None
         else:
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 3d247c9ed..9f12a8b2b 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 import re
 from functools import lru_cache
-from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
+from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union
 
 import numpy as np
 import torch
@@ -24,7 +24,7 @@ from PIL import Image
 from transformers import CLIPVisionConfig, PretrainedConfig
 
 from vllm.attention import AttentionMetadata
-from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig
+from vllm.config import CacheConfig, ModelConfig, MultiModalConfig
 from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -50,6 +50,9 @@ _KEYS_TO_MODIFY_MAPPING = {
     "model.vision_embed_tokens": "vision_embed_tokens",
 }
 
+# Cannot find the following 2 numbers from hf config.
+_IMAGE_TOKEN_ID = 32044
+
 CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(dropout=0.0,
                                                      hidden_act="quick_gelu",
                                                      hidden_size=1024,
@@ -95,13 +98,10 @@ class Phi3ImageEmbeddingBase(nn.Module):
 class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
     """Phi3 Image embedding with HD transform."""
 
-    def __init__(self,
-                 vision_language_config: VisionLanguageConfig,
-                 config: PretrainedConfig,
-                 wte=None) -> None:
+    def __init__(self, config: PretrainedConfig, wte=None) -> None:
         super().__init__(wte)
 
-        self.image_token_id = vision_language_config.image_token_id
+        self.image_token_id = _IMAGE_TOKEN_ID
         # n_embed or hidden_size
         hidden_size = config.n_embd if hasattr(
             config, 'n_embd') else config.hidden_size
@@ -333,7 +333,7 @@ def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
     seq_data = dummy_seq_data_for_clip(
         CLIP_VIT_LARGE_PATCH14_336_CONFIG,
         seq_len,
-        image_token_id=32044,
+        image_token_id=_IMAGE_TOKEN_ID,
         image_feature_size_override=image_feature_size,
     )
     mm_data = dummy_image_for_clip(
@@ -370,7 +370,6 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
         return llm_inputs
 
     model_config = ctx.model_config
-    multimodal_config = ctx.get_multimodal_config()
     hf_config = ctx.get_hf_config(PretrainedConfig)
 
     image_data = multi_modal_data["image"]
@@ -407,7 +406,7 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
     new_token_ids: List[int] = []
     for i in range(len(prompt_token_ids) - len(image_1_token_ids) + 1):
         if prompt_token_ids[i:i + len(image_1_token_ids)] == image_1_token_ids:
-            new_token_ids.append(multimodal_config.image_token_id)
+            new_token_ids.append(_IMAGE_TOKEN_ID)
 
             # No need to further scan the list since we only replace once
             new_token_ids.extend(prompt_token_ids[i + len(image_1_token_ids):])
@@ -424,7 +423,7 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
         model_config,
         CLIP_VIT_LARGE_PATCH14_336_CONFIG,
         llm_inputs,
-        image_token_id=multimodal_config.image_token_id,
+        image_token_id=_IMAGE_TOKEN_ID,
         image_feature_size_override=image_feature_size,
     )
 
@@ -436,25 +435,53 @@ class Phi3VForCausalLM(nn.Module, SupportsVision):
 
     def __init__(self,
                  config: PretrainedConfig,
-                 vlm_config: VisionLanguageConfig,
+                 multimodal_config: MultiModalConfig,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None) -> None:
         super().__init__()
 
         self.config = config
-        self.vlm_config = vlm_config
+        self.multimodal_config = multimodal_config
 
         self.model = LlamaModel(config, cache_config, quant_config)
 
         # TODO: Optionally initializes this for supporting embeddings.
         self.vision_embed_tokens = Phi3HDImageEmbedding(
-            vlm_config, config, self.model.embed_tokens)
+            config, self.model.embed_tokens)
         self.lm_head = ParallelLMHead(config.vocab_size,
                                       config.hidden_size,
                                       quant_config=quant_config)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
+    def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor:
+        if list(data.shape[1:]) != [2]:
+            raise ValueError(
+                f"The expected image sizes shape is batch dimension plus "
+                f"{[2]}. You supplied {data.shape}.")
+
+        return data
+
+    def _validate_pixel_values(
+        self, data: Union[torch.Tensor, List[torch.Tensor]]
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+
+        def _validate_shape(data: torch.Tensor):
+            if list(data.shape)[2:] != [
+                    3, CLIP_VIT_LARGE_PATCH14_336_CONFIG.image_size,
+                    CLIP_VIT_LARGE_PATCH14_336_CONFIG.image_size
+            ]:
+                raise ValueError(
+                    "The expected pixel value tensor shape is batch dimension "
+                    "plus patch number, channel, height and width.")
+
+        if isinstance(data, torch.Tensor):
+            _validate_shape(data)
+        else:
+            [_validate_shape(d) for d in data]
+
+        return data
+
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[Phi3VImagePixelInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
@@ -471,9 +498,10 @@ class Phi3VForCausalLM(nn.Module, SupportsVision):
             raise ValueError("Incorrect type of image sizes. "
                              f"Got type: {type(image_sizes)}")
 
-        return Phi3VImagePixelInputs(type="pixel_values",
-                                     data=pixel_values,
-                                     image_sizes=image_sizes)
+        return Phi3VImagePixelInputs(
+            type="pixel_values",
+            data=self._validate_pixel_values(pixel_values),
+            image_sizes=self._validate_image_sizes(image_sizes))
 
     def forward(self,
                 input_ids: torch.Tensor,
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index f17b04149..bd4583ef5 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -120,3 +120,10 @@ class MultiModalRegistry:
         Create an input mapper (see :meth:`map_input`) for a specific model.
         """
         return functools.partial(self.map_input, model_config)
+
+    def get_num_input_tokens(self):
+        """
+        Get the number of input tokens for profiling purposes.
+        """
+        # TODO: Provide this number on a per model basis.
+        return 3000
diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py
index 1c7b8c07e..6a2cfc819 100644
--- a/vllm/spec_decode/draft_model_runner.py
+++ b/vllm/spec_decode/draft_model_runner.py
@@ -3,8 +3,8 @@ from typing import List, Optional
 import torch
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
-                         VisionLanguageConfig)
+                         ModelConfig, MultiModalConfig, ParallelConfig,
+                         SchedulerConfig)
 from vllm.logger import init_logger
 from vllm.sequence import (IntermediateTensors, SamplerOutput,
                            SequenceGroupMetadata)
@@ -47,7 +47,7 @@ class TP1DraftModelRunner(ModelRunner):
         lora_config: Optional[LoRAConfig],
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
-        vision_language_config: Optional[VisionLanguageConfig] = None,
+        multimodal_config: Optional[MultiModalConfig] = None,
         return_hidden_states: bool = False,
     ):
         if return_hidden_states:
@@ -65,7 +65,7 @@ class TP1DraftModelRunner(ModelRunner):
             lora_config=lora_config,
             kv_cache_dtype=kv_cache_dtype,
             is_driver_worker=is_driver_worker,
-            vision_language_config=vision_language_config,
+            multimodal_config=multimodal_config,
             return_hidden_states=return_hidden_states,
         )
 
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index d8397ac22..b4277ae82 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -7,8 +7,8 @@ from torch import nn
 
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
-                         VisionLanguageConfig)
+                         ModelConfig, MultiModalConfig, ParallelConfig,
+                         SchedulerConfig)
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
@@ -79,7 +79,7 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
         cache_config: CacheConfig,
         load_config: LoadConfig,
         lora_config: Optional[LoRAConfig],
-        vision_language_config: Optional[VisionLanguageConfig],
+        multimodal_config: Optional[MultiModalConfig],
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
         *args,
@@ -93,7 +93,7 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
         self.device_config = device_config
         self.cache_config = cache_config
         self.lora_config = lora_config
-        self.vision_language_config = vision_language_config
+        self.multimodal_config = multimodal_config
         self.load_config = load_config
         self.is_driver_worker = is_driver_worker
 
@@ -120,15 +120,14 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
         self.model: nn.Module  # Set after init_Model
 
     def load_model(self) -> None:
-        self.model = get_model(
-            model_config=self.model_config,
-            load_config=self.load_config,
-            device_config=self.device_config,
-            vision_language_config=self.vision_language_config,
-            lora_config=self.lora_config,
-            parallel_config=self.parallel_config,
-            scheduler_config=self.scheduler_config,
-            cache_config=self.cache_config)
+        self.model = get_model(model_config=self.model_config,
+                               load_config=self.load_config,
+                               device_config=self.device_config,
+                               multimodal_config=self.multimodal_config,
+                               lora_config=self.lora_config,
+                               parallel_config=self.parallel_config,
+                               scheduler_config=self.scheduler_config,
+                               cache_config=self.cache_config)
 
     def _prepare_prompt(
         self,
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 8089abd69..92279753d 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -6,8 +6,8 @@ import torch.distributed
 
 from vllm.attention import get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
-                         VisionLanguageConfig)
+                         ModelConfig, MultiModalConfig, ParallelConfig,
+                         SchedulerConfig)
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.logger import init_logger
@@ -131,7 +131,7 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         rank: int,
         distributed_init_method: str,
         lora_config: Optional[LoRAConfig] = None,
-        vision_language_config: Optional[VisionLanguageConfig] = None,
+        multimodal_config: Optional[MultiModalConfig] = None,
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
     ) -> None:
@@ -145,7 +145,7 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         self.rank = rank
         self.distributed_init_method = distributed_init_method
         self.lora_config = lora_config
-        self.vision_language_config = vision_language_config
+        self.multimodal_config = multimodal_config
         self.is_driver_worker = is_driver_worker
         if self.is_driver_worker:
             assert self.rank == 0, "The driver worker must have rank 0."
@@ -162,7 +162,7 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
             cache_config,
             load_config=self.load_config,
             lora_config=self.lora_config,
-            vision_language_config=self.vision_language_config,
+            multimodal_config=self.multimodal_config,
             kv_cache_dtype=kv_cache_dtype,
             is_driver_worker=is_driver_worker)
         # Uninitialized cache engine. Will be initialized by
diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py
index d3a2643cb..a3b31a1c0 100644
--- a/vllm/worker/embedding_model_runner.py
+++ b/vllm/worker/embedding_model_runner.py
@@ -4,8 +4,8 @@ from typing import Any, Dict, List, Optional, Tuple, Type
 import torch
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
-                         VisionLanguageConfig)
+                         ModelConfig, MultiModalConfig, ParallelConfig,
+                         SchedulerConfig)
 from vllm.logger import init_logger
 from vllm.model_executor.pooling_metadata import PoolingMetadata
 from vllm.pooling_params import PoolingParams
@@ -40,7 +40,7 @@ class EmbeddingModelRunner(
         lora_config: Optional[LoRAConfig],
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
-        vision_language_config: Optional[VisionLanguageConfig] = None,
+        multimodal_config: Optional[MultiModalConfig] = None,
     ):
         super().__init__(model_config,
                          parallel_config,
@@ -51,7 +51,7 @@ class EmbeddingModelRunner(
                          lora_config=lora_config,
                          kv_cache_dtype=kv_cache_dtype,
                          is_driver_worker=is_driver_worker,
-                         vision_language_config=vision_language_config)
+                         multimodal_config=multimodal_config)
 
     @torch.inference_mode()
     def execute_model(
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 530c631d5..02927c3ca 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -24,8 +24,8 @@ except ImportError:
 
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
-                         VisionLanguageConfig)
+                         ModelConfig, MultiModalConfig, ParallelConfig,
+                         SchedulerConfig)
 from vllm.distributed import get_pp_group
 from vllm.distributed.parallel_state import graph_capture
 from vllm.inputs import INPUT_REGISTRY
@@ -36,7 +36,8 @@ from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
-from vllm.model_executor.models.interfaces import supports_lora
+from vllm.model_executor.models.interfaces import (supports_lora,
+                                                   supports_vision)
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
                              MultiModalInputs)
 from vllm.sampling_params import SamplingParams
@@ -171,7 +172,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         lora_config: Optional[LoRAConfig],
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
-        vision_language_config: Optional[VisionLanguageConfig] = None,
+        multimodal_config: Optional[MultiModalConfig] = None,
         return_hidden_states: bool = False,
     ):
         self.model_config = model_config
@@ -182,7 +183,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         self.lora_config = lora_config
         self.load_config = load_config
         self.is_driver_worker = is_driver_worker
-        self.vision_language_config = vision_language_config
+        self.multimodal_config = multimodal_config
         self.return_hidden_states = return_hidden_states
 
         self.device = self.device_config.device
@@ -244,7 +245,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                 device_config=self.device_config,
                 load_config=self.load_config,
                 lora_config=self.lora_config,
-                vision_language_config=self.vision_language_config,
+                multimodal_config=self.multimodal_config,
                 parallel_config=self.parallel_config,
                 scheduler_config=self.scheduler_config,
                 cache_config=self.cache_config,
@@ -256,6 +257,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
 
         if self.lora_config:
             assert supports_lora(self.model), "Model does not support LoRA"
+            assert not supports_vision(
+                self.model
+            ), "To be tested: vision language model with LoRA settings."
 
             self.lora_manager = LRUCacheWorkerLoRAManager(
                 self.scheduler_config.max_num_seqs,
@@ -804,12 +808,14 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         # the number of seqs (batch_size) is chosen to maximize the number
         # of images processed.
         model_config = self.model_config
-        vlm_config = self.vision_language_config
 
-        if vlm_config:
-            max_num_seqs = min(
-                max_num_seqs,
-                int(max_num_batched_tokens / vlm_config.image_feature_size))
+        if supports_vision(self.model):
+            max_num_seqs = max(
+                1,
+                min(
+                    max_num_seqs,
+                    int(max_num_batched_tokens /
+                        MULTIMODAL_REGISTRY.get_num_input_tokens())))
         batch_size = 0
         for group_id in range(max_num_seqs):
             seq_len = (max_num_batched_tokens // max_num_seqs +
diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py
index f06404888..6281cec09 100644
--- a/vllm/worker/openvino_model_runner.py
+++ b/vllm/worker/openvino_model_runner.py
@@ -7,8 +7,8 @@ from torch import nn
 from vllm.attention import get_attn_backend
 from vllm.attention.backends.openvino import OpenVINOAttentionMetadata
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
-                         VisionLanguageConfig)
+                         ModelConfig, MultiModalConfig, ParallelConfig,
+                         SchedulerConfig)
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader.openvino import get_model
@@ -48,7 +48,7 @@ class OpenVINOModelRunner:
         cache_config: CacheConfig,
         load_config: LoadConfig,
         lora_config: Optional[LoRAConfig],
-        vision_language_config: Optional[VisionLanguageConfig],
+        multimodal_config: Optional[MultiModalConfig],
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
         *args,
@@ -60,7 +60,7 @@ class OpenVINOModelRunner:
         self.device_config = device_config
         self.cache_config = cache_config
         self.lora_config = lora_config
-        self.vision_language_config = vision_language_config
+        self.multimodal_config = multimodal_config
         self.load_config = load_config
         self.is_driver_worker = is_driver_worker
 
diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py
index 7a462ce5d..8ac6f1704 100644
--- a/vllm/worker/openvino_worker.py
+++ b/vllm/worker/openvino_worker.py
@@ -7,8 +7,8 @@ import torch.distributed
 
 from vllm.attention import get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
-                         VisionLanguageConfig)
+                         ModelConfig, MultiModalConfig, ParallelConfig,
+                         SchedulerConfig)
 from vllm.distributed import (broadcast_tensor_dict,
                               ensure_model_parallel_initialized,
                               init_distributed_environment)
@@ -148,7 +148,7 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase):
         rank: int,
         distributed_init_method: str,
         lora_config: Optional[LoRAConfig] = None,
-        vision_language_config: Optional[VisionLanguageConfig] = None,
+        multimodal_config: Optional[MultiModalConfig] = None,
         kv_cache_dtype: Optional[ov.Type] = ov.Type.undefined,
         is_driver_worker: bool = False,
     ) -> None:
@@ -162,7 +162,7 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase):
         self.rank = rank
         self.distributed_init_method = distributed_init_method
         self.lora_config = lora_config
-        self.vision_language_config = vision_language_config
+        self.multimodal_config = multimodal_config
         self.is_driver_worker = is_driver_worker
         if self.is_driver_worker:
             assert self.rank == 0, "The driver worker must have rank 0."
@@ -180,7 +180,7 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase):
             cache_config,
             load_config=self.load_config,
             lora_config=self.lora_config,
-            vision_language_config=self.vision_language_config,
+            multimodal_config=self.multimodal_config,
             kv_cache_dtype=kv_cache_dtype,
             is_driver_worker=is_driver_worker,
         )
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index 4ea8e62cc..e4a96c073 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -8,7 +8,7 @@ import torch_xla.core.xla_model as xm
 
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, VisionLanguageConfig)
+                         MultiModalConfig, ParallelConfig, SchedulerConfig)
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.sampling_metadata import SamplingMetadata
@@ -39,7 +39,7 @@ class TPUModelRunner:
         device_config: DeviceConfig,
         cache_config: CacheConfig,
         load_config: LoadConfig,
-        vision_language_config: Optional[VisionLanguageConfig] = None,
+        multimodal_config: Optional[MultiModalConfig] = None,
         is_driver_worker: bool = False,
     ):
         self.model_config = model_config
@@ -48,7 +48,7 @@ class TPUModelRunner:
         self.device_config = device_config
         self.cache_config = cache_config
         self.load_config = load_config
-        self.vision_language_config = vision_language_config
+        self.multimodal_config = multimodal_config
         self.is_driver_worker = is_driver_worker
 
         self.block_size = self.cache_config.block_size
@@ -82,7 +82,7 @@ class TPUModelRunner:
             parallel_config=self.parallel_config,
             cache_config=self.cache_config,
             scheduler_config=self.scheduler_config,
-            vision_language_config=self.vision_language_config,
+            multimodal_config=self.multimodal_config,
             lora_config=None,
         )
         xm.wait_device_ops()
diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index d58c7dc99..30725473a 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -8,7 +8,7 @@ import torch_xla.runtime as xr
 
 import vllm.envs as envs
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, VisionLanguageConfig)
+                         MultiModalConfig, ParallelConfig, SchedulerConfig)
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.logger import init_logger
@@ -31,7 +31,7 @@ class TPUWorker(LoraNotSupportedWorkerBase):
         device_config: DeviceConfig,
         cache_config: CacheConfig,
         load_config: LoadConfig,
-        vision_language_config: Optional[VisionLanguageConfig],
+        multimodal_config: Optional[MultiModalConfig],
         local_rank: int,
         rank: int,
         distributed_init_method: str,
@@ -43,7 +43,7 @@ class TPUWorker(LoraNotSupportedWorkerBase):
         self.device_config = device_config
         self.cache_config = cache_config
         self.load_config = load_config
-        self.vision_language_config = vision_language_config
+        self.multimodal_config = multimodal_config
         self.local_rank = local_rank
         self.rank = rank
         self.distributed_init_method = distributed_init_method
@@ -62,7 +62,7 @@ class TPUWorker(LoraNotSupportedWorkerBase):
                                            device_config,
                                            cache_config,
                                            load_config,
-                                           vision_language_config,
+                                           multimodal_config,
                                            is_driver_worker=is_driver_worker)
 
     def init_device(self) -> None:
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index b25f29f48..26a176be4 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -7,8 +7,8 @@ import torch
 import torch.distributed
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
-                         SpeculativeConfig, VisionLanguageConfig)
+                         ModelConfig, MultiModalConfig, ParallelConfig,
+                         SchedulerConfig, SpeculativeConfig)
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment,
                               set_custom_all_reduce)
@@ -43,7 +43,7 @@ class Worker(LocalOrDistributedWorkerBase):
         rank: int,
         distributed_init_method: str,
         lora_config: Optional[LoRAConfig] = None,
-        vision_language_config: Optional[VisionLanguageConfig] = None,
+        multimodal_config: Optional[MultiModalConfig] = None,
         speculative_config: Optional[SpeculativeConfig] = None,
         is_driver_worker: bool = False,
         model_runner_cls: Optional[Type[GPUModelRunnerBase]] = None,
@@ -66,10 +66,7 @@ class Worker(LocalOrDistributedWorkerBase):
             # note: lazy import to avoid importing torch before initializing
             from vllm.utils import init_cached_hf_modules
             init_cached_hf_modules()
-        self.vision_language_config = vision_language_config
-        if self.vision_language_config:
-            assert not self.lora_config, (
-                "To be tested: vision language model with LoRA settings.")
+        self.multimodal_config = multimodal_config
 
         # Return hidden states from target model if the draft model is an
         # mlp_speculator
@@ -94,7 +91,7 @@ class Worker(LocalOrDistributedWorkerBase):
             lora_config=self.lora_config,
             kv_cache_dtype=self.cache_config.cache_dtype,
             is_driver_worker=is_driver_worker,
-            vision_language_config=vision_language_config,
+            multimodal_config=multimodal_config,
             **speculative_args,
         )
         # Uninitialized cache engine. Will be initialized by
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index f4fc42328..c3a24c89f 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -7,12 +7,13 @@ import torch.nn as nn
 
 from vllm.attention import get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
-                         VisionLanguageConfig)
+                         ModelConfig, MultiModalConfig, ParallelConfig,
+                         SchedulerConfig)
 from vllm.distributed import broadcast_tensor_dict
 from vllm.inputs import INPUT_REGISTRY
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model
+from vllm.model_executor.models.interfaces import supports_vision
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
                              MultiModalInputs)
 from vllm.sampling_params import SamplingParams
@@ -85,7 +86,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
         cache_config: CacheConfig,
         load_config: LoadConfig,
         lora_config: Optional[LoRAConfig],
-        vision_language_config: Optional[VisionLanguageConfig],
+        multimodal_config: Optional[MultiModalConfig],
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
         *args,
@@ -97,7 +98,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
         self.lora_config = lora_config
         self.load_config = load_config
         self.cache_config = cache_config
-        self.vision_language_config = vision_language_config
+        self.multimodal_config = multimodal_config
         self.is_driver_worker = is_driver_worker
 
         self.sliding_window = model_config.get_sliding_window()
@@ -134,7 +135,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
                 device_config=self.device_config,
                 load_config=self.load_config,
                 lora_config=self.lora_config,
-                vision_language_config=self.vision_language_config,
+                multimodal_config=self.multimodal_config,
                 parallel_config=self.parallel_config,
                 scheduler_config=self.scheduler_config,
                 cache_config=self.cache_config,
@@ -165,12 +166,16 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
         # the number of seqs (batch_size) is chosen to maximize the number
         # of images processed.
         model_config = self.model_config
-        vlm_config = self.vision_language_config
 
-        if vlm_config:
-            max_num_seqs = min(
-                max_num_seqs,
-                int(max_num_batched_tokens / vlm_config.image_feature_size))
+        if supports_vision(self.model):
+            # TODO: properly inject these numbers from MultiModalRegistry.
+            # Right now, just use an overly conservative number.
+            max_num_seqs = max(
+                1,
+                min(
+                    max_num_seqs,
+                    int(max_num_batched_tokens /
+                        MULTIMODAL_REGISTRY.get_num_input_tokens())))
 
         for group_id in range(max_num_seqs):
             seq_len = (max_num_batched_tokens // max_num_seqs +
diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
index 7a51f2b2c..a946eb624 100644
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -9,8 +9,8 @@ import torch
 import torch.distributed
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
-                         SpeculativeConfig, VisionLanguageConfig)
+                         ModelConfig, MultiModalConfig, ParallelConfig,
+                         SchedulerConfig, SpeculativeConfig)
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.logger import init_logger
@@ -45,7 +45,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
         rank: int,
         distributed_init_method: str,
         lora_config: Optional[LoRAConfig] = None,
-        vision_language_config: Optional[VisionLanguageConfig] = None,
+        multimodal_config: Optional[MultiModalConfig] = None,
         speculative_config: Optional[SpeculativeConfig] = None,
         is_driver_worker: bool = False,
     ) -> None:
@@ -66,10 +66,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
         if self.is_driver_worker:
             assert self.rank == 0, "The driver worker must have rank 0."
 
-        self.vision_language_config = vision_language_config
-        if self.vision_language_config:
-            assert not self.lora_config, (
-                "To be tested: vision language model with LoRA settings.")
+        self.multimodal_config = multimodal_config
 
         self.model_runner = XPUModelRunner(  # type: ignore
             model_config,
@@ -81,7 +78,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
             lora_config=self.lora_config,
             kv_cache_dtype=self.cache_config.cache_dtype,
             is_driver_worker=is_driver_worker,
-            vision_language_config=vision_language_config,
+            multimodal_config=multimodal_config,
         )
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.
-- 
GitLab


From 62963d129e84d0a0904ee62dbab067a29216e7bf Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Wed, 3 Jul 2024 18:50:08 -0400
Subject: [PATCH 255/376] [ Misc ] Clean Up `CompressedTensorsW8A8` (#6113)

---
 tests/quantization/test_compressed_tensors.py |  9 ++--
 .../compressed_tensors/compressed_tensors.py  | 11 ++---
 .../compressed_tensors/schemes/__init__.py    |  5 +-
 .../schemes/compressed_tensors_w8a8.py        | 34 +++++++++++++-
 .../compressed_tensors_w8a8_dynamictoken.py   | 33 -------------
 .../compressed_tensors_w8a8_statictensor.py   | 47 -------------------
 6 files changed, 44 insertions(+), 95 deletions(-)
 delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
 delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py

diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index d5472f97a..4cdda97dc 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -9,8 +9,7 @@ import torch
 from vllm import SamplingParams
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
     CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
-    CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor,
-    CompressedTensorsWNA16)
+    CompressedTensorsW8A8, CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     QuantizationType)
 
@@ -38,9 +37,10 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
                           CompressedTensorsLinearMethod)
         assert isinstance(down_proj.quant_method,
                           CompressedTensorsLinearMethod)
-        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor)
+        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8)
 
         assert qkv_proj.scheme.strategy == strategy
+        assert qkv_proj.scheme.is_static_input_scheme
         expected_type = (torch.int8 if quant_type == QuantizationType.INT else
                          torch.float8_e4m3fn)
 
@@ -79,7 +79,8 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args):
         qkv_proj = layer.self_attn.qkv_proj
 
         assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
-        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken)
+        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8)
+        assert not qkv_proj.scheme.is_static_input_scheme
         assert qkv_proj.scheme.strategy == strategy
         assert qkv_proj.weight.dtype is torch.int8
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index e88bbc361..8ca486d95 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -9,8 +9,7 @@ from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS,
     CompressedTensorsScheme, CompressedTensorsW4A16Sparse24,
-    CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor,
-    CompressedTensorsWNA16)
+    CompressedTensorsW8A8, CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     CompressionFormat, QuantizationArgs, QuantizationStrategy,
     find_first_name_or_class_match)
@@ -150,12 +149,12 @@ class CompressedTensorsConfig(QuantizationConfig):
 
         if self.quant_format == CompressionFormat.int_quantized.value:
             if self._is_static_tensor_w8a8(weight_quant, input_quant):
-                return CompressedTensorsW8A8StaticTensor(
-                    strategy=weight_quant.strategy)
+                return CompressedTensorsW8A8(strategy=weight_quant.strategy,
+                                             is_static_input_scheme=True)
 
             if self._is_dynamic_token_w8a8(weight_quant, input_quant):
-                return CompressedTensorsW8A8DynamicToken(
-                    strategy=weight_quant.strategy)
+                return CompressedTensorsW8A8(strategy=weight_quant.strategy,
+                                             is_static_input_scheme=False)
 
         raise NotImplementedError(
             "No compressed-tensors compatible scheme was found.")
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
index f6d20ce2c..720b8c263 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -3,9 +3,6 @@ from .compressed_tensors_unquantized import (  # noqa: F401
     CompressedTensorsUnquantized)
 from .compressed_tensors_w4a16_24 import (  # noqa: F401
     W4A16SPARSE24_SUPPORTED_BITS, CompressedTensorsW4A16Sparse24)
-from .compressed_tensors_w8a8_dynamictoken import (  # noqa: F401, E501
-    CompressedTensorsW8A8DynamicToken)
-from .compressed_tensors_w8a8_statictensor import (  # noqa: F401, E501
-    CompressedTensorsW8A8StaticTensor)
+from .compressed_tensors_w8a8 import CompressedTensorsW8A8  # noqa: F401
 from .compressed_tensors_wNa16 import WNA16_SUPPORTED_BITS  # noqa: F401
 from .compressed_tensors_wNa16 import CompressedTensorsWNA16  # noqa: F401
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py
index 497790576..dffe2a284 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py
@@ -3,6 +3,7 @@ from typing import Callable, List, Tuple, Union
 import torch
 from torch.nn import Parameter
 
+from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
@@ -12,8 +13,9 @@ from vllm.model_executor.utils import set_weight_attrs
 
 class CompressedTensorsW8A8(CompressedTensorsScheme):
 
-    def __init__(self, strategy: str):
+    def __init__(self, strategy: str, is_static_input_scheme: bool):
         self.strategy = strategy
+        self.is_static_input_scheme = is_static_input_scheme
 
     # Cutlass kernels support only per-tensor and per-channel cases.
     # So if we have a fused module (QKV, MLP) with per tensor scales (thus N
@@ -36,6 +38,10 @@ class CompressedTensorsW8A8(CompressedTensorsScheme):
             layer.weight_scale = Parameter(weight_scale_channel,
                                            requires_grad=False)
 
+        # transpose weights for cutlass.
+        weight = layer.weight
+        layer.weight = Parameter(weight.t(), requires_grad=False)
+
     def create_weights(self, layer: torch.nn.Module,
                        output_partition_sizes: List[int],
                        input_size_per_partition: int,
@@ -75,3 +81,29 @@ class CompressedTensorsW8A8(CompressedTensorsScheme):
             "output_dim": 0,
             "weight_loader": weight_loader,
         })
+
+        # INPUT SCALE
+        # Static quantization:  load from disk.
+        if self.is_static_input_scheme:
+            input_scale = Parameter(torch.empty(1, dtype=torch.float32),
+                                    requires_grad=False)
+            layer.register_parameter("input_scale", input_scale)
+            set_weight_attrs(input_scale, {
+                "weight_loader": weight_loader,
+                "ignore_warning": True,
+            })
+        # Dynamic quantization: set to None.
+        else:
+            layer.input_scale = None
+
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
+        # ops.scaled_int8_quant supports both dynamic and static quant.
+        # * dynamic, layer.input_scale is None and x_scale computed from x.
+        # * static, layer.input_scale is scalar and x_scale is input_scale.
+        x_q, x_scale = ops.scaled_int8_quant(x, layer.input_scale)
+
+        return ops.cutlass_scaled_mm(x_q,
+                                     layer.weight,
+                                     scale_a=x_scale,
+                                     scale_b=layer.weight_scale,
+                                     out_dtype=x.dtype)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
deleted file mode 100644
index 5fc05b8e6..000000000
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from typing import Callable, List
-
-import torch
-
-from vllm import _custom_ops as custom_ops
-from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_w8a8 import (  # noqa: E501
-    CompressedTensorsW8A8)
-
-__all__ = ["CompressedTensorsW8A8DynamicToken"]
-
-
-class CompressedTensorsW8A8DynamicToken(CompressedTensorsW8A8):
-
-    def create_weights(self, layer: torch.nn.Module,
-                       output_partition_sizes: List[int],
-                       input_size_per_partition: int,
-                       params_dtype: torch.dtype, weight_loader: Callable,
-                       **kwargs):
-
-        super().create_weights(
-            layer=layer,
-            output_partition_sizes=output_partition_sizes,
-            input_size_per_partition=input_size_per_partition,
-            params_dtype=params_dtype,
-            weight_loader=weight_loader)
-
-    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
-        weight = layer.weight
-        weight_scale = layer.weight_scale
-
-        x_q, input_scales = custom_ops.scaled_int8_quant(x)
-        return custom_ops.cutlass_scaled_mm(x_q, weight.t(), input_scales,
-                                            weight_scale, x.dtype)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
deleted file mode 100644
index 79f5358a3..000000000
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
+++ /dev/null
@@ -1,47 +0,0 @@
-from typing import Callable, List
-
-import torch
-from torch.nn import Parameter
-
-from vllm import _custom_ops as custom_ops
-from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_w8a8 import (  # noqa: E501
-    CompressedTensorsW8A8)
-from vllm.model_executor.utils import set_weight_attrs
-
-__all__ = ["CompressedTensorsW8A8StaticTensor"]
-
-
-class CompressedTensorsW8A8StaticTensor(CompressedTensorsW8A8):
-
-    def create_weights(self, layer: torch.nn.Module,
-                       output_partition_sizes: List[int],
-                       input_size_per_partition: int,
-                       params_dtype: torch.dtype, weight_loader: Callable,
-                       **kwargs):
-
-        super().create_weights(
-            layer=layer,
-            output_partition_sizes=output_partition_sizes,
-            input_size_per_partition=input_size_per_partition,
-            params_dtype=params_dtype,
-            weight_loader=weight_loader)
-
-        input_scale = Parameter(torch.empty(1, dtype=torch.float32),
-                                requires_grad=False)
-
-        layer.register_parameter("input_scale", input_scale)
-        set_weight_attrs(input_scale, {
-            "weight_loader": weight_loader,
-            "ignore_warning": True,
-        })
-
-    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
-        weight = layer.weight
-        weight_scale = layer.weight_scale
-        act_scale = layer.input_scale
-
-        # Input quantize
-        x_q, _ = custom_ops.scaled_int8_quant(x, act_scale)
-
-        return custom_ops.cutlass_scaled_mm(x_q, weight.t(), act_scale,
-                                            weight_scale, x.dtype)
-- 
GitLab


From 966fe72141e8365721840b7ababfb78601c23ead Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 3 Jul 2024 15:52:04 -0700
Subject: [PATCH 256/376] [doc][misc] bump up py version in installation doc
 (#6119)

---
 docs/source/getting_started/installation.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
index ba23e7468..d458b0235 100644
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -20,7 +20,7 @@ You can install vLLM using pip:
 .. code-block:: console
 
     $ # (Recommended) Create a new conda environment.
-    $ conda create -n myenv python=3.9 -y
+    $ conda create -n myenv python=3.10 -y
     $ conda activate myenv
 
     $ # Install vLLM with CUDA 12.1.
@@ -35,7 +35,7 @@ You can install vLLM using pip:
 
         $ # Install vLLM with CUDA 11.8.
         $ export VLLM_VERSION=0.4.0
-        $ export PYTHON_VERSION=39
+        $ export PYTHON_VERSION=310
         $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
 
     In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
-- 
GitLab


From 3de6e6a30e33406c2b3bb81fb5a82a2966cebd87 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 3 Jul 2024 16:40:31 -0700
Subject: [PATCH 257/376] [core][distributed] support n layers % pp size != 0
 (#6115)

---
 .buildkite/test-pipeline.yaml  |  1 +
 vllm/config.py                 | 15 ++++++---------
 vllm/distributed/utils.py      |  9 ++++++++-
 vllm/worker/openvino_worker.py |  1 +
 vllm/worker/tpu_worker.py      |  1 +
 vllm/worker/worker.py          |  1 +
 vllm/worker/xpu_worker.py      |  1 +
 7 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d127278aa..3680bfdde 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -80,6 +80,7 @@ steps:
   commands:
   - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
   - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
+  - TP_SIZE=1 PP_SIZE=3 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
   - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
   - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
 
diff --git a/vllm/config.py b/vllm/config.py
index 0004622ce..1ea288879 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -265,8 +265,6 @@ class ModelConfig:
                 " must be divisible by tensor parallel size "
                 f"({tensor_parallel_size}).")
 
-        total_num_hidden_layers = getattr(self.hf_text_config,
-                                          "num_hidden_layers", 0)
         pipeline_parallel_size = parallel_config.pipeline_parallel_size
         architectures = getattr(self.hf_config, "architectures", [])
         if not all(arch in _PP_SUPPORTED_MODELS
@@ -275,12 +273,6 @@ class ModelConfig:
                 "Pipeline parallelism is only supported for the following "
                 f" architectures: {_PP_SUPPORTED_MODELS}.")
 
-        if total_num_hidden_layers % pipeline_parallel_size != 0:
-            raise ValueError(
-                f"Total number of hidden layers ({total_num_hidden_layers}) "
-                "must be divisible by pipeline parallel size "
-                f"({pipeline_parallel_size}).")
-
         if self.quantization == "bitsandbytes" and (
                 parallel_config.tensor_parallel_size > 1
                 or parallel_config.pipeline_parallel_size > 1):
@@ -385,9 +377,13 @@ class ModelConfig:
         return num_heads // parallel_config.tensor_parallel_size
 
     def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
+        from vllm.distributed.utils import get_pp_indices
         total_num_hidden_layers = getattr(self.hf_text_config,
                                           "num_hidden_layers", 0)
-        return total_num_hidden_layers // parallel_config.pipeline_parallel_size
+        pp_rank = parallel_config.rank // parallel_config.tensor_parallel_size
+        pp_size = parallel_config.pipeline_parallel_size
+        start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size)
+        return end - start
 
     def contains_seqlen_agnostic_layers(
             self, parallel_config: "ParallelConfig") -> bool:
@@ -709,6 +705,7 @@ class ParallelConfig:
                 {"CUDA_VISIBLE_DEVICES": envs.CUDA_VISIBLE_DEVICES})
 
         self._verify_args()
+        self.rank = 0
 
     def _verify_args(self) -> None:
         if (self.pipeline_parallel_size > 1
diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index 4e4206e58..b5cf6c45f 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -50,8 +50,15 @@ def split_tensor_along_last_dim(
 
 def get_pp_indices(num_hidden_layers: int, pp_rank: int,
                    pp_size: int) -> Tuple[int, int]:
-    layers_per_partition = divide(num_hidden_layers, pp_size)
+    """Try to evenly distribute layers across partitions.
+    If the number of layers is not divisible by the number of partitions,
+    the last partition will have the remaining layers.
+    """
+    layers_per_partition = num_hidden_layers // pp_size
     start_layer = pp_rank * layers_per_partition
     end_layer = start_layer + layers_per_partition
 
+    if pp_rank == pp_size - 1:
+        end_layer = num_hidden_layers
+
     return (start_layer, end_layer)
diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py
index 8ac6f1704..c47f9acc4 100644
--- a/vllm/worker/openvino_worker.py
+++ b/vllm/worker/openvino_worker.py
@@ -154,6 +154,7 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase):
     ) -> None:
         self.model_config = model_config
         self.parallel_config = parallel_config
+        self.parallel_config.rank = rank
         self.scheduler_config = scheduler_config
         self.device_config = device_config
         self.cache_config = cache_config
diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index 30725473a..60fee9892 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -39,6 +39,7 @@ class TPUWorker(LoraNotSupportedWorkerBase):
     ) -> None:
         self.model_config = model_config
         self.parallel_config = parallel_config
+        self.parallel_config.rank = rank
         self.scheduler_config = scheduler_config
         self.device_config = device_config
         self.cache_config = cache_config
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 26a176be4..58707269b 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -50,6 +50,7 @@ class Worker(LocalOrDistributedWorkerBase):
     ) -> None:
         self.model_config = model_config
         self.parallel_config = parallel_config
+        self.parallel_config.rank = rank
         self.scheduler_config = scheduler_config
         self.device_config = device_config
         self.cache_config = cache_config
diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
index a946eb624..94dfcfec3 100644
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -54,6 +54,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
 
         self.model_config = model_config
         self.parallel_config = parallel_config
+        self.parallel_config.rank = rank
         self.scheduler_config = scheduler_config
         self.device_config = device_config
         self.cache_config = cache_config
-- 
GitLab


From 1dab9bc8a9192a6081821c3a6b6c4aee3b7912c3 Mon Sep 17 00:00:00 2001
From: Travis Johnson <tsjohnso@us.ibm.com>
Date: Wed, 3 Jul 2024 17:56:59 -0600
Subject: [PATCH 258/376] [Bugfix] set OMP_NUM_THREADS to 1 by default for
 multiprocessing (#6109)

Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
---
 vllm/executor/multiproc_gpu_executor.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py
index ae5062bd6..dcde27973 100644
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/multiproc_gpu_executor.py
@@ -37,6 +37,11 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
         # Disable torch async compiling which won't work with daemonic processes
         os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 
+        # Set OMP_NUM_THREADS to 1 if it is not set explicitly, avoids CPU
+        # contention amongst the shards
+        if "OMP_NUM_THREADS" not in os.environ:
+            os.environ["OMP_NUM_THREADS"] = "1"
+
         assert world_size <= cuda_device_count_stateless(), (
             "please set tensor_parallel_size to less than max local gpu count")
 
-- 
GitLab


From 0ed646b7aa3b434b2fcb6f6b6e725570879cb89e Mon Sep 17 00:00:00 2001
From: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com>
Date: Wed, 3 Jul 2024 17:52:29 -0700
Subject: [PATCH 259/376] [Distributed][Core] Support Py39 and Py38 for PP
 (#6120)

Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
---
 vllm/executor/executor_base.py    | 7 +------
 vllm/executor/ray_gpu_executor.py | 9 +++++++++
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 2abb29c14..fc18dec0b 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -123,12 +123,7 @@ class ExecutorAsyncBase(ExecutorBase):
         multimodal_config: Optional[MultiModalConfig],
         speculative_config: Optional[SpeculativeConfig],
     ) -> None:
-        # This locks each pipeline parallel stage so multiple virtual engines
-        # can't execute on the same stage at the same time
-        self.pp_locks = [
-            asyncio.Lock()
-            for _ in range(parallel_config.pipeline_parallel_size)
-        ]
+        self.pp_locks: Optional[List[asyncio.Lock]] = None
 
         super().__init__(model_config, cache_config, parallel_config,
                          scheduler_config, device_config, load_config,
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index e0b9441a9..bc7ef9cc7 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -349,6 +349,15 @@ class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync):
         self,
         execute_model_req: Optional[ExecuteModelRequest] = None
     ) -> List[SamplerOutput]:
+        if self.pp_locks is None:
+            # This locks each pipeline parallel stage so multiple virtual
+            # engines can't execute on the same stage at the same time
+            # We create the locks here to avoid creating them in the constructor
+            # which uses a different asyncio loop.
+            self.pp_locks = [
+                asyncio.Lock()
+                for _ in range(self.parallel_config.pipeline_parallel_size)
+            ]
 
         async def _run_task_with_lock(task, lock, *args, **kwargs):
             async with lock:
-- 
GitLab


From 3dd507083f4d8416d5fed9827e91d22f29b0b723 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 4 Jul 2024 09:58:18 +0800
Subject: [PATCH 260/376] [CI/Build] Cleanup VLM tests (#6107)

---
 tests/models/test_llava_next.py | 7 ++-----
 tests/models/test_phi3v.py      | 3 +--
 tests/models/utils.py           | 1 +
 vllm/multimodal/image.py        | 2 +-
 4 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index bf911b5c6..581cbcf90 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -1,4 +1,3 @@
-import re
 from typing import List, Optional, Tuple
 
 import pytest
@@ -36,7 +35,6 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
     output_ids, output_str, out_logprobs = vllm_output
 
     tokenizer = AutoTokenizer.from_pretrained(model)
-    image_token_str = tokenizer.decode(IMAGE_TOKEN_ID)
     eos_token_id = tokenizer.eos_token_id
 
     hf_output_ids = [
@@ -44,9 +42,8 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
         if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID
     ]
 
-    hf_output_str = re.sub(fr"({image_token_str})+", "", output_str)
-    assert hf_output_str[0] == " "
-    hf_output_str = hf_output_str[1:]
+    assert output_str[0] == " "
+    hf_output_str = output_str[1:]
     if hf_output_ids[-1] == eos_token_id:
         hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
 
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index cb32a047a..faadab224 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -35,8 +35,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
     assert output_str_without_image[0] == " "
     output_str_without_image = output_str_without_image[1:]
 
-    hf_output_str = output_str_without_image.replace("<|user|>", "") \
-        .replace("<|end|>\n<|assistant|>", " ")
+    hf_output_str = output_str_without_image + "<|end|><|endoftext|>"
 
     tokenizer = AutoTokenizer.from_pretrained(model)
     hf_output_ids = tokenizer.encode(output_str_without_image)
diff --git a/tests/models/utils.py b/tests/models/utils.py
index 51d57129d..425f57ef9 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -77,6 +77,7 @@ def check_logprobs_close(
                 # Each predicted token must be in top N logprobs of the other
                 fail_msg = (
                     f"Test{prompt_idx}:"
+                    f"\nMatched tokens:\t{output_ids_0[:idx]}"
                     f"\n{name_0}:\t{output_str_0!r}\t{logprobs_elem_0}"
                     f"\n{name_1}:\t{output_str_1!r}\t{logprobs_elem_1}")
 
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index dfef33121..27010fa6e 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -115,7 +115,7 @@ class ImagePlugin(MultiModalPlugin):
         if isinstance(data, Image.Image):
             image_processor = self._get_hf_image_processor(model_config)
             if image_processor is None:
-                raise RuntimeError("No HuggingFace processor is available"
+                raise RuntimeError("No HuggingFace processor is available "
                                    "to process the image object")
             try:
                 batch_data = image_processor \
-- 
GitLab


From 56b325e977435af744f8b3dca7af0ca209663558 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Date: Thu, 4 Jul 2024 01:19:38 -0400
Subject: [PATCH 261/376] [ROCm][AMD][Model]Adding alibi slopes support in ROCm
 triton flash attention and naive flash attention (#6043)

Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
---
 vllm/attention/backends/rocm_flash_attn.py | 53 +++++++++++++++++++++-
 1 file changed, 51 insertions(+), 2 deletions(-)

diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 81fabdbdf..31ae07514 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -166,6 +166,37 @@ class ROCmFlashAttentionMetadata(AttentionMetadata, PagedAttentionMetadata):
         return self._cached_decode_metadata
 
 
+def _make_alibi_bias(alibi_slopes: torch.Tensor,
+                     dtype: torch.dtype,
+                     seq_lens: Optional[List[int]],
+                     make_attn_mask: bool = True) -> List[torch.Tensor]:
+    attn_biases = []
+    if seq_lens:
+        for seq_len in seq_lens:
+            bias = torch.arange(seq_len, dtype=dtype)
+            # NOTE(zhuohan): HF uses
+            #     `bias = bias[None, :].repeat(seq_len, 1)`
+            # here. We find that both biases give the same results, but
+            # the bias below more accurately follows the original ALiBi
+            # paper.
+            bias = bias[None, :] - bias[:, None]
+
+            num_heads = alibi_slopes.shape[0]
+            bias = bias[None, :].repeat(
+                (num_heads, 1, 1)).to(alibi_slopes.device)
+            bias.mul_(alibi_slopes[:, None, None])
+            if make_attn_mask:
+                inf_mask = torch.empty(
+                    (1, seq_len, seq_len),
+                    dtype=bias.dtype).fill_(-torch.inf).triu_(diagonal=1).to(
+                        alibi_slopes.device)
+                attn_biases.append((bias + inf_mask).to(dtype))
+            else:
+                attn_biases.append(bias.to(dtype))
+
+    return attn_biases
+
+
 class ROCmFlashAttentionImpl(AttentionImpl):
     """
     If the input tensors contain prompt tokens, the layout is as follows:
@@ -324,7 +355,14 @@ class ROCmFlashAttentionImpl(AttentionImpl):
                 # triton attention
                 # When block_tables are not filled, it means q and k are the
                 # prompt, and they have the same length.
+                attn_masks = None
                 if self.use_triton_flash_attn:
+                    if self.alibi_slopes is not None:
+                        attn_masks = _make_alibi_bias(
+                            self.alibi_slopes,
+                            query.dtype,
+                            attn_metadata.seq_lens,
+                            make_attn_mask=False)  # type: ignore
                     out, _ = self.attn_func(
                         query,
                         key,
@@ -336,12 +374,20 @@ class ROCmFlashAttentionImpl(AttentionImpl):
                         prefill_meta.max_prefill_seq_len,
                         True,
                         self.scale,
+                        attn_masks[0][None]
+                        if attn_masks is not None else None,
                     )
                 elif self.use_naive_attn:
                     if self.num_kv_heads != self.num_heads:
                         # Interleave for MQA workaround.
                         key = self.repeat_kv(key, self.num_queries_per_kv)
                         value = self.repeat_kv(value, self.num_queries_per_kv)
+                    if self.alibi_slopes is not None:
+                        attn_masks = _make_alibi_bias(
+                            self.alibi_slopes,
+                            query.dtype,
+                            attn_metadata.seq_lens,
+                            make_attn_mask=True)  # type: ignore
                     query = query.movedim(0, query.dim() - 2)
                     key = key.movedim(0, key.dim() - 2)
                     value = value.movedim(0, value.dim() - 2)
@@ -355,6 +401,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
                         self.num_heads,
                         self.head_size,
                         self.scale,
+                        attn_masks,
                     )
                 else:
                     out = self.attn_func(
@@ -418,13 +465,14 @@ def _sdpa_attention(
     num_heads: int,
     head_size: int,
     scale: float,
+    attn_masks: Optional[List[torch.Tensor]] = None,
 ) -> torch.Tensor:
     start = 0
     output = torch.empty((num_tokens, num_heads, head_size),
                          dtype=query.dtype,
                          device=query.device)
 
-    for seq_len in seq_lens:
+    for i, seq_len in enumerate(seq_lens):
         end = start + seq_len
         with torch.backends.cuda.sdp_kernel(enable_math=True,
                                             enable_flash=False,
@@ -434,7 +482,8 @@ def _sdpa_attention(
                 key[:, start:end, :],
                 value[:, start:end, :],
                 dropout_p=0.0,
-                is_causal=True,
+                is_causal=attn_masks is None,
+                attn_mask=attn_masks[i] if attn_masks else None,
                 scale=scale).movedim(query.dim() - 2, 0)
             output[start:end, :, :] = sub_out
             start = end
-- 
GitLab


From 27902d42beeeb5828ef3243d5455a3b9af3317b3 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 4 Jul 2024 09:57:09 -0700
Subject: [PATCH 262/376] [misc][doc] try to add warning for latest html
 (#5979)

---
 docs/source/_templates/sections/header.html | 38 +++++++++++++++++++++
 docs/source/conf.py                         |  8 +++++
 2 files changed, 46 insertions(+)
 create mode 100644 docs/source/_templates/sections/header.html

diff --git a/docs/source/_templates/sections/header.html b/docs/source/_templates/sections/header.html
new file mode 100644
index 000000000..cd5c4053e
--- /dev/null
+++ b/docs/source/_templates/sections/header.html
@@ -0,0 +1,38 @@
+<style>
+  .notification-bar {
+    width: 100vw;
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    font-size: 16px;
+  }
+  .notification-bar p {
+    margin: 0;
+  }
+  .notification-bar a {
+    font-weight: bold;
+    text-decoration: none;
+  }
+
+  /* Light mode styles (default) */
+  .notification-bar {
+    background-color: #fff3cd;
+    color: #856404;
+  }
+  .notification-bar a {
+    color: #d97706;
+  }
+
+  /* Dark mode styles */
+  html[data-theme=dark] .notification-bar {
+    background-color: #333;
+    color: #ddd;
+  }
+  html[data-theme=dark] .notification-bar a {
+    color: #ffa500; /* Brighter color for visibility */
+  }
+</style>
+
+<div class="notification-bar">
+  <p>You are viewing the latest developer preview docs. <a href="https://docs.vllm.ai/en/stable/">Click here</a> to view docs for the latest stable release.</p>
+</div>
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 7c5bb8f79..46a3bcbf1 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -69,6 +69,14 @@ html_theme_options = {
     'use_edit_page_button': True,
 }
 
+# see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa
+READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE')
+if READTHEDOCS_VERSION_TYPE == "tag":
+    # remove the warning banner if the version is a tagged release
+    header_file = os.path.join(os.path.dirname(__file__),
+                               "_templates/sections/header.html")
+    os.remove(header_file)
+
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-- 
GitLab


From 81d7a50f2402ef5b622ac8d3a081994a1a4641b0 Mon Sep 17 00:00:00 2001
From: Yuan <yuan.zhou@intel.com>
Date: Fri, 5 Jul 2024 06:22:12 +0800
Subject: [PATCH 263/376] [Hardware][Intel CPU] Adding intel openmp tunings in
 Docker file (#6008)

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/run-cpu-test.sh |  6 ++++--
 Dockerfile.cpu             | 10 ++++++++--
 vllm/utils.py              | 21 +++++++++++++++++++++
 vllm/worker/cpu_worker.py  |  5 ++++-
 4 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 9d4b2bb1c..a7678aae5 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -12,8 +12,10 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image
-docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
-docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
+  --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
+  --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2
 
 # offline inference
 docker exec cpu-test bash -c "python3 examples/offline_inference.py"
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index 6e55203de..f95d748f1 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -6,7 +6,13 @@ RUN apt-get update  -y \
     && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \
     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 
-RUN echo 'export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD' >> ~/.bashrc
+# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
+# intel-openmp provides additional performance improvement vs. openmp
+# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
+RUN pip install intel-openmp
+
+ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD"
+
 
 RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl
 
@@ -31,4 +37,4 @@ WORKDIR /workspace/
 
 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 
-CMD ["/bin/bash"]
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/vllm/utils.py b/vllm/utils.py
index 854decc29..a3d15d797 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -398,6 +398,27 @@ def update_environment_variables(envs: Dict[str, str]):
         os.environ[k] = v
 
 
+def init_kmp_env():
+    if not is_cpu():
+        return
+
+    ld_prealod_str = os.getenv("LD_PRELOAD", "")
+    if "libiomp5.so" not in ld_prealod_str:
+        return
+
+    # The time(milliseconds) that a thread should wait after completing the
+    # execution of a parallel region, before sleeping.
+    os.environ['KMP_BLOCKTIME'] = "1"
+    # dump settings on start up
+    os.environ['KMP_SETTINGS'] = "1"
+    # Prevents the CPU to run into low performance state
+    os.environ['KMP_TPAUSE'] = "0"
+    # Provides fine granularity parallelism
+    os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist"
+    os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist"
+    os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"
+
+
 def chunk_list(lst: List[T], chunk_size: int) -> List[List[T]]:
     """Yield successive chunk_size chunks from lst."""
     return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 92279753d..657505739 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -13,7 +13,7 @@ from vllm.distributed import (ensure_model_parallel_initialized,
 from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, init_kmp_env
 from vllm.worker.cpu_model_runner import CPUModelRunner
 from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
                                      LoraNotSupportedWorkerBase, WorkerInput)
@@ -150,6 +150,9 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         if self.is_driver_worker:
             assert self.rank == 0, "The driver worker must have rank 0."
 
+        # try to initialize intel openmp optimized tunings
+        init_kmp_env()
+
         if self.model_config.trust_remote_code:
             # note: lazy import to avoid importing torch before initializing
             from vllm.utils import init_cached_hf_modules
-- 
GitLab


From 69ec3ca14cf3d0b278672196ad6de9875fe95cbb Mon Sep 17 00:00:00 2001
From: Lily Liu <lilyliupku@gmail.com>
Date: Thu, 4 Jul 2024 16:35:51 -0700
Subject: [PATCH 264/376] [Kernel][Model] logits_soft_cap for Gemma2 with
 flashinfer (#6051)

Co-authored-by: Simon Mo <simon.mo@hey.com>
---
 .buildkite/test-pipeline.yaml         |   7 +-
 tests/kernels/test_flashinfer.py      | 248 ++++++++++++++++++++++++++
 vllm/attention/backends/flashinfer.py |  12 +-
 vllm/attention/selector.py            |   6 +-
 vllm/model_executor/models/gemma2.py  |   7 -
 vllm/worker/model_runner.py           |  19 +-
 6 files changed, 279 insertions(+), 20 deletions(-)
 create mode 100644 tests/kernels/test_flashinfer.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 3680bfdde..8013fbb64 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -118,12 +118,15 @@ steps:
 
 - label: Kernels Test %N
   #mirror_hardwares: [amd]
-  command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  commands:
+    - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+    - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 4
 
 - label: Models Test
   #mirror_hardwares: [amd]
   commands:
+    - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
     - pytest -v -s models -m \"not vlm\"
 
 - label: Vision Language Models Test
@@ -234,7 +237,7 @@ steps:
   - pytest -v -s distributed/test_custom_all_reduce.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.5/flashinfer-0.0.5+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
   - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - pytest -v -s -x lora/test_mixtral.py
diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/test_flashinfer.py
new file mode 100644
index 000000000..5211be6ae
--- /dev/null
+++ b/tests/kernels/test_flashinfer.py
@@ -0,0 +1,248 @@
+from typing import List, Optional, Tuple
+
+import flashinfer
+import pytest
+import torch
+
+NUM_HEADS = [(16, 16), (32, 8), (64, 8)]
+HEAD_SIZES = [128, 256]
+BLOCK_SIZES = [16, 32]
+DTYPES = [torch.float16, torch.bfloat16]
+NUM_BLOCKS = 32768  # Large enough to test overflow in index calculation.
+
+
+def ref_paged_attn(
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    query_lens: List[int],
+    kv_lens: List[int],
+    block_tables: torch.Tensor,
+    scale: float,
+    sliding_window: Optional[int] = None,
+    soft_cap: Optional[float] = None,
+) -> torch.Tensor:
+    num_seqs = len(query_lens)
+    block_tables = block_tables.cpu().numpy()
+    _, block_size, num_kv_heads, head_size = key_cache.shape
+
+    outputs: List[torch.Tensor] = []
+    start_idx = 0
+    for i in range(num_seqs):
+        query_len = query_lens[i]
+        kv_len = kv_lens[i]
+        q = query[start_idx:start_idx + query_len]
+        q *= scale
+
+        num_kv_blocks = (kv_len + block_size - 1) // block_size
+        block_indices = block_tables[i, :num_kv_blocks]
+
+        k = key_cache[block_indices].view(-1, num_kv_heads, head_size)
+        k = k[:kv_len]
+        v = value_cache[block_indices].view(-1, num_kv_heads, head_size)
+        v = v[:kv_len]
+
+        if q.shape[1] != k.shape[1]:
+            k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1)
+            v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1)
+        attn = torch.einsum("qhd,khd->hqk", q, k).float()
+        empty_mask = torch.ones(query_len, kv_len)
+        mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
+        if sliding_window is not None:
+            sliding_window_mask = torch.triu(empty_mask,
+                                             diagonal=kv_len -
+                                             (query_len + sliding_window) +
+                                             1).bool().logical_not()
+            mask |= sliding_window_mask
+        if soft_cap is not None:
+            attn = soft_cap * torch.tanh(attn / soft_cap)
+        attn.masked_fill_(mask, float("-inf"))
+        attn = torch.softmax(attn, dim=-1).to(v.dtype)
+        out = torch.einsum("hqk,khd->qhd", attn, v)
+
+        outputs.append(out)
+        start_idx += query_len
+
+    return torch.cat(outputs, dim=0)
+
+
+@pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]])
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
+@torch.inference_mode
+def test_flashinfer_decode_with_paged_kv(kv_lens: List[int],
+                                         num_heads: Tuple[int,
+                                                          int], head_size: int,
+                                         dtype: torch.dtype, block_size: int,
+                                         soft_cap: Optional[float]) -> None:
+    torch.set_default_device("cuda")
+    torch.cuda.manual_seed_all(0)
+    num_seqs = len(kv_lens)
+    num_query_heads = num_heads[0]
+    num_kv_heads = num_heads[1]
+    assert num_query_heads % num_kv_heads == 0
+    max_kv_len = max(kv_lens)
+    scale = head_size**-0.5
+
+    query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype)
+    key_value_cache = torch.randn(NUM_BLOCKS,
+                                  2,
+                                  block_size,
+                                  num_kv_heads,
+                                  head_size,
+                                  dtype=dtype)
+    key_cache = key_value_cache[:, 0, :, :, :].squeeze(1)
+    value_cache = key_value_cache[:, 1, :, :, :].squeeze(1)
+
+    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
+    block_tables = torch.randint(0,
+                                 NUM_BLOCKS,
+                                 (num_seqs, max_num_blocks_per_seq),
+                                 dtype=torch.int32)
+
+    kv_indptr = [0]
+    kv_indices = []
+    kv_last_page_lens = []
+    for i in range(num_seqs):
+        seq_len = kv_lens[i]
+        assert seq_len > 0
+        num_blocks = (seq_len + block_size - 1) // block_size
+        kv_indices.extend(block_tables[i, :num_blocks])
+        kv_indptr.append(kv_indptr[-1] + num_blocks)
+        kv_last_page_len = seq_len % block_size
+        if kv_last_page_len == 0:
+            kv_last_page_len = block_size
+        kv_last_page_lens.append(kv_last_page_len)
+
+    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
+    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
+    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
+
+    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
+    wrapper = flashinfer.\
+        BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD")
+    wrapper.begin_forward(kv_indptr,
+                          kv_indices,
+                          kv_last_page_lens,
+                          num_query_heads,
+                          num_kv_heads,
+                          head_size,
+                          block_size,
+                          "NONE",
+                          data_type=dtype)
+
+    output = wrapper.forward(query, key_value_cache, logits_soft_cap=soft_cap)
+
+    ref_output = ref_paged_attn(query=query,
+                                key_cache=key_cache,
+                                value_cache=value_cache,
+                                query_lens=[1] * num_seqs,
+                                kv_lens=kv_lens,
+                                block_tables=block_tables,
+                                scale=scale,
+                                soft_cap=soft_cap)
+    assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \
+        f"{torch.max(torch.abs(output - ref_output))}"
+
+
+@pytest.mark.parametrize("seq_lens", [[(1, 1328), (5, 18), (129, 463)]])
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
+@torch.inference_mode
+def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]],
+                                          num_heads: Tuple[int, int],
+                                          head_size: int, dtype: torch.dtype,
+                                          block_size: int,
+                                          soft_cap: Optional[float]) -> None:
+    torch.set_default_device("cuda")
+    torch.cuda.manual_seed_all(0)
+    num_seqs = len(seq_lens)
+    query_lens = [x[0] for x in seq_lens]
+    kv_lens = [x[1] for x in seq_lens]
+    num_query_heads = num_heads[0]
+    num_kv_heads = num_heads[1]
+    assert num_query_heads % num_kv_heads == 0
+    max_kv_len = max(kv_lens)
+    scale = head_size**-0.5
+
+    query = torch.randn(sum(query_lens),
+                        num_query_heads,
+                        head_size,
+                        dtype=dtype)
+    key_value_cache = torch.randn(NUM_BLOCKS,
+                                  2,
+                                  block_size,
+                                  num_kv_heads,
+                                  head_size,
+                                  dtype=dtype)
+    key_cache = key_value_cache[:, 0, :, :, :].squeeze(1)
+    value_cache = key_value_cache[:, 1, :, :, :].squeeze(1)
+
+    # Normalize the scale of the key and value caches to mitigate
+    # numerical instability.
+    key_cache /= head_size**0.5
+    value_cache /= head_size**0.5
+
+    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
+    block_tables = torch.randint(0,
+                                 NUM_BLOCKS,
+                                 (num_seqs, max_num_blocks_per_seq),
+                                 dtype=torch.int32)
+
+    qo_indptr = [0]
+    kv_indptr = [0]
+    kv_indices = []
+    kv_last_page_lens = []
+    for i in range(num_seqs):
+        seq_len = kv_lens[i]
+        assert seq_len > 0
+        num_blocks = (seq_len + block_size - 1) // block_size
+        kv_indices.extend(block_tables[i, :num_blocks])
+        kv_indptr.append(kv_indptr[-1] + num_blocks)
+        kv_last_page_len = seq_len % block_size
+        if kv_last_page_len == 0:
+            kv_last_page_len = block_size
+        kv_last_page_lens.append(kv_last_page_len)
+        qo_indptr.append(qo_indptr[-1] + query_lens[i])
+
+    qo_indptr = torch.tensor(qo_indptr, dtype=torch.int32)
+    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
+    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
+    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
+
+    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
+    wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
+        workspace_buffer, "NHD")
+    wrapper.begin_forward(
+        qo_indptr,
+        kv_indptr,
+        kv_indices,
+        kv_last_page_lens,
+        num_query_heads,
+        num_kv_heads,
+        head_size,
+        block_size,
+    )
+
+    output = wrapper.forward(
+        query,
+        key_value_cache,
+        logits_soft_cap=soft_cap,
+    )
+
+    ref_output = ref_paged_attn(query=query,
+                                key_cache=key_cache,
+                                value_cache=value_cache,
+                                query_lens=query_lens,
+                                kv_lens=kv_lens,
+                                block_tables=block_tables,
+                                scale=scale,
+                                soft_cap=soft_cap)
+    assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \
+        f"{torch.max(torch.abs(output - ref_output))}"
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 4d023282f..a9ab23130 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -102,6 +102,8 @@ class FlashInferMetadata(AttentionMetadata):
     # The data type of the paged kv cache
     data_type: torch.dtype = None
     device: torch.device = torch.device("cuda")
+    # Only used by gemma2 model
+    logits_soft_cap: Optional[float] = None
 
     def __post_init__(self):
         # Refer to
@@ -271,9 +273,11 @@ class FlashInferImpl(AttentionImpl):
             else:
                 assert prefill_meta is not None
                 assert prefill_meta.prefill_wrapper is not None
-                output = prefill_meta.prefill_wrapper.forward(query,
-                                                              kv_cache,
-                                                              causal=True)
+                output = prefill_meta.prefill_wrapper.forward(
+                    query,
+                    kv_cache,
+                    logits_soft_cap=attn_metadata.logits_soft_cap,
+                    causal=True)
         else:
             assert attn_metadata.decode_metadata is not None
             assert attn_metadata.decode_metadata.decode_wrapper is not None
@@ -281,5 +285,5 @@ class FlashInferImpl(AttentionImpl):
                 query,
                 kv_cache,
                 sm_scale=self.scale,
-            )
+                logits_soft_cap=attn_metadata.logits_soft_cap)
         return output.view(num_tokens, hidden_size)
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 851bf52a5..ae63eb1d4 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -77,9 +77,9 @@ def get_attn_backend(
         return IpexAttnBackend
     elif backend == _Backend.FLASHINFER:
         logger.info("Using Flashinfer backend.")
-        logger.warning(("Flashinfer will be stuck on llma-2-7b,"
-                        " please avoid using Flashinfer as the"
-                        "backend when running on llma-2-7b."))
+        logger.warning(("Flashinfer will be stuck on llama-2-7b,"
+                        " please avoid using Flashinfer as the "
+                        "backend when running on llama-2-7b."))
         from vllm.attention.backends.flashinfer import FlashInferBackend
         return FlashInferBackend
     elif backend == _Backend.PALLAS:
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index 8fedff625..8386084c2 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -38,7 +38,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors, SamplerOutput
-from vllm.utils import print_warning_once
 
 from .interfaces import SupportsLoRA
 
@@ -137,12 +136,6 @@ class Gemma2Attention(nn.Module):
             dtype=torch.get_default_dtype(),
         )
 
-        if self.config.attn_logit_softcapping is not None:
-            print_warning_once(
-                "Gemma 2 normally uses attention logit soft-capping; "
-                "soft-capping is currently incompatible with the flash "
-                "attention kernels, so vLLM removes it to enable speed and "
-                "efficiency gains of flash attention.")
         # FIXME(woosuk): While Gemma 2 uses sliding window attention for every
         # odd layer, vLLM currently ignores it and uses global attention for
         # all layers.
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 02927c3ca..2ae5263ba 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -15,7 +15,7 @@ try:
     from flashinfer import BatchDecodeWithPagedKVCacheWrapper
     from flashinfer.decode import CUDAGraphBatchDecodeWithPagedKVCacheWrapper
     from flashinfer.prefill import BatchPrefillWithPagedKVCacheWrapper
-    FLASHINFER_WORKSPACE_BUFFER_SIZE = 128 * 1024 * 1024
+    FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024
 except ImportError:
     BatchDecodeWithPagedKVCacheWrapper = None
     CUDAGraphBatchDecodeWithPagedKVCacheWrapper = None
@@ -683,6 +683,16 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                                            dtype=torch.long,
                                            device=self.device)
 
+        logits_soft_cap = getattr(self.model_config.hf_config,
+                                  'attn_logit_softcapping', None)
+        if logits_soft_cap is not None and self.attn_backend.get_name(
+        ) != "flashinfer":
+            raise ValueError("Please use Flashinfer backend for models with"
+                             "logits_soft_cap (i.e., Gemma-2)."
+                             " Otherwise, the output might be wrong."
+                             " Set Flashinfer backend by "
+                             "export VLLM_ATTENTION_BACKEND=FLASHINFER.")
+
         if self.attn_backend.get_name() == "flashinfer":
             if len(paged_kv_indptr) > 0:
                 paged_kv_indices_tensor = torch.tensor(paged_kv_indices,
@@ -700,7 +710,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
 
             kv_cache_dtype = get_kv_cache_torch_dtype(self.kv_cache_dtype,
                                                       self.model_config.dtype)
-
             attn_metadata = self.attn_backend.make_metadata(
                 num_prefills=num_prefills,
                 slot_mapping=slot_mapping_tensor,
@@ -721,7 +730,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                 query_start_loc=query_start_loc,
                 device=self.device,
                 data_type=kv_cache_dtype,
-                use_cuda_graph=use_captured_graph)
+                use_cuda_graph=use_captured_graph,
+                logits_soft_cap=logits_soft_cap)
 
         else:
             attn_metadata = self.attn_backend.make_metadata(
@@ -1196,7 +1206,8 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
             if model_input.attn_metadata.use_cuda_graph:
                 batch_size = model_input.input_tokens.shape[0]
                 model_input.attn_metadata.decode_wrapper = self.graph_runners[
-                    batch_size].flashinfer_decode_wrapper
+                    model_input.
+                    virtual_engine][batch_size].flashinfer_decode_wrapper
             else:
                 model_input.attn_metadata.decode_wrapper = \
                     self.flashinfer_decode_wrapper
-- 
GitLab


From ae96ef8fbd6fe8905f8ea0d3a3f9ff1738dbcbe5 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 5 Jul 2024 07:37:23 +0800
Subject: [PATCH 265/376] [VLM] Calculate maximum number of multi-modal tokens
 by model (#6121)

---
 .../multimodal/adding_multimodal_model.rst    |  68 ++++++++----
 docs/source/models/vlm.rst                    |  18 +--
 vllm/inputs/registry.py                       |   2 +-
 vllm/model_executor/models/clip.py            |   4 +
 vllm/model_executor/models/llava.py           |  14 ++-
 vllm/model_executor/models/llava_next.py      |  12 ++
 vllm/model_executor/models/phi3v.py           |  12 ++
 vllm/multimodal/base.py                       | 105 +++++++++++++++++-
 vllm/multimodal/image.py                      |   3 +
 vllm/multimodal/registry.py                   |  73 ++++++------
 vllm/worker/model_runner.py                   |  19 +++-
 vllm/worker/xpu_model_runner.py               |  20 ++--
 12 files changed, 260 insertions(+), 90 deletions(-)

diff --git a/docs/source/dev/multimodal/adding_multimodal_model.rst b/docs/source/dev/multimodal/adding_multimodal_model.rst
index 0e9590639..32f62003f 100644
--- a/docs/source/dev/multimodal/adding_multimodal_model.rst
+++ b/docs/source/dev/multimodal/adding_multimodal_model.rst
@@ -51,17 +51,16 @@ As usual, follow :ref:`these steps <adding_a_new_model>` to implement the model
 2. Register input mappers
 -------------------------
 
-For each modality type to support, decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_input_mapper <vllm.multimodal.MultiModalRegistry.register_input_mapper>`.
+For each modality type that the model accepts as input, decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_input_mapper <vllm.multimodal.MultiModalRegistry.register_input_mapper>`.
 This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in :meth:`~torch.nn.Module.forward`.
 
 .. code-block:: diff
 
-    from vllm.model_executor.models.interfaces import SupportsVision
+      from vllm.model_executor.models.interfaces import SupportsVision
     + from vllm.multimodal import MULTIMODAL_REGISTRY
 
-    + @MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
-    + @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
-    class YourModelForImage2Seq(nn.Module, SupportsVision):
+    + @MULTIMODAL_REGISTRY.register_image_input_mapper()
+      class YourModelForImage2Seq(nn.Module, SupportsVision):
 
 A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function.
 
@@ -69,7 +68,33 @@ A default mapper is available for each modality in the core vLLM library. This i
     :ref:`input_processing_pipeline`
 
 
-3. (Optional) Register dummy data
+3. Register maximum number of multimodal tokens
+----------------------------------------------------------
+
+For each modality type that the model accepts as input, calculate the maximum possible number of tokens
+and register it via :meth:`INPUT_REGISTRY.register_dummy_data <vllm.inputs.registry.InputRegistry.register_max_multimodal_tokens>`.
+
+.. code-block:: diff
+
+      from vllm.inputs import INPUT_REGISTRY
+      from vllm.model_executor.models.interfaces import SupportsVision
+      from vllm.multimodal import MULTIMODAL_REGISTRY
+
+      @MULTIMODAL_REGISTRY.register_image_input_mapper()
+    + @MULTIMODAL_REGISTRY.register_max_image_tokens(<your_calculation>)
+      @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
+      class YourModelForImage2Seq(nn.Module, SupportsVision):
+
+Here are some examples:
+
+- Image inputs (static feature size): `LLaVA-1.5 Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava.py>`__
+- Image inputs (dynamic feature size): `LLaVA-NeXT Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py>`__
+
+.. seealso::
+    :ref:`input_processing_pipeline`
+
+
+4. (Optional) Register dummy data
 ---------------------------------
 
 During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models.
@@ -77,14 +102,17 @@ In such cases, you can define your own dummy data by registering a factory metho
 
 .. code-block:: diff
 
-    from vllm.inputs import INPUT_REGISTRY
-    from vllm.model_executor.models.interfaces import SupportsVision
-    from vllm.multimodal import MULTIMODAL_REGISTRY
+      from vllm.inputs import INPUT_REGISTRY
+      from vllm.model_executor.models.interfaces import SupportsVision
+      from vllm.multimodal import MULTIMODAL_REGISTRY
 
-    @MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
-    @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
+      @MULTIMODAL_REGISTRY.register_image_input_mapper()
+      @MULTIMODAL_REGISTRY.register_max_image_tokens(<your_calculation>)
     + @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
-    class YourModelForImage2Seq(nn.Module, SupportsVision):
+      class YourModelForImage2Seq(nn.Module, SupportsVision):
+
+.. note::
+    The dummy data should have the maximum possible number of multi-modal tokens, as described in the previous step.
 
 Here are some examples:
 
@@ -95,7 +123,7 @@ Here are some examples:
     :ref:`input_processing_pipeline`
 
 
-4. (Optional) Register input processor
+5. (Optional) Register input processor
 --------------------------------------
 
 Sometimes, there is a need to process inputs at the :class:`~vllm.LLMEngine` level before they are passed to the model executor. 
@@ -104,15 +132,15 @@ You can register input processors via :meth:`INPUT_REGISTRY.register_input_proce
 
 .. code-block:: diff
 
-    from vllm.inputs import INPUT_REGISTRY
-    from vllm.model_executor.models.interfaces import SupportsVision
-    from vllm.multimodal import MULTIMODAL_REGISTRY
+      from vllm.inputs import INPUT_REGISTRY
+      from vllm.model_executor.models.interfaces import SupportsVision
+      from vllm.multimodal import MULTIMODAL_REGISTRY
 
-    @MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
-    @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
-    @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
+      @MULTIMODAL_REGISTRY.register_image_input_mapper()
+      @MULTIMODAL_REGISTRY.register_max_image_tokens(<your_calculation>)
+      @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
     + @INPUT_REGISTRY.register_input_processor(<your_input_processor>)
-    class YourModelForImage2Seq(nn.Module, SupportsVision):
+      class YourModelForImage2Seq(nn.Module, SupportsVision):
 
 A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation.
 Here are some examples:
diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index f9e5dbea1..906f4d054 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -25,13 +25,8 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
 
 .. important::
     We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow
-    the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified, and internally we will construct data structures for
-    every model to perform profiling with.
-
-    This work is still ongoing. In the meantime, we internally hardcode ``image_feature_size = 3000`` through 
-    :meth:`MULTIMODAL_REGISTRY.get_num_input_tokens <vllm.multimodal.MultiModalRegistry.get_num_input_tokens>` 
-    for every model to be conservative in terms of GPU memory consumption. This hardcoded value will be replaced 
-    with a more accurate profiling strategy in the future.
+    the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified as we now calculate that
+    internally for each model.
 
 
 To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
@@ -104,13 +99,8 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with
 
 .. important::
     We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow
-    the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified, and internally we will construct data structures for
-    every model to perform profiling with.
-
-    This work is still ongoing. In the meantime, we internally hardcode ``image_feature_size = 3000`` through 
-    :meth:`MULTIMODAL_REGISTRY.get_num_input_tokens <vllm.multimodal.MultiModalRegistry.get_num_input_tokens>` 
-    for every model to be conservative in terms of GPU memory consumption. This hardcoded value will be replaced 
-    with a more accurate profiling strategy in the future.
+    the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified as we now calculate that
+    internally for each model.
 
 To consume the server, you can use the OpenAI client like in the example below:
 
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 2c87e3d92..9396296ff 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -51,7 +51,7 @@ class InputContext:
         additionally checking its type.
 
         Raises:
-            ValueError: If the model is not of the specified type.
+            TypeError: If the model is not of the specified type.
         """
 
         hf_config = self.model_config.hf_config
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index 4533e8cbd..d8fbf796b 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -35,6 +35,10 @@ def get_clip_image_feature_size(hf_config: CLIPVisionConfig) -> int:
                                 patch_size=hf_config.patch_size)
 
 
+def get_max_clip_image_tokens(hf_config: CLIPVisionConfig) -> int:
+    return get_clip_image_feature_size(hf_config)
+
+
 def dummy_seq_data_for_clip(
     hf_config: CLIPVisionConfig,
     seq_len: int,
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 526b080bf..840e40c94 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -21,7 +21,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import IntermediateTensors, SamplerOutput
 
 from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip,
-                   input_processor_for_clip)
+                   get_max_clip_image_tokens, input_processor_for_clip)
 from .interfaces import SupportsVision
 from .utils import merge_vision_embeddings
 
@@ -62,6 +62,17 @@ class LlavaImagePixelInputs(TypedDict):
 LlavaImageInputs = LlavaImagePixelInputs
 
 
+def get_max_llava_image_tokens(ctx: InputContext):
+    hf_config = ctx.get_hf_config(LlavaConfig)
+    vision_config = hf_config.vision_config
+
+    if isinstance(vision_config, CLIPVisionConfig):
+        return get_max_clip_image_tokens(vision_config)
+
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
+
+
 def dummy_data_for_llava(ctx: InputContext, seq_len: int):
     hf_config = ctx.get_hf_config(LlavaConfig)
     vision_config = hf_config.vision_config
@@ -102,6 +113,7 @@ def input_processor_for_llava(ctx: InputContext, llm_inputs: LLMInputs):
 
 
 @MULTIMODAL_REGISTRY.register_image_input_mapper()
+@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava)
 @INPUT_REGISTRY.register_input_processor(input_processor_for_llava)
 class LlavaForConditionalGeneration(nn.Module, SupportsVision):
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 4b03a5f9f..c37a68978 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -127,6 +127,17 @@ def get_llava_next_image_feature_size(
     raise NotImplementedError(msg)
 
 
+def get_max_llava_next_image_tokens(ctx: InputContext):
+    # Result in the max possible feature size (2x2 grid of 336x336px tiles)
+    dummy_height = dummy_width = 448
+
+    return get_llava_next_image_feature_size(
+        ctx.get_hf_config(LlavaNextConfig),
+        input_height=dummy_height,
+        input_width=dummy_width,
+    )
+
+
 def dummy_data_for_llava_next(ctx: InputContext, seq_len: int):
     hf_config = ctx.get_hf_config(LlavaNextConfig)
     vision_config = hf_config.vision_config
@@ -198,6 +209,7 @@ def input_processor_for_llava_next(ctx: InputContext, llm_inputs: LLMInputs):
 
 
 @MULTIMODAL_REGISTRY.register_image_input_mapper()
+@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_next_image_tokens)
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next)
 @INPUT_REGISTRY.register_input_processor(input_processor_for_llava_next)
 class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 9f12a8b2b..0259960ab 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -321,6 +321,17 @@ def get_phi3v_image_feature_size(
         + (new_height // 336 + 1) * 12
 
 
+def get_max_phi3v_image_tokens(ctx: InputContext):
+    # Result in the max possible feature size (h:w = 16:1)
+    dummy_height, dummy_width = 8000, 50
+
+    return get_phi3v_image_feature_size(
+        ctx.get_hf_config(PretrainedConfig),
+        input_height=dummy_height,
+        input_width=dummy_width,
+    )
+
+
 def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
     # Result in the max possible feature size (h:w = 16:1)
     dummy_height, dummy_width = 8000, 50
@@ -429,6 +440,7 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
 
 
 @MULTIMODAL_REGISTRY.register_image_input_mapper()
+@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens)
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi3v)
 @INPUT_REGISTRY.register_input_processor(input_processor_for_phi3v)
 class Phi3VForCausalLM(nn.Module, SupportsVision):
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index e7b45649d..56cee73bd 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -97,9 +97,19 @@ the corresponding plugin with the same modality key is applied.
 """
 
 MultiModalInputMapper = Callable[[InputContext, object], MultiModalInputs]
-"""Return a dictionary to be passed as keyword arguments to
+"""
+Return a dictionary to be passed as keyword arguments to
 :meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers
-and processors in HuggingFace Transformers."""
+and processors in HuggingFace Transformers.
+
+If the data is not supported, throw :exc:`TypeError`.
+"""
+
+MultiModalTokensCalc = Union[int, Callable[[InputContext], int]]
+"""
+Calculate the maximum number of multimodal tokens input to the language
+model. This does not include tokens that correspond to the input text.
+"""
 
 N = TypeVar("N", bound=Type[nn.Module])
 
@@ -117,6 +127,7 @@ class MultiModalPlugin(ABC):
 
     def __init__(self) -> None:
         self._input_mappers: Dict[Type[nn.Module], MultiModalInputMapper] = {}
+        self._max_mm_tokens: Dict[Type[nn.Module], MultiModalTokensCalc] = {}
 
     @abstractmethod
     def get_data_key(self) -> str:
@@ -128,9 +139,12 @@ class MultiModalPlugin(ABC):
     @abstractmethod
     def _default_input_mapper(self, ctx: InputContext,
                               data: object) -> MultiModalInputs:
-        """Return a dictionary to be passed as keyword arguments to
+        """
+        Return a dictionary to be passed as keyword arguments to
         :meth:`~torch.nn.Module.forward`. This is similar in concept to
         tokenizers and processors in HuggingFace Transformers.
+
+        If the data is not supported, throw :exc:`TypeError`.
         """
         raise NotImplementedError
 
@@ -140,9 +154,11 @@ class MultiModalPlugin(ABC):
     ):
         """
         Register an input mapper to a model class.
+
         When the model receives input data that matches the modality served by
-        this plugin (see :meth:`get_data_type`), the provided function is
+        this plugin (see :meth:`get_data_key`), the provided function is
         invoked to transform the data into a dictionary of model inputs.
+
         If `None` is provided, then the default input mapper is used instead.
 
         See also:
@@ -170,10 +186,11 @@ class MultiModalPlugin(ABC):
         Apply an input mapper to a data passed
         to the model, transforming the data into a dictionary of model inputs.
 
-        If the data is not something that the mapper expects, throws TypeError.
-
         The model is identified by ``model_config``.
 
+        Raises:
+            TypeError: If the data type is not supported.
+
         See also:
             :ref:`adding_a_new_multimodal_model`
         """
@@ -188,3 +205,79 @@ class MultiModalPlugin(ABC):
                            f"model class {model_cls.__name__}.")
 
         return mapper(InputContext(model_config), data)
+
+    @abstractmethod
+    def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
+        """
+        Calculate the maximum number of multimodal tokens input to the language
+        model. This does not include tokens that correspond to the input text.
+        """
+        raise NotImplementedError
+
+    def _validate_max_multimodal_tokens(self, max_mm_tokens: int):
+        if max_mm_tokens < 1:
+            raise ValueError("You should set the number of tokens to a "
+                             f"positive integer. Found: {max_mm_tokens}")
+
+    def register_max_multimodal_tokens(
+        self,
+        max_mm_tokens: Optional[MultiModalTokensCalc] = None,
+    ):
+        """
+        Register the maximum number of multi-modal tokens input to the
+        language model for a model class.
+
+        If `None` is provided, then the default calculation is used instead.
+
+        See also:
+            :ref:`adding_a_new_multimodal_model`
+        """
+
+        def wrapper(model_cls: N) -> N:
+            if model_cls in self._max_mm_tokens:
+                logger.warning(
+                    "Model class %s already calculates maximum number of "
+                    "tokens in %s. It is overwritten by the new one.",
+                    model_cls, self)
+
+            if isinstance(max_mm_tokens, int):
+                self._validate_max_multimodal_tokens(max_mm_tokens)
+
+            self._max_mm_tokens[model_cls] = max_mm_tokens \
+                or self._default_max_multimodal_tokens
+
+            return model_cls
+
+        return wrapper
+
+    def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int:
+        """
+        Get the maximum number of multi-modal tokens
+        for profiling the memory usage of a model.
+
+        If this registry is not applicable to the model, `0` is returned.
+
+        The model is identified by ``model_config``.
+
+        See also:
+            :ref:`adding_a_new_multimodal_model`
+        """
+        # Avoid circular import
+        from vllm.model_executor.model_loader import get_model_architecture
+
+        model_cls, _ = get_model_architecture(model_config)
+
+        if model_cls not in self._input_mappers:
+            return 0
+
+        max_mm_tokens = self._max_mm_tokens.get(model_cls)
+        if max_mm_tokens is None:
+            raise KeyError(f"No maximum number of multi-modal tokens is given "
+                           f"for model class {model_cls.__name__} in {self}.")
+
+        if callable(max_mm_tokens):
+            max_mm_tokens = max_mm_tokens(InputContext(model_config))
+
+        self._validate_max_multimodal_tokens(max_mm_tokens)
+
+        return max_mm_tokens
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 27010fa6e..b6c735123 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -130,3 +130,6 @@ class ImagePlugin(MultiModalPlugin):
             raise NotImplementedError("Embeddings input is not supported yet")
 
         raise TypeError(f"Invalid image type: {type(data)}")
+
+    def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
+        return 3000
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index bd4583ef5..e0716bbf1 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -7,7 +7,7 @@ from vllm.config import ModelConfig
 from vllm.logger import init_logger
 
 from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalInputs,
-                   MultiModalPlugin)
+                   MultiModalPlugin, MultiModalTokensCalc)
 from .image import ImagePlugin
 
 logger = init_logger(__name__)
@@ -48,45 +48,24 @@ class MultiModalRegistry:
         msg = f"Unknown multi-modal data type: {data_type_key}"
         raise NotImplementedError(msg)
 
-    def register_image_input_mapper(
+    def register_input_mapper(
         self,
+        data_type_key: str,
         mapper: Optional[MultiModalInputMapper] = None,
     ):
         """
-        Register an input mapper for image data to a model class.
+        Register an input mapper for a specific modality to a model class.
 
         See :meth:`MultiModalPlugin.register_input_mapper` for more details.
         """
-        return self.register_input_mapper("image", mapper)
-
-    def _process_input(self, key: str, value: object,
-                       model_config: ModelConfig) -> MultiModalInputs:
-        plugin = self._plugins.get(key)
-        if plugin:
-            return plugin.map_input(model_config, value)
-        msg = f"Unknown multi-modal data type: {key}"
-        raise NotImplementedError(msg)
+        return self._get_plugin(data_type_key).register_input_mapper(mapper)
 
-    def register_input_mapper(
+    def register_image_input_mapper(
         self,
-        data_type: str,
         mapper: Optional[MultiModalInputMapper] = None,
     ):
         """
-        Register an input mapper for a specific modality to a model class.
-
-        See :meth:`MultiModalPlugin.register_input_mapper` for more details.
-        """
-        plugin = self._plugins.get(data_type)
-        if not plugin:
-            msg = f"Unknown multi-modal data type: {data_type}"
-            raise NotImplementedError(msg)
-        return plugin.register_input_mapper(mapper)
-
-    def register_image_input(self,
-                             mapper: Optional[MultiModalInputMapper] = None):
-        """
-        Register an input mapper for image pixel data to a model class.
+        Register an input mapper for image data to a model class.
 
         See :meth:`MultiModalPlugin.register_input_mapper` for more details.
         """
@@ -102,8 +81,8 @@ class MultiModalRegistry:
         merged_dict: Dict[str, torch.Tensor] = {}
 
         for data_key, data_value in data.items():
-            input_dict = self._process_input(data_key, data_value,
-                                             model_config)
+            input_dict = self._get_plugin(data_key) \
+                .map_input(model_config, data_value)
 
             for input_key, input_tensor in input_dict.items():
                 if input_key in merged_dict:
@@ -121,9 +100,35 @@ class MultiModalRegistry:
         """
         return functools.partial(self.map_input, model_config)
 
-    def get_num_input_tokens(self):
+    def register_max_multimodal_tokens(
+        self,
+        data_type_key: str,
+        max_mm_tokens: Optional[MultiModalTokensCalc] = None,
+    ):
         """
-        Get the number of input tokens for profiling purposes.
+        Register the maximum number of tokens, belonging to a
+        specific modality, input to the language model for a model class.
+        """
+        return self._get_plugin(data_type_key) \
+            .register_max_multimodal_tokens(max_mm_tokens)
+
+    def register_max_image_tokens(
+        self,
+        max_mm_tokens: Optional[MultiModalTokensCalc] = None,
+    ):
+        """
+        Register the maximum number of image tokens
+        input to the language model for a model class.
+        """
+        return self.register_max_multimodal_tokens("image", max_mm_tokens)
+
+    def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int:
+        """
+        Get the maximum number of multi-modal tokens
+        for profiling the memory usage of a model.
+        
+        See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.
         """
-        # TODO: Provide this number on a per model basis.
-        return 3000
+        return sum(
+            plugin.get_max_multimodal_tokens(model_config)
+            for plugin in self._plugins.values())
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 2ae5263ba..d0c82d6bb 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -820,12 +820,19 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         model_config = self.model_config
 
         if supports_vision(self.model):
-            max_num_seqs = max(
-                1,
-                min(
-                    max_num_seqs,
-                    int(max_num_batched_tokens /
-                        MULTIMODAL_REGISTRY.get_num_input_tokens())))
+            max_mm_tokens = MULTIMODAL_REGISTRY \
+                .get_max_multimodal_tokens(model_config)
+            max_num_seqs_orig = max_num_seqs
+            max_num_seqs = min(max_num_seqs,
+                               max_num_batched_tokens // max_mm_tokens)
+            if max_num_seqs < 1:
+                expr = (f"min({max_num_seqs_orig}, "
+                        f"{max_num_batched_tokens} // {max_mm_tokens})")
+                logger.warning(
+                    "Computed max_num_seqs (%s) to be less than 1. "
+                    "Setting it to the minimum value of 1.", expr)
+                max_num_seqs = 1
+
         batch_size = 0
         for group_id in range(max_num_seqs):
             seq_len = (max_num_batched_tokens // max_num_seqs +
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index c3a24c89f..03b9cce5a 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -168,14 +168,18 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
         model_config = self.model_config
 
         if supports_vision(self.model):
-            # TODO: properly inject these numbers from MultiModalRegistry.
-            # Right now, just use an overly conservative number.
-            max_num_seqs = max(
-                1,
-                min(
-                    max_num_seqs,
-                    int(max_num_batched_tokens /
-                        MULTIMODAL_REGISTRY.get_num_input_tokens())))
+            max_mm_tokens = MULTIMODAL_REGISTRY \
+                .get_max_multimodal_tokens(model_config)
+            max_num_seqs_orig = max_num_seqs
+            max_num_seqs = min(max_num_seqs,
+                               max_num_batched_tokens // max_mm_tokens)
+            if max_num_seqs < 1:
+                expr = (f"min({max_num_seqs_orig}, "
+                        f"{max_num_batched_tokens} // {max_mm_tokens})")
+                logger.warning(
+                    "Computed max_num_seqs (%s) to be less than 1. "
+                    "Setting it to the minimum value of 1.", expr)
+                max_num_seqs = 1
 
         for group_id in range(max_num_seqs):
             seq_len = (max_num_batched_tokens // max_num_seqs +
-- 
GitLab


From a41357e941b81067cd053e94da41adae470990df Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Thu, 4 Jul 2024 18:29:47 -0700
Subject: [PATCH 266/376] [VLM] Improve consistency between feature size
 calculation and dummy data for profiling (#6146)

---
 vllm/model_executor/models/llava_next.py | 21 ++++++++-------------
 vllm/model_executor/models/phi3v.py      | 23 ++++++++++-------------
 2 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index c37a68978..ce1e9307a 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -37,6 +37,9 @@ _KEYS_TO_MODIFY_MAPPING = {
     "language_model.model": "language_model",
 }
 
+# Result in the max possible feature size (2x2 grid of 336x336px tiles)
+MAX_IMAGE_FEATURE_SIZE_HEIGHT = MAX_IMAGE_FEATURE_SIZE_WIDTH = 448
+
 
 class LlavaNextImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
@@ -128,13 +131,11 @@ def get_llava_next_image_feature_size(
 
 
 def get_max_llava_next_image_tokens(ctx: InputContext):
-    # Result in the max possible feature size (2x2 grid of 336x336px tiles)
-    dummy_height = dummy_width = 448
 
     return get_llava_next_image_feature_size(
         ctx.get_hf_config(LlavaNextConfig),
-        input_height=dummy_height,
-        input_width=dummy_width,
+        input_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
+        input_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
     )
 
 
@@ -142,13 +143,7 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int):
     hf_config = ctx.get_hf_config(LlavaNextConfig)
     vision_config = hf_config.vision_config
 
-    # Result in the max possible feature size (2x2 grid of 336x336px tiles)
-    dummy_height = dummy_width = 448
-    image_feature_size = get_llava_next_image_feature_size(
-        hf_config,
-        input_height=dummy_height,
-        input_width=dummy_width,
-    )
+    image_feature_size = get_max_llava_next_image_tokens(ctx)
 
     if isinstance(vision_config, CLIPVisionConfig):
         seq_data = dummy_seq_data_for_clip(
@@ -160,8 +155,8 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int):
 
         mm_data = dummy_image_for_clip(
             vision_config,
-            image_width_override=dummy_width,
-            image_height_override=dummy_height,
+            image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH,
+            image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
         )
 
         return seq_data, mm_data
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 0259960ab..b087e485d 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -53,6 +53,10 @@ _KEYS_TO_MODIFY_MAPPING = {
 # Cannot find the following 2 numbers from hf config.
 _IMAGE_TOKEN_ID = 32044
 
+# Result in the max possible feature size (h:w = 16:1)
+MAX_IMAGE_FEATURE_SIZE_HEIGHT = 8000
+MAX_IMAGE_FEATURE_SIZE_WIDTH = 50
+
 CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(dropout=0.0,
                                                      hidden_act="quick_gelu",
                                                      hidden_size=1024,
@@ -322,24 +326,17 @@ def get_phi3v_image_feature_size(
 
 
 def get_max_phi3v_image_tokens(ctx: InputContext):
-    # Result in the max possible feature size (h:w = 16:1)
-    dummy_height, dummy_width = 8000, 50
 
     return get_phi3v_image_feature_size(
         ctx.get_hf_config(PretrainedConfig),
-        input_height=dummy_height,
-        input_width=dummy_width,
+        input_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
+        input_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
     )
 
 
 def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
-    # Result in the max possible feature size (h:w = 16:1)
-    dummy_height, dummy_width = 8000, 50
-    image_feature_size = get_phi3v_image_feature_size(
-        ctx.get_hf_config(PretrainedConfig),
-        input_height=dummy_height,
-        input_width=dummy_width,
-    )
+
+    image_feature_size = get_max_phi3v_image_tokens(ctx)
 
     seq_data = dummy_seq_data_for_clip(
         CLIP_VIT_LARGE_PATCH14_336_CONFIG,
@@ -349,8 +346,8 @@ def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
     )
     mm_data = dummy_image_for_clip(
         CLIP_VIT_LARGE_PATCH14_336_CONFIG,
-        image_width_override=dummy_width,
-        image_height_override=dummy_height,
+        image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH,
+        image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
     )
 
     return seq_data, mm_data
-- 
GitLab


From ea4b5704833cb31377bcbc2f00959f6c09909099 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 5 Jul 2024 13:49:38 +0800
Subject: [PATCH 267/376] [VLM] Cleanup validation and update docs (#6149)

---
 vllm/model_executor/models/llava.py      | 48 ++++++++-----
 vllm/model_executor/models/llava_next.py | 91 +++++++++++-------------
 vllm/model_executor/models/phi3v.py      | 30 ++++----
 3 files changed, 87 insertions(+), 82 deletions(-)

diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 840e40c94..250d39687 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -149,14 +149,16 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
                                                 config.vocab_size, logit_scale)
         self.sampler = Sampler()
 
-    def _validate_image_data(self, data: torch.Tensor) -> torch.Tensor:
-        if list(data.shape)[1:] != [
-                3, self.config.vision_config.image_size,
-                self.config.vision_config.image_size
-        ]:
+    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
+        h = w = self.config.vision_config.image_size
+        expected_dims = (3, h, w)
+        actual_dims = tuple(data.shape[1:])
+
+        if actual_dims != expected_dims:
+            expected_expr = ("batch_size", *map(str, expected_dims))
             raise ValueError(
-                "The expected image tensor shape is batch dimension plus "
-                "channel, height and width.")
+                f"The expected shape of pixel values is {expected_expr}. "
+                f"You supplied {tuple(data.shape)}.")
 
         return data
 
@@ -173,7 +175,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
 
         return LlavaImagePixelInputs(
             type="pixel_values",
-            data=self._validate_image_data(pixel_values),
+            data=self._validate_pixel_values(pixel_values),
         )
 
     def _select_image_features(self, image_features: torch.Tensor, *,
@@ -226,18 +228,25 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
 
         One key thing to understand is the `input_ids` already accounts for the
         positions of the to-be-inserted image embeddings.
+
         Concretely, consider a text prompt:
-        "<image>\nUSER: What's the content of the image?\nASSISTANT:".
+        `"USER: <image>\\nWhat's the content of the image?\\nASSISTANT:"`.
+
         Tokenizer outputs:
-        [1, 32000, 29871, 13, 11889, 29901, 1724, 29915, 29879, 278,
-        2793, 310, 278, 1967, 29973, 13, 22933, 9047, 13566, 29901].
-        The to-be-inserted image has a size of 576 (24 * 24) along the context
-        length dimension.
-        `input_ids` is thus [1, 32000, ..., 32000, 29871, 13, 11889, 29901,
-        1724, 29915, 29879, 278, 2793, 310, 278, 1967, 29973, 13, 22933,
-        9047, 13566, 29901].
-        There will be 576 `32000` in the `input_ids`.
-        (32000 is the token id for `<image>`.)
+        `[1, 3148, 1001, 29901, 29871, 32000, 29871, 13, 5618, 29915, 29879,
+        278, 2793, 310, 278, 1967, 29973, 13, 22933, 9047, 13566, 29901]`.
+
+        To reserve space in KV cache, we have to insert placeholder tokens
+        before they are inputted to the model, so the input processor prepends 
+        additional image tokens (denoted as `32000`), resulting in:
+        `[1, 3148, 1001, 29901, 29871, 32000, ..., 32000, 29871, 13, 5618,
+        29915, 29879, 278, 2793, 310, 278, 1967, 29973, 13, 22933, 9047, 13566,
+        29901]`.
+
+        We insert 575 tokens so that including the original image token in the
+        input, there are a total of 576 (24 * 24) image tokens, which
+        corresponds to the number of image tokens inputted to the language
+        model, i.e. the number of image tokens outputted by the visual encoder.
 
         This way, the `positions` and `attn_metadata` are consistent
         with the `input_ids`.
@@ -246,6 +255,9 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
             pixel_values: The pixels in each input image.
+        
+        See also:
+            :class:`LlavaImageInputs`
         """
         image_input = self._parse_and_validate_image_input(**kwargs)
 
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index ce1e9307a..7e06f1e95 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -47,7 +47,8 @@ class LlavaNextImagePixelInputs(TypedDict):
     """
     Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
 
-    Note that `num_patches` may be different for each batch.
+    Note that `num_patches` may be different for each batch, in which case
+    the data is passed as a list instead of a batched tensor.
     """
 
     image_sizes: NotRequired[torch.Tensor]
@@ -255,40 +256,20 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
         self, data: Union[torch.Tensor, List[torch.Tensor]]
     ) -> Union[torch.Tensor, List[torch.Tensor]]:
 
-        def _validate_shape(data: torch.Tensor):
-
-            dim = data.dim()
-            height = width = self.config.vision_config.image_size
-            # All 4d image tensors have the same number of patches,
-            # so data is a 5d batch of these tensors
-            if dim == 5:
-                if list(data.shape)[2:] != [
-                        3, self.config.vision_config.image_size,
-                        self.config.vision_config.image_size
-                ]:
-                    raise ValueError(
-                        "Expected pixel value tensor in shape of: (batch size, "
-                        f"patch number, 3, {height}, {width}), got {data.shape}"
-                    )
-
-            # 4d image tensors have different number of patches,
-            # so data is each individual tensor.
-            elif dim == 4:
-                if list(data.shape)[1:] != [
-                        3, self.config.vision_config.image_size,
-                        self.config.vision_config.image_size
-                ]:
-                    raise ValueError(
-                        "Expected pixel value tensor in shape of: (patch "
-                        f"number, 3, {height}, {width}), got {data.shape}")
-            else:
+        h = w = self.config.vision_config.image_size
+        expected_dims = (3, h, w)
+
+        def _validate_shape(d: torch.Tensor):
+            actual_dims = tuple(d.shape[1:])
+
+            if actual_dims != expected_dims:
+                expected_expr = ("num_patches", *map(str, expected_dims))
                 raise ValueError(
-                    f"Invalid pixel value tensor of shape {data.shape}")
+                    "The expected shape of pixel values in each batch element "
+                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")
 
-        if isinstance(data, torch.Tensor):
-            _validate_shape(data)
-        else:
-            [_validate_shape(d) for d in data]
+        for d in data:
+            _validate_shape(d)
 
         return data
 
@@ -464,18 +445,33 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
 
         One key thing to understand is the `input_ids` already accounts for the
         positions of the to-be-inserted image embeddings.
+
         Concretely, consider a text prompt:
-        "<image>\nUSER: What's the content of the image?\nASSISTANT:".
+        `"A chat between a curious human and an artificial intelligence
+        assistant. The assistant gives helpful, detailed, and polite answers to
+        the human's questions.
+        USER: <image>\\nWhat is shown in this image? ASSISTANT:"`.
+
         Tokenizer outputs:
-        [1, 32000, 29871, 13, 11889, 29901, 1724, 29915, 29879, 278,
-        2793, 310, 278, 1967, 29973, 13, 22933, 9047, 13566, 29901].
-        The to-be-inserted image has a size of 576 (24 * 24) along the context
-        length dimension.
-        `input_ids` is thus [1, 32000, ..., 32000, 29871, 13, 11889, 29901,
-        1724, 29915, 29879, 278, 2793, 310, 278, 1967, 29973, 13, 22933,
-        9047, 13566, 29901].
-        There will be 576 `32000` in the `input_ids`.
-        (32000 is the token id for `<image>`.)
+        `[1, 319, 13563, 1546, 263, 12758, 5199, 322, 385, 23116, 21082, 20255,
+        29889, 450, 20255, 4076, 8444, 29892, 13173, 29892, 322, 1248, 568,
+        6089, 304, 278, 5199, 29915, 29879, 5155, 29889, 3148, 1001, 29901,
+        29871, 32000, 13, 5618, 338, 4318, 297, 445, 1967, 29973, 319, 1799,
+        9047, 13566, 29901]`.
+
+        To reserve space in KV cache, we have to insert placeholder tokens
+        before they are inputted to the model, so the input processor prepends 
+        additional image tokens (denoted as `32000`), resulting in:
+        `[1, 319, 13563, 1546, 263, 12758, 5199, 322, 385, 23116, 21082, 20255,
+        29889, 450, 20255, 4076, 8444, 29892, 13173, 29892, 322, 1248, 568,
+        6089, 304, 278, 5199, 29915, 29879, 5155, 29889, 3148, 1001, 29901,
+        29871, 32000, ..., 32000, 13, 5618, 338, 4318, 297, 445, 1967, 29973,
+        319, 1799, 9047, 13566, 29901]`.
+
+        Unlike in LLaVA-1.5, the number of image tokens inputted to the language
+        model depends on the original size of the input image. Including the
+        original image token in the input, the required number of image tokens
+        is given by :func:`get_llava_next_image_feature_size`.
 
         This way, the `positions` and `attn_metadata` are consistent
         with the `input_ids`.
@@ -484,15 +480,10 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
             pixel_values: The pixels in each grid patch for each input image.
-                Expects a batch with shape `[1, num_patches, 3, h, w]`.
             image_sizes: The original `(height, width)` for each input image.
-                Expects a batch with shape `[1, 2]`.
-
+        
         See also:
-            Each input maps to huggingface implementation, as follows:
-
-            - `pixel_values`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava_next/modeling_llava_next.py#L690
-            - `image_sizes`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava_next/modeling_llava_next.py#L691
+            :class:`LlavaNextImageInputs`
         """
         image_input = self._parse_and_validate_image_input(**kwargs)
 
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index b087e485d..1c6bd106b 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -263,7 +263,8 @@ class Phi3VImagePixelInputs(TypedDict):
     """
     Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
 
-    Note that `num_patches` may be different for each batch.
+    Note that `num_patches` may be different for each batch, in which case
+    the data is passed as a list instead of a batched tensor.
     """
 
     image_sizes: torch.Tensor
@@ -466,8 +467,8 @@ class Phi3VForCausalLM(nn.Module, SupportsVision):
     def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor:
         if list(data.shape[1:]) != [2]:
             raise ValueError(
-                f"The expected image sizes shape is batch dimension plus "
-                f"{[2]}. You supplied {data.shape}.")
+                f"The expected shape of image sizes is batch dimension plus "
+                f"{[2]}. You supplied {tuple(data.shape)}.")
 
         return data
 
@@ -475,19 +476,20 @@ class Phi3VForCausalLM(nn.Module, SupportsVision):
         self, data: Union[torch.Tensor, List[torch.Tensor]]
     ) -> Union[torch.Tensor, List[torch.Tensor]]:
 
-        def _validate_shape(data: torch.Tensor):
-            if list(data.shape)[2:] != [
-                    3, CLIP_VIT_LARGE_PATCH14_336_CONFIG.image_size,
-                    CLIP_VIT_LARGE_PATCH14_336_CONFIG.image_size
-            ]:
+        h = w = CLIP_VIT_LARGE_PATCH14_336_CONFIG.image_size
+        expected_dims = (3, h, w)
+
+        def _validate_shape(d: torch.Tensor):
+            actual_dims = tuple(d.shape[1:])
+
+            if actual_dims != expected_dims:
+                expected_expr = ("num_patches", *map(str, expected_dims))
                 raise ValueError(
-                    "The expected pixel value tensor shape is batch dimension "
-                    "plus patch number, channel, height and width.")
+                    "The expected shape of pixel values in each batch element "
+                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")
 
-        if isinstance(data, torch.Tensor):
-            _validate_shape(data)
-        else:
-            [_validate_shape(d) for d in data]
+        for d in data:
+            _validate_shape(d)
 
         return data
 
-- 
GitLab


From 0097bb1829310577262e2a79ae8b498765b39225 Mon Sep 17 00:00:00 2001
From: Christian Rohmann <frittentheke@users.noreply.github.com>
Date: Fri, 5 Jul 2024 18:49:47 +0200
Subject: [PATCH 268/376] [Bugfix] Use templated datasource in grafana.json to
 allow automatic imports (#6136)

Signed-off-by: Christian Rohmann <christian.rohmann@inovex.de>
---
 examples/production_monitoring/grafana.json | 27 ++++++++++++---------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/examples/production_monitoring/grafana.json b/examples/production_monitoring/grafana.json
index 273f7f5ac..d1389f539 100644
--- a/examples/production_monitoring/grafana.json
+++ b/examples/production_monitoring/grafana.json
@@ -1,13 +1,5 @@
 {
   "__inputs": [
-    {
-      "name": "DS_PROMETHEUS",
-      "label": "prometheus",
-      "description": "",
-      "type": "datasource",
-      "pluginId": "prometheus",
-      "pluginName": "Prometheus"
-    }
   ],
   "__elements": {},
   "__requires": [
@@ -1215,11 +1207,21 @@
   "templating": {
     "list": [
       {
+        "type": "datasource",
+        "name": "DS_PROMETHEUS",
+        "label": "datasource",
         "current": {},
-        "datasource": {
-          "type": "prometheus",
-          "uid": "${DS_PROMETHEUS}"
-        },
+        "hide": 0,
+        "includeAll": false,
+        "multi": false,
+        "options": [],
+        "query": "prometheus",
+        "queryValue": "",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false
+      },
+      {
         "definition": "label_values(model_name)",
         "hide": 0,
         "includeAll": false,
@@ -1250,3 +1252,4 @@
   "version": 1,
   "weekStart": ""
 }
+
-- 
GitLab


From f1e15da6fe20ff17d5b8c28f37487cee38f08b83 Mon Sep 17 00:00:00 2001
From: jvlunteren <161835099+jvlunteren@users.noreply.github.com>
Date: Fri, 5 Jul 2024 19:37:09 +0200
Subject: [PATCH 269/376] [Frontend] Continuous usage stats in OpenAI
 completion API (#5742)

---
 tests/entrypoints/openai/test_completion.py   | 112 +++++++++++++++---
 vllm/entrypoints/openai/protocol.py           |   3 +-
 vllm/entrypoints/openai/serving_completion.py |  26 ++--
 3 files changed, 110 insertions(+), 31 deletions(-)

diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index 81f5254d9..52a848b78 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -295,25 +295,49 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                          model_name: str):
     prompt = "What is the capital of France?"
 
-    # Test stream=True, stream_options={"include_usage": False}
-    stream = await client.completions.create(
-        model=model_name,
-        prompt=prompt,
-        max_tokens=5,
-        temperature=0.0,
-        stream=True,
-        stream_options={"include_usage": False})
+    # Test stream=True, stream_options=
+    #     {"include_usage": False, "continuous_usage_stats": False}
+    stream = await client.completions.create(model=model_name,
+                                             prompt=prompt,
+                                             max_tokens=5,
+                                             temperature=0.0,
+                                             stream=True,
+                                             stream_options={
+                                                 "include_usage": False,
+                                                 "continuous_usage_stats":
+                                                 False,
+                                             })
+
     async for chunk in stream:
         assert chunk.usage is None
 
-    # Test stream=True, stream_options={"include_usage": True}
-    stream = await client.completions.create(
-        model=model_name,
-        prompt=prompt,
-        max_tokens=5,
-        temperature=0.0,
-        stream=True,
-        stream_options={"include_usage": True})
+    # Test stream=True, stream_options=
+    #     {"include_usage": False, "continuous_usage_stats": True}
+    stream = await client.completions.create(model=model_name,
+                                             prompt=prompt,
+                                             max_tokens=5,
+                                             temperature=0.0,
+                                             stream=True,
+                                             stream_options={
+                                                 "include_usage": False,
+                                                 "continuous_usage_stats":
+                                                 True,
+                                             })
+    async for chunk in stream:
+        assert chunk.usage is None
+
+    # Test stream=True, stream_options=
+    #     {"include_usage": True, "continuous_usage_stats": False}
+    stream = await client.completions.create(model=model_name,
+                                             prompt=prompt,
+                                             max_tokens=5,
+                                             temperature=0.0,
+                                             stream=True,
+                                             stream_options={
+                                                 "include_usage": True,
+                                                 "continuous_usage_stats":
+                                                 False,
+                                             })
     async for chunk in stream:
         if chunk.choices[0].finish_reason is None:
             assert chunk.usage is None
@@ -328,7 +352,36 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,
                 final_chunk.usage.completion_tokens)
             assert final_chunk.choices == []
 
-    # Test stream=False, stream_options={"include_usage": None}
+    # Test stream=True, stream_options=
+    #     {"include_usage": True, "continuous_usage_stats": True}
+    stream = await client.completions.create(model=model_name,
+                                             prompt=prompt,
+                                             max_tokens=5,
+                                             temperature=0.0,
+                                             stream=True,
+                                             stream_options={
+                                                 "include_usage": True,
+                                                 "continuous_usage_stats":
+                                                 True,
+                                             })
+    async for chunk in stream:
+        assert chunk.usage is not None
+        assert chunk.usage.prompt_tokens > 0
+        assert chunk.usage.completion_tokens > 0
+        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
+                                            chunk.usage.completion_tokens)
+        if chunk.choices[0].finish_reason is not None:
+            final_chunk = await stream.__anext__()
+            assert final_chunk.usage is not None
+            assert final_chunk.usage.prompt_tokens > 0
+            assert final_chunk.usage.completion_tokens > 0
+            assert final_chunk.usage.total_tokens == (
+                final_chunk.usage.prompt_tokens +
+                final_chunk.usage.completion_tokens)
+            assert final_chunk.choices == []
+
+    # Test stream=False, stream_options=
+    #     {"include_usage": None}
     with pytest.raises(BadRequestError):
         await client.completions.create(model=model_name,
                                         prompt=prompt,
@@ -337,7 +390,8 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                         stream=False,
                                         stream_options={"include_usage": None})
 
-    # Test stream=False, stream_options={"include_usage": True}
+    # Test stream=False, stream_options=
+    #    {"include_usage": True}
     with pytest.raises(BadRequestError):
         await client.completions.create(model=model_name,
                                         prompt=prompt,
@@ -346,6 +400,28 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                         stream=False,
                                         stream_options={"include_usage": True})
 
+    # Test stream=False, stream_options=
+    #     {"continuous_usage_stats": None}
+    with pytest.raises(BadRequestError):
+        await client.completions.create(
+            model=model_name,
+            prompt=prompt,
+            max_tokens=5,
+            temperature=0.0,
+            stream=False,
+            stream_options={"continuous_usage_stats": None})
+
+    # Test stream=False, stream_options=
+    #    {"continuous_usage_stats": True}
+    with pytest.raises(BadRequestError):
+        await client.completions.create(
+            model=model_name,
+            prompt=prompt,
+            max_tokens=5,
+            temperature=0.0,
+            stream=False,
+            stream_options={"continuous_usage_stats": True})
+
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 7f97e534e..881e2675c 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -103,7 +103,8 @@ class ResponseFormat(OpenAIBaseModel):
 
 
 class StreamOptions(OpenAIBaseModel):
-    include_usage: Optional[bool]
+    include_usage: Optional[bool] = True
+    continuous_usage_stats: Optional[bool] = True
 
 
 class FunctionDefinition(OpenAIBaseModel):
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 1bd095655..55cd01579 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -271,16 +271,6 @@ class OpenAIServingCompletion(OpenAIServing):
                     previous_num_tokens[i] = len(output.token_ids)
                     finish_reason = output.finish_reason
                     stop_reason = output.stop_reason
-                    if output.finish_reason is not None:  # return final usage
-                        prompt_tokens = len(res.prompt_token_ids)
-                        completion_tokens = len(output.token_ids)
-                        final_usage = UsageInfo(
-                            prompt_tokens=prompt_tokens,
-                            completion_tokens=completion_tokens,
-                            total_tokens=prompt_tokens + completion_tokens,
-                        )
-                    else:
-                        final_usage = None
 
                     chunk = CompletionStreamResponse(
                         id=request_id,
@@ -297,7 +287,19 @@ class OpenAIServingCompletion(OpenAIServing):
                         ])
                     if (request.stream_options
                             and request.stream_options.include_usage):
-                        chunk.usage = None
+                        if (request.stream_options.continuous_usage_stats
+                                or output.finish_reason is not None):
+                            prompt_tokens = len(res.prompt_token_ids)
+                            completion_tokens = len(output.token_ids)
+                            usage = UsageInfo(
+                                prompt_tokens=prompt_tokens,
+                                completion_tokens=completion_tokens,
+                                total_tokens=prompt_tokens + completion_tokens,
+                            )
+                        if request.stream_options.continuous_usage_stats:
+                            chunk.usage = usage
+                        else:
+                            chunk.usage = None
 
                     response_json = chunk.model_dump_json(exclude_unset=True)
                     yield f"data: {response_json}\n\n"
@@ -309,7 +311,7 @@ class OpenAIServingCompletion(OpenAIServing):
                     created=created_time,
                     model=model_name,
                     choices=[],
-                    usage=final_usage,
+                    usage=usage,
                 )
                 final_usage_data = (final_usage_chunk.model_dump_json(
                     exclude_unset=True, exclude_none=True))
-- 
GitLab


From e58294ddf23107d93987c00611d63a20e3cfe771 Mon Sep 17 00:00:00 2001
From: JGSweets <JGSweets@users.noreply.github.com>
Date: Fri, 5 Jul 2024 12:41:01 -0500
Subject: [PATCH 270/376] [Bugfix] Add verbose error if scipy is missing for
 blocksparse attention (#5695)

---
 .../ops/blocksparse_attention/utils.py        | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/vllm/attention/ops/blocksparse_attention/utils.py b/vllm/attention/ops/blocksparse_attention/utils.py
index 0d90dd971..b1808970d 100644
--- a/vllm/attention/ops/blocksparse_attention/utils.py
+++ b/vllm/attention/ops/blocksparse_attention/utils.py
@@ -6,7 +6,14 @@ from functools import lru_cache
 
 import torch
 import triton
-from scipy import sparse
+
+try:
+    from scipy import sparse
+except ImportError as err:
+    raise ImportError("Please install scipy via "
+                      "`pip install scipy` to use "
+                      "BlockSparseAttention in "
+                      "models such as Phi-3.") from err
 
 
 def dense_to_crow_col(x: torch.Tensor):
@@ -77,11 +84,11 @@ def _get_sparse_attn_mask_homo_head(
 ):
     """
     :return: a tuple of 3:
-        - tuple of crow_indices, col_indices representation 
+        - tuple of crow_indices, col_indices representation
             of CSR format.
         - block dense mask
-        - all token dense mask (be aware that it can be 
-            OOM if it is too big) if `return_dense==True`, 
+        - all token dense mask (be aware that it can be
+            OOM if it is too big) if `return_dense==True`,
             otherwise, None
     """
     with torch.no_grad():
@@ -148,10 +155,10 @@ def get_sparse_attn_mask(
     :param dense_mask_type: "binary" (0 for skip token, 1 for others)
         or "bias" (-inf for skip token, 0 or others)
     :return: a tuple of 3:
-        - tuple of crow_indices, col_indices representation 
+        - tuple of crow_indices, col_indices representation
             of CSR format.
         - block dense mask
-        - all token dense mask (be aware that it can be OOM if it 
+        - all token dense mask (be aware that it can be OOM if it
             is too big) if `return_dense==True`, otherwise, None
     """
     assert dense_mask_type in ("binary", "bias")
-- 
GitLab


From abad5746a71dc87fb2b1d43cae645dcda79b64a2 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Fri, 5 Jul 2024 12:04:51 -0700
Subject: [PATCH 271/376] bump version to v0.5.1 (#6157)

---
 vllm/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/version.py b/vllm/version.py
index 2b33ffcf5..dd9b22ccc 100644
--- a/vllm/version.py
+++ b/vllm/version.py
@@ -1 +1 @@
-__version__ = "0.5.0.post1"
+__version__ = "0.5.1"
-- 
GitLab


From 79d406e9183aa12cdef6f1876eb9a15385662587 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Fri, 5 Jul 2024 12:44:40 -0700
Subject: [PATCH 272/376] [Docs] Fix readthedocs for tag build (#6158)

---
 docs/source/conf.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 46a3bcbf1..f4cec0566 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -75,7 +75,10 @@ if READTHEDOCS_VERSION_TYPE == "tag":
     # remove the warning banner if the version is a tagged release
     header_file = os.path.join(os.path.dirname(__file__),
                                "_templates/sections/header.html")
-    os.remove(header_file)
+    # The file might be removed already if the build is triggered multiple times
+    # (readthedocs build both HTML and PDF versions separately)
+    if os.path.exists(header_file):
+        os.remove(header_file)
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
-- 
GitLab


From 2de490d60f281c8d4b182bf5551d27c141bd742d Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Fri, 5 Jul 2024 14:51:25 -0700
Subject: [PATCH 273/376] Update wheel builds to strip debug (#6161)

---
 .buildkite/release-pipeline.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 1959f9752..26c0480cd 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -5,7 +5,7 @@ steps:
     agents:
       queue: cpu_queue
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image -e CMAKE_BUILD_TYPE=Release --target build --progress plain ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host"
       - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
-- 
GitLab


From f0250620dd2e33087293cf89b96db854767b939b Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Fri, 5 Jul 2024 16:24:31 -0700
Subject: [PATCH 274/376] Fix release wheel build env var (#6162)

---
 .buildkite/release-pipeline.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 26c0480cd..ca759d522 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -5,9 +5,9 @@ steps:
     agents:
       queue: cpu_queue
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image -e CMAKE_BUILD_TYPE=Release --target build --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host -e CMAKE_BUILD_TYPE=Release vllm-ci:build-image cp -r dist /artifacts_host"
       - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
     matrix:
       setup:
-- 
GitLab


From bc96d5c330e079fa501eee05e97bf15009c9a094 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Fri, 5 Jul 2024 17:19:53 -0700
Subject: [PATCH 275/376] Move release wheel env var to Dockerfile instead
 (#6163)

---
 .buildkite/release-pipeline.yaml | 4 ++--
 Dockerfile                       | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index ca759d522..c394f3fd7 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,13 +1,13 @@
 steps:
   - block: "Build wheels"
 
-  - label: "Build wheel - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}" 
+  - label: "Build wheel - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}"
     agents:
       queue: cpu_queue
     commands:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host -e CMAKE_BUILD_TYPE=Release vllm-ci:build-image cp -r dist /artifacts_host"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host"
       - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
     matrix:
       setup:
diff --git a/Dockerfile b/Dockerfile
index f571e8be4..feb004513 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -99,6 +99,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
         && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
         && export SCCACHE_BUCKET=vllm-build-sccache \
         && export SCCACHE_REGION=us-west-2 \
+        && export CMAKE_BUILD_TYPE=Release \
         && sccache --show-stats \
         && python3 setup.py bdist_wheel --dist-dir=dist \
         && sccache --show-stats; \
-- 
GitLab


From 175c43eca4e6a50e160c386c6668ae4645c0b5d1 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Fri, 5 Jul 2024 22:59:36 -0700
Subject: [PATCH 276/376] [Doc] Reorganize Supported Models by Type (#6167)

---
 docs/source/models/supported_models.rst | 51 ++++++++++++++++---------
 docs/source/models/vlm.rst              |  3 +-
 2 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 0283f36ea..f5511580d 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -7,6 +7,8 @@ vLLM supports a variety of generative Transformer models in `HuggingFace Transfo
 The following is the list of model architectures that are currently supported by vLLM.
 Alongside each architecture, we include some popular models that use it.
 
+Decoder-only Language Models
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 .. list-table::
   :widths: 25 25 50 5
   :header-rows: 1
@@ -95,14 +97,6 @@ Alongside each architecture, we include some popular models that use it.
     - LLaMA, Llama 2, Meta Llama 3, Vicuna, Alpaca, Yi
     - :code:`meta-llama/Meta-Llama-3-8B-Instruct`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
     - ✅︎
-  * - :code:`LlavaForConditionalGeneration`
-    - LLaVA-1.5
-    - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc.
-    -
-  * - :code:`LlavaNextForConditionalGeneration`
-    - LLaVA-NeXT
-    - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
-    -
   * - :code:`MiniCPMForCausalLM`
     - MiniCPM
     - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, etc.
@@ -143,10 +137,6 @@ Alongside each architecture, we include some popular models that use it.
     - Phi-3-Small
     - :code:`microsoft/Phi-3-small-8k-instruct`, :code:`microsoft/Phi-3-small-128k-instruct`, etc.
     -
-  * - :code:`Phi3VForCausalLM`
-    - Phi-3-Vision
-    - :code:`microsoft/Phi-3-vision-128k-instruct`, etc.
-    -
   * - :code:`QWenLMHeadModel`
     - Qwen
     - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
@@ -172,14 +162,40 @@ Alongside each architecture, we include some popular models that use it.
     - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc.
     -
 
+.. note::
+    Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.
+
+.. _supported_vlms:
+
+Vision Language Models
+^^^^^^^^^^^^^^^^^^^^^^^
+
+.. list-table::
+  :widths: 25 25 50 5
+  :header-rows: 1
+
+  * - Architecture
+    - Models
+    - Example HuggingFace Models
+    - :ref:`LoRA <lora>`
+  * - :code:`LlavaForConditionalGeneration`
+    - LLaVA-1.5
+    - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc.
+    -
+  * - :code:`LlavaNextForConditionalGeneration`
+    - LLaVA-NeXT
+    - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
+    -
+  * - :code:`Phi3VForCausalLM`
+    - Phi-3-Vision
+    - :code:`microsoft/Phi-3-vision-128k-instruct`, etc.
+    -
 
 If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
-Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` for instructions on how to implement support for your model.
+Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` and :ref:`Adding a New Multimodal Model <adding_a_new_multimodal_model>` 
+for instructions on how to implement support for your model.
 Alternatively, you can raise an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ project.
 
-.. note::
-    Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.
-
 .. tip::
     The easiest way to check if your model is supported is to run the program below:
 
@@ -210,8 +226,9 @@ Alternatively, you can raise an issue on our `GitHub <https://github.com/vllm-pr
         output = llm.generate("Hello, my name is")
         print(output)
 
+
 Model Support Policy
----------------------
+=====================
 
 At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support:
 
diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 906f4d054..d488b0fef 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -3,7 +3,8 @@
 Using VLMs
 ==========
 
-vLLM provides experimental support for Vision Language Models (VLMs). This document shows you how to run and serve these models using vLLM.
+vLLM provides experimental support for Vision Language Models (VLMs). See the :ref:`list of supported VLMs here <supported_vlms>`.
+This document shows you how to run and serve these models using vLLM.
 
 .. important::
     We are actively iterating on VLM support. Expect breaking changes to VLM usage and development in upcoming releases without prior deprecation.
-- 
GitLab


From 9389380015b80c109b899a08840132780b9b3fc0 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 6 Jul 2024 17:18:59 +0800
Subject: [PATCH 277/376] [Doc] Move guide for multimodal model and other
 improvements (#6168)

---
 .../input_processing/model_inputs_index.rst   |  6 +--
 .../dev/multimodal/multimodal_index.rst       | 20 +++-----
 docs/source/index.rst                         |  2 +
 docs/source/models/adding_model.rst           | 38 ++++++++-------
 .../enabling_multimodal_inputs.rst}           | 47 +++++++++----------
 docs/source/models/supported_models.rst       |  2 +-
 vllm/inputs/registry.py                       |  2 +-
 vllm/multimodal/base.py                       | 11 +++--
 8 files changed, 61 insertions(+), 67 deletions(-)
 rename docs/source/{dev/multimodal/adding_multimodal_model.rst => models/enabling_multimodal_inputs.rst} (78%)

diff --git a/docs/source/dev/input_processing/model_inputs_index.rst b/docs/source/dev/input_processing/model_inputs_index.rst
index 2dde251aa..5d8958375 100644
--- a/docs/source/dev/input_processing/model_inputs_index.rst
+++ b/docs/source/dev/input_processing/model_inputs_index.rst
@@ -5,10 +5,10 @@ Input Processing
 
 .. currentmodule:: vllm.inputs
 
-vLLM provides a mechanism for defining input processors for each model so that the inputs are processed
-in :class:`~vllm.LLMEngine` before they are passed to model executors. 
+Each model can override parts of vLLM's :ref:`input processing pipeline <input_processing_pipeline>` via
+:data:`~vllm.inputs.INPUT_REGISTRY` and :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
 
-Currently, this mechanism is only utilized in :ref:`multi-modal models <multi_modality>` for preprocessing multi-modal input 
+Currently, this mechanism is only utilized in :ref:`multi-modal <multi_modality>` models for preprocessing multi-modal input 
 data in addition to input prompt, but it can be extended to text-only language models when needed.
 
 Guides
diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst
index c2d1b771e..39daf30a3 100644
--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
@@ -7,25 +7,17 @@ Multi-Modality
     
 vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.
 
-:class:`vllm.inputs.PromptStrictInputs` accepts an additional attribute ``multi_modal_data``
-which allows you to pass in multi-modal input alongside text and token prompts.
+Multi-modal input can be passed alongside text and token prompts to :ref:`supported models <supported_vlms>`
+via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptStrictInputs`.
 
 .. note::
    ``multi_modal_data`` can accept keys and values beyond the builtin ones, as long as a customized plugin is registered through 
-    :class:`vllm.multimodal.MULTIMODAL_REGISTRY`.
+   the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
 
-By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model, please follow :ref:`the guide for adding a new multimodal model. <adding_a_new_multimodal_model>`.
+To implement a new multi-modal model in vLLM, please follow :ref:`this guide <enabling_multimodal_inputs>`.
 
-
-# TODO: Add more instructions on how to do that once embeddings is in.
-
-Guides
-++++++
-
-.. toctree::
-   :maxdepth: 1
-
-   adding_multimodal_model
+..
+  TODO: Add more instructions on how to add new plugins once embeddings is in.
 
 Module Contents
 +++++++++++++++
diff --git a/docs/source/index.rst b/docs/source/index.rst
index e99a0a9a1..67c039f25 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -92,6 +92,7 @@ Documentation
 
    models/supported_models
    models/adding_model
+   models/enabling_multimodal_inputs
    models/engine_args
    models/lora
    models/vlm
@@ -116,6 +117,7 @@ Documentation
    automatic_prefix_caching/details
 
 .. toctree::
+   :maxdepth: 2
    :caption: Developer Documentation
 
    dev/sampling_params
diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst
index f282b5945..53c19e582 100644
--- a/docs/source/models/adding_model.rst
+++ b/docs/source/models/adding_model.rst
@@ -10,6 +10,10 @@ This document provides a high-level guide on integrating a `HuggingFace Transfor
     The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
     However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
 
+.. note::
+    By default, vLLM models do not support multi-modal inputs. To enable multi-modal support,
+    please follow :ref:`this guide <enabling_multimodal_inputs>` after implementing the model here.
+
 .. tip::
     If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ repository.
     We will be happy to help you out!
@@ -44,23 +48,23 @@ Next, you need to rewrite the :meth:`~torch.nn.Module.forward` method of your mo
 
 .. code-block:: diff
 
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-    -    attention_mask: Optional[torch.Tensor] = None,
-    -    position_ids: Optional[torch.LongTensor] = None,
-    -    past_key_values: Optional[List[torch.FloatTensor]] = None,
-    -    inputs_embeds: Optional[torch.FloatTensor] = None,
-    -    labels: Optional[torch.LongTensor] = None,
-    -    use_cache: Optional[bool] = None,
-    -    output_attentions: Optional[bool] = None,
-    -    output_hidden_states: Optional[bool] = None,
-    -    return_dict: Optional[bool] = None,
-    -) -> Union[Tuple, CausalLMOutputWithPast]:
-    +    positions: torch.Tensor,
-    +    kv_caches: List[torch.Tensor],
-    +    attn_metadata: AttentionMetadata,
-    +) -> Optional[SamplerOutput]:
+      def forward(
+          self,
+          input_ids: torch.Tensor,
+    -     attention_mask: Optional[torch.Tensor] = None,
+    -     position_ids: Optional[torch.LongTensor] = None,
+    -     past_key_values: Optional[List[torch.FloatTensor]] = None,
+    -     inputs_embeds: Optional[torch.FloatTensor] = None,
+    -     labels: Optional[torch.LongTensor] = None,
+    -     use_cache: Optional[bool] = None,
+    -     output_attentions: Optional[bool] = None,
+    -     output_hidden_states: Optional[bool] = None,
+    -     return_dict: Optional[bool] = None,
+    - ) -> Union[Tuple, CausalLMOutputWithPast]:
+    +     positions: torch.Tensor,
+    +     kv_caches: List[torch.Tensor],
+    +     attn_metadata: AttentionMetadata,
+    + ) -> Optional[SamplerOutput]:
 
 1. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors.
 2. Replace the attention operation with either :code:`PagedAttention`, :code:`PagedAttentionWithRoPE`, or :code:`PagedAttentionWithALiBi` depending on the model's architecture.
diff --git a/docs/source/dev/multimodal/adding_multimodal_model.rst b/docs/source/models/enabling_multimodal_inputs.rst
similarity index 78%
rename from docs/source/dev/multimodal/adding_multimodal_model.rst
rename to docs/source/models/enabling_multimodal_inputs.rst
index 32f62003f..20be920b5 100644
--- a/docs/source/dev/multimodal/adding_multimodal_model.rst
+++ b/docs/source/models/enabling_multimodal_inputs.rst
@@ -1,26 +1,21 @@
-.. _adding_a_new_multimodal_model:
+.. _enabling_multimodal_inputs:
 
-Adding a New Multimodal Model
-=============================
+Enabling Multimodal Inputs
+==========================
 
-This document provides a high-level guide on integrating a :ref:`multi-modal model <multi_modality>` into vLLM.
+This document walks you through the steps to extend a vLLM model so that it accepts :ref:`multi-modal <multi_modality>` inputs.
 
-.. note::
-    The complexity of adding a new model depends heavily on the model's architecture.
-    The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
-    However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
-
-.. tip::
-    If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ repository.
-    We will be happy to help you out!
+.. seealso::
+    :ref:`adding_a_new_model`
 
 
-1. Set up the base vLLM model
+1. Update the base vLLM model
 -----------------------------
 
-As usual, follow :ref:`these steps <adding_a_new_model>` to implement the model in vLLM, but note the following:
+It is assumed that you have already implemented the model in vLLM according to :ref:`these steps <adding_a_new_model>`.
+Further update the model as follows:
 
-- You should additionally implement the :class:`~vllm.model_executor.models.interfaces.SupportsVision` interface.
+- Implement the :class:`~vllm.model_executor.models.interfaces.SupportsVision` interface.
 
   .. code-block:: diff
 
@@ -33,19 +28,19 @@ As usual, follow :ref:`these steps <adding_a_new_model>` to implement the model
       The model class does not have to be named :code:`*ForCausalLM`.
       Check out `the HuggingFace Transformers documentation <https://huggingface.co/docs/transformers/model_doc/auto#multimodal>`__ for some examples.
 
-- While implementing the :meth:`~torch.nn.Module.forward` method, reserve a keyword parameter
+- If you haven't already done so, reserve a keyword parameter in :meth:`~torch.nn.Module.forward`
   for each input tensor that corresponds to a multi-modal input, as shown in the following example:
 
   .. code-block:: diff
 
-      def forward(
-          self,
-          input_ids: torch.Tensor,
-          positions: torch.Tensor,
-          kv_caches: List[torch.Tensor],
-          attn_metadata: AttentionMetadata,
-      +   pixel_values: torch.Tensor,
-      ) -> SamplerOutput:
+        def forward(
+            self,
+            input_ids: torch.Tensor,
+            positions: torch.Tensor,
+            kv_caches: List[torch.Tensor],
+            attn_metadata: AttentionMetadata,
+      +     pixel_values: torch.Tensor,
+        ) -> SamplerOutput:
 
 
 2. Register input mappers
@@ -68,8 +63,8 @@ A default mapper is available for each modality in the core vLLM library. This i
     :ref:`input_processing_pipeline`
 
 
-3. Register maximum number of multimodal tokens
-----------------------------------------------------------
+3. Register maximum number of multi-modal tokens
+------------------------------------------------
 
 For each modality type that the model accepts as input, calculate the maximum possible number of tokens
 and register it via :meth:`INPUT_REGISTRY.register_dummy_data <vllm.inputs.registry.InputRegistry.register_max_multimodal_tokens>`.
diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index f5511580d..e64a07239 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -192,7 +192,7 @@ Vision Language Models
     -
 
 If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
-Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` and :ref:`Adding a New Multimodal Model <adding_a_new_multimodal_model>` 
+Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` and :ref:`Enabling Multimodal Inputs <enabling_multimodal_inputs>` 
 for instructions on how to implement support for your model.
 Alternatively, you can raise an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ project.
 
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 9396296ff..4a7e5c583 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -141,7 +141,7 @@ class InputRegistry:
         The model is identified by ``model_config``.
 
         See also:
-            :ref:`adding_a_new_multimodal_model`
+            :ref:`enabling_multimodal_inputs`
         """
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 56cee73bd..0e31816a8 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -162,8 +162,8 @@ class MultiModalPlugin(ABC):
         If `None` is provided, then the default input mapper is used instead.
 
         See also:
-            :ref:`input_processing_pipeline`
-            :ref:`adding_a_new_multimodal_model`
+            - :ref:`input_processing_pipeline`
+            - :ref:`enabling_multimodal_inputs`
         """
 
         def wrapper(model_cls: N) -> N:
@@ -192,7 +192,8 @@ class MultiModalPlugin(ABC):
             TypeError: If the data type is not supported.
 
         See also:
-            :ref:`adding_a_new_multimodal_model`
+            - :ref:`input_processing_pipeline`
+            - :ref:`enabling_multimodal_inputs`
         """
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
@@ -230,7 +231,7 @@ class MultiModalPlugin(ABC):
         If `None` is provided, then the default calculation is used instead.
 
         See also:
-            :ref:`adding_a_new_multimodal_model`
+            :ref:`enabling_multimodal_inputs`
         """
 
         def wrapper(model_cls: N) -> N:
@@ -260,7 +261,7 @@ class MultiModalPlugin(ABC):
         The model is identified by ``model_config``.
 
         See also:
-            :ref:`adding_a_new_multimodal_model`
+            :ref:`enabling_multimodal_inputs`
         """
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
-- 
GitLab


From 6206dcb29eb99b3eebf5f00c97a5690c9b7df4f1 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Sat, 6 Jul 2024 18:25:50 -0700
Subject: [PATCH 278/376] [Model] Add PaliGemma (#5189)

Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 docs/source/models/supported_models.rst |   4 +
 examples/paligemma_example.py           |  52 ++++
 tests/models/test_paligemma.py          | 147 ++++++++++
 vllm/model_executor/models/__init__.py  |   2 +
 vllm/model_executor/models/gemma.py     |  10 +-
 vllm/model_executor/models/paligemma.py | 344 ++++++++++++++++++++++++
 6 files changed, 557 insertions(+), 2 deletions(-)
 create mode 100644 examples/paligemma_example.py
 create mode 100644 tests/models/test_paligemma.py
 create mode 100644 vllm/model_executor/models/paligemma.py

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index e64a07239..f56679c3c 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -186,6 +186,10 @@ Vision Language Models
     - LLaVA-NeXT
     - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
     -
+  * - :code:`PaliGemmaForConditionalGeneration`
+    - PaliGemma
+    - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc.
+    - 
   * - :code:`Phi3VForCausalLM`
     - Phi-3-Vision
     - :code:`microsoft/Phi-3-vision-128k-instruct`, etc.
diff --git a/examples/paligemma_example.py b/examples/paligemma_example.py
new file mode 100644
index 000000000..b315eafe5
--- /dev/null
+++ b/examples/paligemma_example.py
@@ -0,0 +1,52 @@
+import os
+import subprocess
+
+from PIL import Image
+
+from vllm import LLM
+
+# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
+# You can use `.buildkite/download-images.sh` to download them
+
+
+def run_paligemma():
+    llm = LLM(model="google/paligemma-3b-mix-224")
+
+    prompt = "caption es"
+
+    image = Image.open("images/stop_sign.jpg")
+
+    outputs = llm.generate({
+        "prompt": prompt,
+        "multi_modal_data": {
+            "image": image
+        },
+    })
+
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+
+
+def main():
+    run_paligemma()
+
+
+if __name__ == "__main__":
+    # Download from s3
+    s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
+    local_directory = "images"
+
+    # Make sure the local directory exists or create it
+    os.makedirs(local_directory, exist_ok=True)
+
+    # Use AWS CLI to sync the directory, assume anonymous access
+    subprocess.check_call([
+        "aws",
+        "s3",
+        "sync",
+        s3_bucket_path,
+        local_directory,
+        "--no-sign-request",
+    ])
+    main()
diff --git a/tests/models/test_paligemma.py b/tests/models/test_paligemma.py
new file mode 100644
index 000000000..2b1d3c5b4
--- /dev/null
+++ b/tests/models/test_paligemma.py
@@ -0,0 +1,147 @@
+from typing import List, Optional, Tuple, Type
+
+import pytest
+from transformers import AutoTokenizer
+
+from vllm.multimodal.utils import rescale_image_size
+from vllm.sequence import SampleLogprobs
+
+from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from .utils import check_logprobs_close
+
+pytestmark = pytest.mark.vlm
+
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+    "stop_sign": "caption es",
+    "cherry_blossom": "What is in the picture?",
+    "boardwalk": "What is in the picture?",
+})
+
+IMAGE_TOKEN_ID = 257152
+
+models = ["google/paligemma-3b-mix-224"]
+
+
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
+                                         Optional[SampleLogprobs]],
+                      model: str):
+    """Sanitize vllm output to be comparable with hf output."""
+    output_ids, output_str, out_logprobs = vllm_output
+
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    eos_token_id = tokenizer.eos_token_id
+
+    hf_output_ids = [
+        token_id for idx, token_id in enumerate(output_ids)
+        if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID
+    ]
+
+    hf_output_str = output_str
+
+    if hf_output_ids[-1] == eos_token_id:
+        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
+
+    return hf_output_ids, hf_output_str, out_logprobs
+
+
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    image_assets: _ImageAssets,
+    model: str,
+    *,
+    size_factors: List[float],
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    """Inference result should be the same between hf and vllm.
+
+    All the image fixtures for the test is under tests/images.
+    For huggingface runner, we provide the PIL images as input.
+    For vllm runner, we provide MultiModalDataDict objects 
+    and corresponding vision language config as input.
+    Note, the text input is also adjusted to abide by vllm contract.
+    The text output is sanitized to be able to compare with hf.
+    """
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_image = [(
+        [prompt for _ in size_factors],
+        [rescale_image_size(image, factor) for factor in size_factors],
+    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+
+    # max_model_len should be greater than image_feature_size
+    with vllm_runner(model,
+                     dtype=dtype,
+                     tensor_parallel_size=tensor_parallel_size,
+                     distributed_executor_backend=distributed_executor_backend,
+                     enforce_eager=True) as vllm_model:
+        vllm_outputs_per_image = [
+            vllm_model.generate_greedy_logprobs(prompts,
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                images=images)
+            for prompts, images in inputs_per_image
+        ]
+
+    with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model:
+        hf_outputs_per_image = [
+            hf_model.generate_greedy_logprobs_limit(prompts,
+                                                    max_tokens,
+                                                    num_logprobs=num_logprobs,
+                                                    images=images)
+            for prompts, images in inputs_per_image
+        ]
+
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
+                                        vllm_outputs_per_image):
+
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=[
+                vllm_to_hf_output(vllm_output, model)
+                for vllm_output in vllm_outputs
+            ],
+            name_0="hf",
+            name_1="vllm",
+        )
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No image
+        [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
+                dtype: str, max_tokens: int, num_logprobs: int) -> None:
+    run_test(
+        hf_runner,
+        vllm_runner,
+        image_assets,
+        model,
+        size_factors=size_factors,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=1,
+    )
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index a4fe18d52..644b95aae 100644
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -49,6 +49,8 @@ _GENERATION_MODELS = {
     "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
     "OPTForCausalLM": ("opt", "OPTForCausalLM"),
     "OrionForCausalLM": ("orion", "OrionForCausalLM"),
+    "PaliGemmaForConditionalGeneration":
+    ("paligemma", "PaliGemmaForConditionalGeneration"),
     "PhiForCausalLM": ("phi", "PhiForCausalLM"),
     "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"),
     "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index b603a5911..16548c6c1 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -268,16 +268,22 @@ class GemmaModel(nn.Module):
         normalizer = self.config.hidden_size**0.5
         self.register_buffer("normalizer", torch.tensor(normalizer))
 
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        inputs_embeds: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        hidden_states = self.embed_tokens(input_ids)
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.get_input_embeddings(input_ids)
         hidden_states *= self.normalizer
-
         residual = None
         for i in range(len(self.layers)):
             layer = self.layers[i]
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
new file mode 100644
index 000000000..2af2bedd8
--- /dev/null
+++ b/vllm/model_executor/models/paligemma.py
@@ -0,0 +1,344 @@
+from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
+
+import torch
+from PIL import Image
+from torch import nn
+from transformers import PaliGemmaConfig, SiglipVisionConfig, SiglipVisionModel
+
+from vllm.attention import AttentionMetadata
+from vllm.config import CacheConfig, MultiModalConfig
+from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
+from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import ColumnParallelLinear
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.gemma import GemmaModel
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.image import cached_get_tokenizer
+from vllm.sequence import SamplerOutput, SequenceData
+
+from .interfaces import SupportsVision
+from .utils import merge_vision_embeddings
+
+logger = init_logger(__name__)
+
+_KEYS_TO_MODIFY_MAPPING = {
+    "language_model.model": "language_model",
+}
+
+
+def get_max_paligemma_image_tokens(ctx: InputContext):
+    hf_config = ctx.get_hf_config(PaliGemmaConfig)
+    text_config = hf_config.text_config
+
+    return text_config.num_image_tokens
+
+
+def dummy_seq_data_for_paligemma(
+    hf_config: PaliGemmaConfig,
+    seq_len: int,
+    *,
+    image_token_id: int,
+    image_feature_size_override: Optional[int] = None,
+):
+    if image_feature_size_override is None:
+        image_feature_size = hf_config.text_config.num_image_tokens
+    else:
+        image_feature_size = image_feature_size_override
+
+    token_ids = [image_token_id] * image_feature_size
+    token_ids += [0] * (seq_len - image_feature_size)
+    return SequenceData(token_ids)
+
+
+def dummy_image_for_paligemma(
+    hf_config: SiglipVisionConfig,
+    *,
+    image_width_override: Optional[int] = None,
+    image_height_override: Optional[int] = None,
+):
+    width = height = hf_config.image_size
+    if image_width_override is not None:
+        width = image_width_override
+    if image_height_override is not None:
+        height = image_height_override
+
+    image = Image.new("RGB", (width, height), color=0)
+    return {"image": image}
+
+
+def dummy_data_for_paligemma(ctx: InputContext, seq_len: int):
+    hf_config = ctx.get_hf_config(PaliGemmaConfig)
+    vision_config = hf_config.vision_config
+
+    seq_data = dummy_seq_data_for_paligemma(
+        hf_config,
+        seq_len,
+        image_token_id=hf_config.image_token_index,
+    )
+
+    mm_data = dummy_image_for_paligemma(vision_config)
+    return seq_data, mm_data
+
+
+def input_processor_for_paligemma(ctx: InputContext, llm_inputs: LLMInputs):
+
+    """
+    The correct prompt format needs to be:
+    '<image>' * image_feature_size + '<bos>' + prompt + '\n'
+
+    See https://github.com/huggingface/transformers/blob/25245ec26dc29bcf6102e1b4ddd0dfd02e720cf5/src/transformers/models/paligemma/processing_paligemma.py#L55
+    """ # noqa
+
+    multi_modal_data = llm_inputs.get("multi_modal_data")
+    if multi_modal_data is None or "image" not in multi_modal_data:
+        return llm_inputs
+
+    model_config = ctx.model_config
+    hf_config = ctx.get_hf_config(PaliGemmaConfig)
+
+    tokenizer = cached_get_tokenizer(model_config.tokenizer)
+    image_feature_size = hf_config.text_config.num_image_tokens
+    image_token_str = tokenizer.decode(hf_config.image_token_index)
+    bos_token = tokenizer.decode(hf_config.bos_token_id)
+    image_token_str_pad = image_token_str * image_feature_size
+    image_token_ids_pad = [hf_config.image_token_index] * image_feature_size
+
+    orig_prompt = llm_inputs.get("prompt")
+    orig_prompt_ids = llm_inputs.get("prompt_token_ids")
+
+    if image_token_str in orig_prompt:
+        logger.warning(
+            "The image token '%s' was detected in the prompt and "
+            "will be removed. Please follow the proper prompt format"
+            " documented on HuggingFace.", image_token_str)
+        orig_prompt = orig_prompt.replace(image_token_str, "")
+        orig_prompt_ids.remove(hf_config.image_token_index)
+
+    new_prompt = f"{image_token_str_pad}{bos_token}{orig_prompt}\n"
+    new_token_ids = image_token_ids_pad + orig_prompt_ids + [108]  #newline
+
+    # NOTE: Create a defensive copy of the original inputs
+    return LLMInputs(prompt_token_ids=new_token_ids,
+                     prompt=new_prompt,
+                     multi_modal_data=multi_modal_data)
+
+
+class PaliGemmaMultiModalProjector(nn.Module):
+
+    def __init__(self, vision_hidden_size: int, projection_dim: int):
+        super().__init__()
+
+        self.linear = ColumnParallelLinear(vision_hidden_size,
+                                           projection_dim,
+                                           bias=True)
+
+    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.linear(image_features)
+        return hidden_states
+
+
+class PaliGemmaImagePixelInputs(TypedDict):
+    type: Literal["pixel_values"]
+    data: torch.Tensor
+    """Shape: (batch_size, num_channels, height, width)"""
+
+
+PaliGemmaImageInputs = PaliGemmaImagePixelInputs
+
+
+@MULTIMODAL_REGISTRY.register_image_input_mapper()
+@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_paligemma_image_tokens)
+@INPUT_REGISTRY.register_dummy_data(dummy_data_for_paligemma)
+@INPUT_REGISTRY.register_input_processor(input_processor_for_paligemma)
+class PaliGemmaForConditionalGeneration(nn.Module, SupportsVision):
+
+    def __init__(self,
+                 config: PaliGemmaConfig,
+                 multimodal_config: MultiModalConfig,
+                 cache_config: Optional[CacheConfig] = None,
+                 quant_config: Optional[QuantizationConfig] = None) -> None:
+        super().__init__()
+
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        # TODO(ywang96): Port over SiglipVisionModel & TP
+        self.vision_tower = SiglipVisionModel(config.vision_config)
+        self.multi_modal_projector = PaliGemmaMultiModalProjector(
+            vision_hidden_size=config.vision_config.hidden_size,
+            projection_dim=config.vision_config.projection_dim)
+
+        self.quant_config = quant_config
+        self.language_model = GemmaModel(config.text_config, cache_config,
+                                         quant_config)
+        self.unpadded_vocab_size = config.text_config.vocab_size
+        logit_scale = getattr(config, "logit_scale", 1.0)
+        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
+                                                config.vocab_size, logit_scale)
+        self.sampler = Sampler()
+
+    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
+        h = w = self.config.vision_config.image_size
+        expected_dims = (3, h, w)
+        actual_dims = tuple(data.shape[1:])
+
+        if actual_dims != expected_dims:
+            expected_expr = ("batch_size", *map(str, expected_dims))
+            raise ValueError(
+                f"The expected shape of pixel values is {expected_expr}. "
+                f"You supplied {tuple(data.shape)}.")
+
+        return data
+
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[PaliGemmaImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+
+        if pixel_values is None:
+            return None
+
+        if not isinstance(pixel_values, torch.Tensor):
+            raise ValueError("Incorrect type of pixel values. "
+                             f"Got type: {type(pixel_values)}")
+
+        return PaliGemmaImagePixelInputs(
+            type="pixel_values",
+            data=self._validate_pixel_values(pixel_values),
+        )
+
+    def _image_pixels_to_features(self, vision_tower: SiglipVisionModel,
+                                  pixel_values: torch.Tensor) -> torch.Tensor:
+
+        image_outputs = vision_tower(pixel_values, output_hidden_states=True)
+
+        selected_image_features = image_outputs.last_hidden_state
+
+        return selected_image_features
+
+    def _process_image_pixels(
+            self, inputs: PaliGemmaImagePixelInputs) -> torch.Tensor:
+        assert self.vision_tower is not None
+
+        pixel_values = inputs["data"]
+
+        return self._image_pixels_to_features(self.vision_tower, pixel_values)
+
+    def _process_image_input(
+            self, image_input: PaliGemmaImageInputs) -> torch.Tensor:
+
+        assert self.vision_tower is not None
+        image_features = self._process_image_pixels(image_input)
+
+        return self.multi_modal_projector(image_features)
+
+    def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
+                kv_caches: List[torch.Tensor],
+                attn_metadata: AttentionMetadata,
+                **kwargs: object) -> SamplerOutput:
+
+        parsed_image_input = self._parse_and_validate_image_input(**kwargs)
+
+        if parsed_image_input is not None:
+            vision_embeddings = self._process_image_input(parsed_image_input)
+            # https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py#L294 # noqa
+            vision_embeddings = vision_embeddings * (self.config.hidden_size**
+                                                     -0.5)
+
+            inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+
+            inputs_embeds = merge_vision_embeddings(
+                input_ids, inputs_embeds, vision_embeddings,
+                self.config.image_token_index)
+
+            input_ids = None
+        else:
+            inputs_embeds = None
+
+        hidden_states = self.language_model(input_ids,
+                                            positions,
+                                            kv_caches,
+                                            attn_metadata,
+                                            inputs_embeds=inputs_embeds)
+
+        return hidden_states
+
+    # Copied from vllm/model_executor/models/gemma.py
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        logits = self.logits_processor(self.language_model.embed_tokens,
+                                       hidden_states, sampling_metadata)
+        return logits
+
+    # Copied from vllm/model_executor/models/gemma.py
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    # Adapted from vllm/model_executor/models/gemma.py
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params = set()
+        for name, loaded_weight in weights:
+            for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
+                if key_to_modify in name:
+                    name = name.replace(key_to_modify, new_key)
+            use_default_weight_loading = False
+            if "vision" in name:
+                if self.vision_tower is not None:
+                    # We only do sharding for language model and
+                    # not vision model for now.
+                    use_default_weight_loading = True
+            else:
+                for (param_name, shard_name,
+                     shard_id) in stacked_params_mapping:
+                    if shard_name not in name:
+                        continue
+                    name = name.replace(shard_name, param_name)
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param, loaded_weight, shard_id)
+                    break
+                else:
+                    # lm_head is not used in vllm as it is tied with
+                    # embed_token. To prevent errors, skip loading
+                    # lm_head.weight.
+                    if "lm_head.weight" in name:
+                        continue
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    use_default_weight_loading = True
+
+            if use_default_weight_loading:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+            loaded_params.add(name)
+
+        unloaded_params = params_dict.keys() - loaded_params
+        if unloaded_params:
+            raise RuntimeError(
+                "Some weights are not initialized from checkpoints: "
+                f"{unloaded_params}")
-- 
GitLab


From 333306a252a0ef42ef07b11b64df6ff1ac2e2d6d Mon Sep 17 00:00:00 2001
From: Haichuan <1778876540@qq.com>
Date: Sun, 7 Jul 2024 15:42:13 +0800
Subject: [PATCH 279/376] add benchmark for fix length input and output (#5857)

Co-authored-by: Roger Wang <ywang@roblox.com>
---
 benchmarks/benchmark_serving.py | 65 ++++++++++++++++++++++++++++++---
 1 file changed, 60 insertions(+), 5 deletions(-)

diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 42867fc40..dbcb9743b 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -17,7 +17,7 @@ On the client side, run:
         --dataset-path <path to dataset> \
         --request-rate <request_rate> \ # By default <request_rate> is inf
         --num-prompts <num_prompts> # By default <num_prompts> is 1000
-        
+
     when using tgi backend, add
         --endpoint /generate_stream
     to the end of the command above.
@@ -77,7 +77,6 @@ def sample_sharegpt_requests(
 ) -> List[Tuple[str, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
-
     # Load the dataset.
     with open(dataset_path) as f:
         dataset = json.load(f)
@@ -185,6 +184,31 @@ def sample_sonnet_requests(
     return sampled_requests
 
 
+def sample_random_requests(
+        input_len: int, output_len: int, num_prompts: int, range_ratio: float,
+        tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
+
+    input_lens = np.random.randint(
+        int(input_len * range_ratio),
+        input_len + 1,
+        size=num_prompts,
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio),
+        output_len + 1,
+        size=num_prompts,
+    )
+    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
+    input_requests = []
+    for i in range(args.num_prompts):
+        prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
+                                   for j in range(input_lens[i])])
+        input_requests.append(
+            (prompt, int(input_lens[i]), int(output_lens[i])))
+
+    return input_requests
+
+
 async def get_request(
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
@@ -196,6 +220,7 @@ async def get_request(
         if request_rate == float("inf"):
             # If the request rate is infinity, then we don't need to wait.
             continue
+
         # Sample the request interval from the exponential distribution.
         interval = np.random.exponential(1.0 / request_rate)
         # The next request will be sent after the interval.
@@ -219,7 +244,7 @@ def calculate_metrics(
             # We use the tokenizer to count the number of output tokens for all
             # serving backends instead of looking at len(outputs[i].itl) since
             # multiple output tokens may be bundled together
-            # Note: this may inflate the output token count slightly
+            # Note : this may inflate the output token count slightly
             output_len = len(
                 tokenizer(outputs[i].generated_text,
                           add_special_tokens=False).input_ids)
@@ -456,6 +481,15 @@ def main(args: argparse.Namespace):
                               for prompt, prompt_formatted, prompt_len,
                               output_len in input_requests]
 
+    elif args.dataset_name == "random":
+        input_requests = sample_random_requests(
+            input_len=args.input_len,
+            output_len=args.output_len,
+            num_prompts=args.num_prompts,
+            range_ratio=args.range_ratio,
+            tokenizer=tokenizer,
+        )
+
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
 
@@ -549,7 +583,7 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "sonnet"],
+        choices=["sharegpt", "sonnet", "random"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument("--dataset-path",
@@ -566,7 +600,7 @@ if __name__ == "__main__":
         "--tokenizer",
         type=str,
         help=
-        "Name or path of the tokenizer, if not using the default tokenizer.",
+        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
     )
     parser.add_argument(
         "--best-of",
@@ -609,6 +643,27 @@ if __name__ == "__main__":
         help=
         "Number of prefix tokens per request, used only for sonnet dataset.",
     )
+    parser.add_argument(
+        "--random-input-len",
+        type=int,
+        default=1024,
+        help=
+        "Number of input tokens per request, used only for random sampling.",
+    )
+    parser.add_argument(
+        "--random-output-len",
+        type=int,
+        default=128,
+        help=
+        "Number of output tokens per request, used only for random sampling.",
+    )
+    parser.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=1.0,
+        help="Range of sampled ratio of input/output length, "
+        "used only for random sampling.",
+    )
     parser.add_argument(
         "--request-rate",
         type=float,
-- 
GitLab


From abfe705a02160db53f4b0cf90c7b016f04291b9c Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sun, 7 Jul 2024 16:42:11 -0400
Subject: [PATCH 280/376] [ Misc ] Support Fp8 via `llm-compressor` (#6110)

Co-authored-by: Robert Shaw <rshaw@neuralmagic>
---
 ...-3-8B-Instruct-FP8-compressed-tensors.yaml |  11 +
 .../configs/Meta-Llama-3-8B-Instruct-FP8.yaml |   2 +-
 ...3-8B-Instruct-INT8-compressed-tensors.yaml |  11 +
 .../lm-eval-harness/configs/models-small.txt  |   2 +
 .../run-lm-eval-gsm-vllm-baseline.sh          |   2 +-
 .../test_lm_eval_correctness.py               |   3 +-
 tests/quantization/test_compressed_tensors.py |  32 ++-
 .../compressed_tensors/compressed_tensors.py  |  58 +++-
 .../compressed_tensors/schemes/__init__.py    |  27 +-
 .../schemes/compressed_tensors_w8a8.py        | 109 -------
 .../schemes/compressed_tensors_w8a8_fp8.py    |  87 ++++++
 .../schemes/compressed_tensors_w8a8_int8.py   |  85 ++++++
 .../quantization/compressed_tensors/utils.py  |   1 +
 .../model_executor/layers/quantization/fp8.py | 269 +++---------------
 .../layers/quantization/gptq_marlin.py        |  14 +-
 .../layers/quantization/utils/marlin_utils.py |  99 ++++++-
 .../layers/quantization/utils/w8a8_utils.py   | 163 +++++++++++
 17 files changed, 603 insertions(+), 372 deletions(-)
 create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
 delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py
 create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
 create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
 create mode 100644 vllm/model_executor/layers/quantization/utils/w8a8_utils.py

diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
new file mode 100644
index 000000000..e40f42a17
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 250 -f 5 -t 1
+model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.752
+  - name: "exact_match,flexible-extract"
+    value: 0.752
+limit: 250
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
index 02668702b..7a89e8e0c 100644
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
@@ -1,4 +1,4 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
 model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 tasks:
 - name: "gsm8k"
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
new file mode 100644
index 000000000..bc2900298
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
+model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.728
+  - name: "exact_match,flexible-extract"
+    value: 0.728
+limit: 250
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt
index 273c5482d..3300ca64f 100644
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@@ -1,2 +1,4 @@
 Meta-Llama-3-8B-Instruct.yaml
 Meta-Llama-3-8B-Instruct-FP8.yaml
+Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
index a2876bade..933733e9c 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
 done
 
 lm_eval --model vllm \
-  --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE \
+  --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true \
   --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
   --batch_size $BATCH_SIZE
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index 975841dad..7fdce7b53 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -24,7 +24,8 @@ TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
 
 def launch_lm_eval(eval_config):
     model_args = f"pretrained={eval_config['model_name']}," \
-                 f"tensor_parallel_size={TP_SIZE}"
+                 f"tensor_parallel_size={TP_SIZE}," \
+                 f"add_bos_token=true"
 
     results = lm_eval.simple_evaluate(
         model="vllm",
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 4cdda97dc..96223a247 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -9,7 +9,8 @@ import torch
 from vllm import SamplingParams
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
     CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
-    CompressedTensorsW8A8, CompressedTensorsWNA16)
+    CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
+    CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     QuantizationType)
 
@@ -37,12 +38,11 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
                           CompressedTensorsLinearMethod)
         assert isinstance(down_proj.quant_method,
                           CompressedTensorsLinearMethod)
-        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8)
+        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)
 
         assert qkv_proj.scheme.strategy == strategy
         assert qkv_proj.scheme.is_static_input_scheme
-        expected_type = (torch.int8 if quant_type == QuantizationType.INT else
-                         torch.float8_e4m3fn)
+        expected_type = torch.int8
 
         assert qkv_proj.weight.dtype is expected_type
         assert o_proj.weight.dtype is expected_type
@@ -79,7 +79,7 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args):
         qkv_proj = layer.self_attn.qkv_proj
 
         assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
-        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8)
+        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)
         assert not qkv_proj.scheme.is_static_input_scheme
         assert qkv_proj.scheme.strategy == strategy
         assert qkv_proj.weight.dtype is torch.int8
@@ -123,3 +123,25 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner):
         sampling_params = SamplingParams()
         output = llm.generate("Hello world!", sampling_params=sampling_params)
         assert output
+
+
+def test_compressed_tensors_fp8(vllm_runner):
+    model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
+    with vllm_runner(model_path) as llm:
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
+        layer = model.model.layers[0]
+
+        qkv_proj = layer.self_attn.qkv_proj
+
+        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
+        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8)
+        assert qkv_proj.weight.dtype is torch.float8_e4m3fn
+        assert qkv_proj.input_scale.dtype is torch.float32
+        assert qkv_proj.weight_scale.dtype is torch.float32
+        # should be scalars after processing
+        assert len(qkv_proj.input_scale.shape) == 0
+        assert len(qkv_proj.weight_scale.shape) == 0
+
+        sampling_params = SamplingParams()
+        output = llm.generate("Hello world!", sampling_params=sampling_params)
+        assert output
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 8ca486d95..c711fd14c 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -9,10 +9,11 @@ from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS,
     CompressedTensorsScheme, CompressedTensorsW4A16Sparse24,
-    CompressedTensorsW8A8, CompressedTensorsWNA16)
+    CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
+    CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     CompressionFormat, QuantizationArgs, QuantizationStrategy,
-    find_first_name_or_class_match)
+    QuantizationType, find_first_name_or_class_match)
 from vllm.platforms import current_platform
 
 
@@ -117,6 +118,40 @@ class CompressedTensorsConfig(QuantizationConfig):
 
         return is_8_bits and is_token and is_symmetric and is_dynamic
 
+    def _is_fp8_w8a8(self, weight_quant: BaseModel,
+                     input_quant: BaseModel) -> bool:
+        # Confirm weights and activations quantized.
+        if weight_quant is None or input_quant is None:
+            return False
+
+        # Confirm we have floating points.
+        if not (weight_quant.type == QuantizationType.FLOAT
+                and input_quant.type == QuantizationType.FLOAT):
+            return False
+
+        # Confirm weight scheme is supported.
+        is_symmetric_weight = weight_quant.symmetric
+        is_static_weight = not weight_quant.dynamic
+        is_per_tensor_weight = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR)
+        if not (is_symmetric_weight and is_static_weight
+                and is_per_tensor_weight):
+            return False
+
+        # Dynamic quantization is always supported if weights supported.
+        if input_quant.dynamic:
+            return True
+
+        # Confirm activation scheme is supported.
+        is_symmetric_activation = input_quant.symmetric
+        is_per_tensor_activation = (
+            input_quant.strategy == QuantizationStrategy.TENSOR)
+        if not (is_symmetric_activation and is_per_tensor_activation):
+            return False
+
+        # All conditions satisfied.
+        return True
+
     def _is_wNa16_group_channel(self, weight_quant: BaseModel,
                                 input_quant: BaseModel) -> bool:
         input_quant_none = input_quant is None
@@ -147,14 +182,21 @@ class CompressedTensorsConfig(QuantizationConfig):
                     strategy=weight_quant.strategy,
                     group_size=weight_quant.group_size)
 
-        if self.quant_format == CompressionFormat.int_quantized.value:
+        if (self.quant_format == CompressionFormat.int_quantized.value or
+                self.quant_format == CompressionFormat.float_quantized.value):
+            if self._is_fp8_w8a8(weight_quant, input_quant):
+                return CompressedTensorsW8A8Fp8(
+                    input_dynamic=input_quant.dynamic)
+
             if self._is_static_tensor_w8a8(weight_quant, input_quant):
-                return CompressedTensorsW8A8(strategy=weight_quant.strategy,
-                                             is_static_input_scheme=True)
+                return CompressedTensorsW8A8Int8(
+                    strategy=weight_quant.strategy,
+                    is_static_input_scheme=True)
 
             if self._is_dynamic_token_w8a8(weight_quant, input_quant):
-                return CompressedTensorsW8A8(strategy=weight_quant.strategy,
-                                             is_static_input_scheme=False)
+                return CompressedTensorsW8A8Int8(
+                    strategy=weight_quant.strategy,
+                    is_static_input_scheme=False)
 
         raise NotImplementedError(
             "No compressed-tensors compatible scheme was found.")
@@ -187,7 +229,7 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
         self.quantization_config = quantization_config
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        return layer.scheme.process_weights_after_loading(layer)
+        layer.scheme.process_weights_after_loading(layer)
 
     def create_weights(self, layer: torch.nn.Module,
                        input_size_per_partition: int,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
index 720b8c263..dd94c4982 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -1,8 +1,19 @@
-from .compressed_tensors_scheme import CompressedTensorsScheme  # noqa: F401
-from .compressed_tensors_unquantized import (  # noqa: F401
-    CompressedTensorsUnquantized)
-from .compressed_tensors_w4a16_24 import (  # noqa: F401
-    W4A16SPARSE24_SUPPORTED_BITS, CompressedTensorsW4A16Sparse24)
-from .compressed_tensors_w8a8 import CompressedTensorsW8A8  # noqa: F401
-from .compressed_tensors_wNa16 import WNA16_SUPPORTED_BITS  # noqa: F401
-from .compressed_tensors_wNa16 import CompressedTensorsWNA16  # noqa: F401
+from .compressed_tensors_scheme import CompressedTensorsScheme
+from .compressed_tensors_unquantized import CompressedTensorsUnquantized
+from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS,
+                                          CompressedTensorsW4A16Sparse24)
+from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8
+from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8
+from .compressed_tensors_wNa16 import (WNA16_SUPPORTED_BITS,
+                                       CompressedTensorsWNA16)
+
+__all__ = [
+    "CompressedTensorsScheme",
+    "CompressedTensorsUnquantized",
+    "CompressedTensorsWNA16",
+    "CompressedTensorsW4A16Sparse24",
+    "CompressedTensorsW8A8Int8",
+    "CompressedTensorsW8A8Fp8",
+    "WNA16_SUPPORTED_BITS",
+    "W4A16SPARSE24_SUPPORTED_BITS",
+]
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py
deleted file mode 100644
index dffe2a284..000000000
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8.py
+++ /dev/null
@@ -1,109 +0,0 @@
-from typing import Callable, List, Tuple, Union
-
-import torch
-from torch.nn import Parameter
-
-from vllm import _custom_ops as ops
-from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    QuantizationStrategy)
-from vllm.model_executor.utils import set_weight_attrs
-
-
-class CompressedTensorsW8A8(CompressedTensorsScheme):
-
-    def __init__(self, strategy: str, is_static_input_scheme: bool):
-        self.strategy = strategy
-        self.is_static_input_scheme = is_static_input_scheme
-
-    # Cutlass kernels support only per-tensor and per-channel cases.
-    # So if we have a fused module (QKV, MLP) with per tensor scales (thus N
-    # scales being passed to the kernel), we convert to the per-channel case.
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        if (self.strategy == QuantizationStrategy.TENSOR
-                and len(self.logical_widths) > 1):
-
-            # Load the N per-tensor scales into the channelwise buffer.
-            weight_scale_channel = torch.empty(
-                (sum(self.logical_widths), 1),
-                dtype=torch.float32,
-                device=layer.weight_scale.device)
-            start = 0
-            for idx, logical_width in enumerate(self.logical_widths):
-                end = start + logical_width
-                weight_scale_channel[start:end, :] = layer.weight_scale[idx]
-                start = end
-
-            layer.weight_scale = Parameter(weight_scale_channel,
-                                           requires_grad=False)
-
-        # transpose weights for cutlass.
-        weight = layer.weight
-        layer.weight = Parameter(weight.t(), requires_grad=False)
-
-    def create_weights(self, layer: torch.nn.Module,
-                       output_partition_sizes: List[int],
-                       input_size_per_partition: int,
-                       params_dtype: torch.dtype, weight_loader: Callable,
-                       **kwargs):
-        self.logical_widths = output_partition_sizes
-
-        # WEIGHT SCALE
-        shape: Union[Tuple[int], Tuple[int, int]]
-        if self.strategy == QuantizationStrategy.CHANNEL:
-            shape = (sum(self.logical_widths), 1)
-        else:
-            shape = (len(self.logical_widths), )
-
-        weight_scale = Parameter(torch.empty(*shape, dtype=torch.float32),
-                                 requires_grad=False)
-        layer.register_parameter("weight_scale", weight_scale)
-        if self.strategy == QuantizationStrategy.CHANNEL:
-            set_weight_attrs(weight_scale, {
-                "weight_loader": weight_loader,
-                "output_dim": 0,
-            })
-        else:
-            set_weight_attrs(weight_scale, {
-                "weight_loader": weight_loader,
-                "needs_scalar_to_array": True,
-            })
-
-        # WEIGHT
-        weight = Parameter(torch.empty(sum(output_partition_sizes),
-                                       input_size_per_partition,
-                                       dtype=torch.int8),
-                           requires_grad=False)
-        layer.register_parameter("weight", weight)
-        set_weight_attrs(weight, {
-            "input_dim": 1,
-            "output_dim": 0,
-            "weight_loader": weight_loader,
-        })
-
-        # INPUT SCALE
-        # Static quantization:  load from disk.
-        if self.is_static_input_scheme:
-            input_scale = Parameter(torch.empty(1, dtype=torch.float32),
-                                    requires_grad=False)
-            layer.register_parameter("input_scale", input_scale)
-            set_weight_attrs(input_scale, {
-                "weight_loader": weight_loader,
-                "ignore_warning": True,
-            })
-        # Dynamic quantization: set to None.
-        else:
-            layer.input_scale = None
-
-    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
-        # ops.scaled_int8_quant supports both dynamic and static quant.
-        # * dynamic, layer.input_scale is None and x_scale computed from x.
-        # * static, layer.input_scale is scalar and x_scale is input_scale.
-        x_q, x_scale = ops.scaled_int8_quant(x, layer.input_scale)
-
-        return ops.cutlass_scaled_mm(x_q,
-                                     layer.weight,
-                                     scale_a=x_scale,
-                                     scale_b=layer.weight_scale,
-                                     out_dtype=x.dtype)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
new file mode 100644
index 000000000..b93425fb2
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -0,0 +1,87 @@
+from typing import Callable, List, Optional
+
+import torch
+
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    apply_fp8_linear, create_per_tensor_scale_param, cutlass_fp8_supported,
+    requantize_with_max_scale)
+from vllm.model_executor.utils import set_weight_attrs
+
+__all__ = ["CompressedTensorsW8A8Fp8"]
+
+
+class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
+
+    def __init__(self, input_dynamic: bool):
+        self.input_dynamic = input_dynamic
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
+
+    # W8A8-Fp8 kernels support only per-tensor and per-channel cases.
+    # So if we have a fused module (QKV, MLP) with per tensor scales (thus N
+    # scales being passed to the kernel), we requantize with a single scale.
+    def process_weights_after_loading(self, layer) -> None:
+        # Dequant -> Quant with max scale.
+        max_w_scale, weight = requantize_with_max_scale(
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            logical_widths=layer.logical_widths,
+        )
+
+        # Update layer with new values.
+        layer.weight = torch.nn.Parameter(weight.t(), requires_grad=False)
+        layer.weight_scale = torch.nn.Parameter(max_w_scale,
+                                                requires_grad=False)
+        if self.input_dynamic:
+            layer.input_scale = None
+        else:
+            layer.input_scale = torch.nn.Parameter(layer.input_scale.max(),
+                                                   requires_grad=False)
+
+    def create_weights(self, layer: torch.nn.Module,
+                       output_partition_sizes: List[int],
+                       input_size_per_partition: int,
+                       params_dtype: torch.dtype, weight_loader: Callable,
+                       **kwargs):
+
+        del params_dtype
+
+        output_size_per_partition = sum(output_partition_sizes)
+        layer.logical_widths = output_partition_sizes
+
+        # WEIGHT
+        weight = torch.nn.Parameter(torch.empty(output_size_per_partition,
+                                                input_size_per_partition,
+                                                dtype=torch.float8_e4m3fn),
+                                    requires_grad=False)
+        layer.register_parameter("weight", weight)
+        set_weight_attrs(weight, {
+            "input_dim": 1,
+            "output_dim": 0,
+            "weight_loader": weight_loader,
+        })
+
+        # WEIGHT SCALE
+        weight_scale = create_per_tensor_scale_param(
+            output_partition_sizes, weight_loader=weight_loader)
+        layer.register_parameter("weight_scale", weight_scale)
+
+        # INPUT SCALE
+        if not self.input_dynamic:
+            input_scale = create_per_tensor_scale_param(
+                output_partition_sizes, weight_loader=weight_loader)
+            layer.register_parameter("input_scale", input_scale)
+
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+
+        return apply_fp8_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            input_scale=layer.input_scale,
+            bias=bias,
+            cutlass_fp8_supported=self.cutlass_fp8_supported)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
new file mode 100644
index 000000000..e70504ec5
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -0,0 +1,85 @@
+from typing import Callable, List
+
+import torch
+from torch.nn import Parameter
+
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    QuantizationStrategy)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    apply_int8_linear, convert_to_channelwise, create_per_channel_scale_param,
+    create_per_tensor_scale_param)
+from vllm.model_executor.utils import set_weight_attrs
+
+
+class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
+
+    def __init__(self, strategy: str, is_static_input_scheme: bool):
+        self.strategy = strategy
+        self.is_static_input_scheme = is_static_input_scheme
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # WEIGHT
+        # Cutlass kernels need transposed weight.
+        weight = layer.weight
+        layer.weight = Parameter(weight.t(), requires_grad=False)
+
+        # WEIGHT SCALE
+        # Cutlass kernels support only per-tensor and per-channel.
+        # If we have a fused module (QKV, MLP) with per tensor scales (thus N
+        # scales being passed to the kernel), convert to the per-channel case.
+        is_fused_module = len(self.logical_widths) > 1
+        if is_fused_module and self.strategy == QuantizationStrategy.TENSOR:
+            ws_channelwise = convert_to_channelwise(layer.weight_scale,
+                                                    self.logical_widths)
+            layer.weight_scale = Parameter(ws_channelwise, requires_grad=False)
+
+        # INPUT SCALE
+        if self.is_static_input_scheme:
+            layer.input_scale = Parameter(layer.input_scale.max(),
+                                          requires_grad=False)
+        else:
+            layer.input_scale = None
+
+    def create_weights(self, layer: torch.nn.Module,
+                       output_partition_sizes: List[int],
+                       input_size_per_partition: int,
+                       params_dtype: torch.dtype, weight_loader: Callable,
+                       **kwargs):
+        self.logical_widths = output_partition_sizes
+
+        # WEIGHT
+        weight = Parameter(torch.empty(sum(output_partition_sizes),
+                                       input_size_per_partition,
+                                       dtype=torch.int8),
+                           requires_grad=False)
+        layer.register_parameter("weight", weight)
+        set_weight_attrs(weight, {
+            "input_dim": 1,
+            "output_dim": 0,
+            "weight_loader": weight_loader,
+        })
+
+        # WEIGHT SCALE
+        layer_kwargs = {"weight_loader": weight_loader}
+        if self.strategy == QuantizationStrategy.CHANNEL:
+            scale = create_per_channel_scale_param(output_partition_sizes,
+                                                   **layer_kwargs)
+        else:
+            assert self.strategy == QuantizationStrategy.TENSOR
+            scale = create_per_tensor_scale_param(output_partition_sizes,
+                                                  **layer_kwargs)
+        layer.register_parameter("weight_scale", scale)
+
+        # INPUT SCALE
+        if self.is_static_input_scheme:
+            scale = create_per_tensor_scale_param(output_partition_sizes,
+                                                  **layer_kwargs)
+            layer.register_parameter("input_scale", scale)
+
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
+        return apply_int8_linear(input=x,
+                                 weight=layer.weight,
+                                 weight_scale=layer.weight_scale,
+                                 input_scale=layer.input_scale)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index b2bec9b60..5b44c2155 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -9,6 +9,7 @@ from torch.nn import Module
 class CompressionFormat(Enum):
     dense = "dense"
     sparse_bitmask = "sparse-bitmask"
+    float_quantized = "float-quantized"
     int_quantized = "int-quantized"
     pack_quantized = "pack-quantized"
     marlin_24 = "marlin-24"
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 544774891..8dba9019f 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional
 
 import torch
 from torch.nn import Module
@@ -11,11 +11,11 @@ from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
-from vllm.model_executor.layers.quantization.gptq_marlin import (
-    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, GPTQMarlinState,
-    marlin_permute_scales)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    pack_fp8_to_int32)
+    apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    all_close_1d, apply_fp8_linear, create_per_tensor_scale_param,
+    cutlass_fp8_supported, per_tensor_dequantize, requantize_with_max_scale)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.utils import print_warning_once
@@ -25,13 +25,6 @@ ACTIVATION_SCHEMES = ["static", "dynamic"]
 logger = init_logger(__name__)
 
 
-def cutlass_fp8_supported() -> bool:
-    capability = current_platform.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-
-    return ops.cutlass_scaled_mm_supports_fp8(capability)
-
-
 class Fp8Config(QuantizationConfig):
     """Config class for FP8."""
 
@@ -117,23 +110,6 @@ class Fp8LinearMethod(LinearMethodBase):
         capability = capability[0] * 10 + capability[1]
         self.use_marlin = capability < 89
 
-    def _create_scale_param(
-        self,
-        scale_name: str,
-        layer: torch.nn.Module,
-        output_partition_sizes: List[int],
-        **extra_weight_attrs,
-    ) -> None:
-        scale = Parameter(torch.empty(len(output_partition_sizes),
-                                      dtype=torch.float32),
-                          requires_grad=False)
-        scale[:] = torch.finfo(torch.float8_e4m3fn).min
-        layer.register_parameter(scale_name, scale)
-        set_weight_attrs(scale, {
-            **extra_weight_attrs,
-            "needs_scalar_to_array": True,
-        })
-
     def create_weights(
         self,
         layer: torch.nn.Module,
@@ -147,7 +123,6 @@ class Fp8LinearMethod(LinearMethodBase):
         del input_size, output_size
         output_size_per_partition = sum(output_partition_sizes)
 
-        layer.process_after_load = True
         layer.logical_widths = output_partition_sizes
 
         layer.input_size_per_partition = input_size_per_partition
@@ -173,144 +148,50 @@ class Fp8LinearMethod(LinearMethodBase):
         # Otherwise, wait until process_weights_after_loading.
         if self.quant_config.is_checkpoint_fp8_serialized:
             # WEIGHT SCALE
-            self._create_scale_param(
-                scale_name="weight_scale",
-                layer=layer,
-                output_partition_sizes=output_partition_sizes,
-                **extra_weight_attrs)
+            scale = create_per_tensor_scale_param(output_partition_sizes,
+                                                  **extra_weight_attrs)
+            layer.register_parameter("weight_scale", scale)
 
             # INPUT ACTIVATION SCALE
             if self.quant_config.activation_scheme == "static":
-                self._create_scale_param(
-                    scale_name="input_scale",
-                    layer=layer,
-                    output_partition_sizes=output_partition_sizes,
-                    **extra_weight_attrs)
-
-        # For GPUs without FP8 hardware support, we use Marlin for fast
-        # fused dequantization
-        if self.use_marlin:
-            layer.marlin_state = GPTQMarlinState.REPACK
-
-    def prepare_layer_for_marlin(self, layer: Module) -> None:
-        print_warning_once(
-            "Your GPU does not have native support for FP8 computation but "
-            "FP8 quantization is being used. Weight-only FP8 compression will "
-            "be used leveraging the Marlin kernel. This may degrade "
-            "performance for compute-heavy workloads.")
-
-        part_size_n = layer.output_size_per_partition
-        part_size_k = layer.input_size_per_partition
-
-        assert layer.marlin_state == GPTQMarlinState.REPACK
-        layer.marlin_state = GPTQMarlinState.READY
-
-        device = layer.weight.device
-
-        # WEIGHTS
-        # Repack weights to gptq format (packed int32 elements)
-        packed_gptq_qweight = pack_fp8_to_int32(layer.weight)
-
-        # Repack weights to marlin format
-        marlin_qweight = ops.gptq_marlin_repack(
-            b_q_weight=packed_gptq_qweight,
-            perm=torch.empty(0, dtype=torch.int, device=device),
-            size_k=part_size_k,
-            size_n=part_size_n,
-            num_bits=8,
-        )
-        layer.weight = Parameter(marlin_qweight, requires_grad=False)
-
-        # WEIGHT SCALES
-        # Currently Marlin doesn't support per-tensor scales, so we
-        # expand it to channelwise
-        scales = layer.weight_scale.repeat(1, part_size_n).to(
-            layer.orig_dtype).to(device)
-        # Permute scales
-        marlin_scales = marlin_permute_scales(
-            s=scales,
-            size_k=part_size_k,
-            size_n=part_size_n,
-            group_size=-1,
-            num_bits=8,
-        )
-        layer.weight_scale = Parameter(marlin_scales, requires_grad=False)
-
-        # Allocate marlin workspace
-        max_workspace_size = (
-            part_size_n // GPTQ_MARLIN_MIN_THREAD_N) * GPTQ_MARLIN_MAX_PARALLEL
-        workspace = torch.zeros(max_workspace_size,
-                                dtype=torch.int,
-                                device=device,
-                                requires_grad=False)
-
-        layer.workspace = workspace
+                scale = create_per_tensor_scale_param(output_partition_sizes,
+                                                      **extra_weight_attrs)
+                layer.register_parameter("input_scale", scale)
 
     def process_weights_after_loading(self, layer: Module) -> None:
-        if (not hasattr(layer, "process_after_load")
-                or not layer.process_after_load):
-            return
-
-        # If checkpoint is fp/bf16 (not serialized fp8), quantize the weights.
+        # If checkpoint not serialized fp8, quantize the weights.
         if not self.quant_config.is_checkpoint_fp8_serialized:
             qweight, weight_scale = ops.scaled_fp8_quant(layer.weight,
                                                          scale=None)
+
+            # Update the layer with the new values.
             layer.weight = Parameter(qweight.t(), requires_grad=False)
             layer.weight_scale = Parameter(weight_scale, requires_grad=False)
-            layer.logical_widths = None
             layer.input_scale = None
-            if self.use_marlin:
-                self.prepare_layer_for_marlin(layer)
-            return
 
         # If checkpoint is fp8, requantize the separately quantized logical
         # weights into a single fp8 weight with a single weight scale.
         else:
-            # WEIGHT_SCALE / WEIGHT
-            #   Loop over logical weights, requantizing with single scale.
-            max_w_scale = layer.weight_scale.max()
-
-            # QKV / MLP is fused in the on disk checkpoint if any of the
-            # weight scales are still set to the default since we initialize
-            # N weight scales for N shards but we only load 1 weight scale
-            # from disk in this case. As a result, we skip dequant -> requant
-            # since we already have quantized QKV together.
-            # Sample Model with fused checkpoint:
-            #   * nm-testing/Phi-3-mini-128k-instruct-FP8
-            unfused_module_in_checkpoint = (
-                layer.weight_scale[-1] > torch.finfo(torch.float8_e4m3fn).min)
-
-            if unfused_module_in_checkpoint:
-                start = 0
-                for idx, logical_width in enumerate(layer.logical_widths):
-                    end = start + logical_width
-                    weight_dq = per_tensor_dequantize(
-                        layer.weight[start:end, :], layer.weight_scale[idx])
-
-                    layer.weight[start:end, :] = per_tensor_quantize(
-                        weight_dq, layer.weight_scale.max())
-                    start = end
-            layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
+            # Dequant -> Quant with max scale.
+            max_w_scale, weight = requantize_with_max_scale(
+                weight=layer.weight,
+                weight_scale=layer.weight_scale,
+                logical_widths=layer.logical_widths,
+            )
 
-            # WEIGHT
-            #   Transpose weight for passing to torch._scaled_mm
-            weight = layer.weight
+            # Update layer with new values.
             layer.weight = Parameter(weight.t(), requires_grad=False)
-
-            # INPUT ACTIVATION SCALE
-            #   Dynamic: set to None (required input to ops.scaled_fp8_quant).
-            #   Static:  set to max of the input_scales (since they are equal).
-            if self.quant_config.activation_scheme == "dynamic":
-                layer.input_scale = None
-            elif self.quant_config.activation_scheme == "static":
+            layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
+            if self.quant_config.activation_scheme == "static":
                 layer.input_scale = Parameter(layer.input_scale.max(),
                                               requires_grad=False)
             else:
-                raise ValueError(
-                    f"Unknown scheme {self.quant_config.activation_scheme}")
+                layer.input_scale = None
 
-            if self.use_marlin:
-                self.prepare_layer_for_marlin(layer)
+        if self.use_marlin:
+            prepare_fp8_layer_for_marlin(layer)
+            # Activations not quantized for marlin.
+            del layer.input_scale
 
     def apply(self,
               layer: torch.nn.Module,
@@ -318,65 +199,22 @@ class Fp8LinearMethod(LinearMethodBase):
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
 
         if self.use_marlin:
-            # For GPUs that lack FP8 hardware support, we can leverage the
-            # Marlin kernel for fast weight-only FP8 quantization
-
-            reshaped_x = x.reshape(-1, x.shape[-1])
-            out_shape = x.shape[:-1] + (layer.output_size_per_partition, )
-
-            output = ops.fp8_marlin_gemm(
-                a=reshaped_x,
-                b_q_weight=layer.weight,
-                b_scales=layer.weight_scale,
+            return apply_fp8_marlin_linear(
+                input=x,
+                weight=layer.weight,
+                weight_scale=layer.weight_scale,
                 workspace=layer.workspace,
-                num_bits=8,
-                size_m=reshaped_x.shape[0],
                 size_n=layer.output_size_per_partition,
                 size_k=layer.input_size_per_partition,
-            )
-
-            if bias is not None:
-                output.add_(bias)  # In-place add
-
-            return output.reshape(out_shape)
-
-        else:
-
-            # ops.scaled_fp8_quant supports both dynamic and static quant.
-            # If dynamic, layer.input_scale is None and x_scale computed from x
-            # If static, layer.input_scale is scalar and x_scale is input_scale
+                bias=bias)
 
-            if bias is None and self.cutlass_fp8_supported:
-                qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale)
-
-                # Fused GEMM_DQ
-                output = ops.cutlass_scaled_mm(
-                    qinput,
-                    layer.weight,
-                    out_dtype=x.dtype,
-                    scale_a=x_scale,
-                    scale_b=layer.weight_scale,
-                )
-
-            else:
-                qinput, x_scale = ops.scaled_fp8_quant(x,
-                                                       layer.input_scale,
-                                                       batch_dim_padding=17)
-
-                # Fused GEMM_DQ -- note we padded the input above because
-                # torch._scaled_mm is more performant for matrices with
-                # batch dimension > 16. Note that this could change
-                # in the future.
-                output, _ = torch._scaled_mm(
-                    qinput,
-                    layer.weight,
-                    out_dtype=x.dtype,
-                    scale_a=x_scale,
-                    scale_b=layer.weight_scale,
-                    bias=bias,
-                )
-
-        return torch.narrow(output, 0, 0, x.shape[0])
+        return apply_fp8_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            input_scale=layer.input_scale,
+            bias=bias,
+            cutlass_fp8_supported=self.cutlass_fp8_supported)
 
 
 class Fp8MoEMethod(FusedMoEMethodBase):
@@ -399,8 +237,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                        intermediate_size: int, params_dtype: torch.dtype,
                        **extra_weight_attrs):
 
-        layer.process_after_load = True
-
         if self.quant_config.is_checkpoint_fp8_serialized:
             params_dtype = torch.float8_e4m3fn
 
@@ -465,9 +301,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             layer.a2_scale = None
 
     def process_weights_after_loading(self, layer: Module) -> None:
-        if (not hasattr(layer, "process_after_load")
-                or not layer.process_after_load):
-            return
 
         # If checkpoint is fp16, quantize in place.
         if not self.quant_config.is_checkpoint_fp8_serialized:
@@ -531,7 +364,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                                                     shard_size, :],
                         layer.w13_scale[expert_id][shard_id])
                     layer.w13_weight[expert_id][
-                        start:start + shard_size, :] = per_tensor_quantize(
+                        start:start + shard_size, :], _ = ops.scaled_fp8_quant(
                             dq_weight, max_w13_scales[expert_id])
                     start += shard_size
 
@@ -596,23 +429,3 @@ class Fp8KVCacheMethod(QuantizeMethodBase):
                     "cause accuracy issues. Please make sure kv-cache scaling "
                     "factor is available in the fp8 checkpoint.")
         del layer.kv_scale
-
-
-def per_tensor_quantize(tensor: torch.Tensor,
-                        inv_scale: Union[float, torch.Tensor]) -> torch.Tensor:
-    finfo = torch.finfo(torch.float8_e4m3fn)
-    qweight = (tensor / inv_scale).clamp(min=finfo.min, max=finfo.max)
-    return qweight.to(torch.float8_e4m3fn)
-
-
-def per_tensor_dequantize(
-        tensor: torch.Tensor, inv_scale: Union[float,
-                                               torch.Tensor]) -> torch.Tensor:
-    fake_qweight = tensor.to(torch.float16)
-    dq_weight = fake_qweight * inv_scale
-    return dq_weight
-
-
-def all_close_1d(x: torch.Tensor) -> bool:
-    assert len(x.shape) == 1
-    return all(torch.allclose(x[0], x[i]) for i in range(x.shape[0]))
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index a6284d0ed..6b971f73d 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -11,20 +11,16 @@ from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
+from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_K,
+    GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_SUPPORTED_GROUP_SIZES,
+    GPTQ_MARLIN_SUPPORTED_NUM_BITS, GPTQ_MARLIN_SUPPORTED_SYM,
+    GPTQ_MARLIN_TILE)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
 
-GPTQ_MARLIN_TILE = 16
-GPTQ_MARLIN_MIN_THREAD_N = 64
-GPTQ_MARLIN_MIN_THREAD_K = 128
-GPTQ_MARLIN_MAX_PARALLEL = 16
-
-GPTQ_MARLIN_SUPPORTED_NUM_BITS = [4, 8]
-GPTQ_MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
-GPTQ_MARLIN_SUPPORTED_SYM = [True]
-
 
 # Permutations for Marlin scale shuffling
 def get_scale_perms(num_bits: int):
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index 66ce19592..988624526 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -1,9 +1,11 @@
 """This file is used for /tests and /benchmarks"""
 import random
+from typing import Optional
 
 import numpy
 import torch
 
+from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.format_24 import (
     mask_creator, sparse_semi_structured_from_dense_cutlass)
 from vllm.model_executor.layers.quantization.utils.marlin_24_perms import (
@@ -13,8 +15,16 @@ from vllm.model_executor.layers.quantization.utils.marlin_perms import (
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     get_pack_factor, quantize_weights, sort_weights)
 from vllm.platforms import current_platform
+from vllm.utils import print_warning_once
 
-MARLIN_TILE = 16
+GPTQ_MARLIN_TILE = 16
+GPTQ_MARLIN_MIN_THREAD_N = 64
+GPTQ_MARLIN_MIN_THREAD_K = 128
+GPTQ_MARLIN_MAX_PARALLEL = 16
+
+GPTQ_MARLIN_SUPPORTED_NUM_BITS = [4, 8]
+GPTQ_MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
+GPTQ_MARLIN_SUPPORTED_SYM = [True]
 
 
 def is_marlin_supported():
@@ -22,7 +32,92 @@ def is_marlin_supported():
     return capability[0] >= 8
 
 
-def marlin_permute_weights(q_w, size_k, size_n, perm, tile=MARLIN_TILE):
+def apply_fp8_marlin_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    workspace: torch.Tensor,
+    size_n: int,
+    size_k: int,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+    # For GPUs that lack FP8 hardware support, we can leverage the
+    # Marlin kernel for fast weight-only FP8 quantization
+
+    reshaped_x = input.reshape(-1, input.shape[-1])
+    out_shape = input.shape[:-1] + (size_n, )
+
+    output = ops.fp8_marlin_gemm(
+        a=reshaped_x,
+        b_q_weight=weight,
+        b_scales=weight_scale,
+        workspace=workspace,
+        num_bits=8,
+        size_m=reshaped_x.shape[0],
+        size_n=size_n,
+        size_k=size_k,
+    )
+
+    if bias is not None:
+        output.add_(bias)  # In-place add
+
+    return output.reshape(out_shape)
+
+
+def prepare_fp8_layer_for_marlin(layer: torch.nn.Module) -> None:
+    print_warning_once(
+        "Your GPU does not have native support for FP8 computation but "
+        "FP8 quantization is being used. Weight-only FP8 compression will "
+        "be used leveraging the Marlin kernel. This may degrade "
+        "performance for compute-heavy workloads.")
+
+    part_size_n = layer.output_size_per_partition
+    part_size_k = layer.input_size_per_partition
+
+    device = layer.weight.device
+
+    # WEIGHTS
+    # Repack weights to gptq format (packed int32 elements)
+    packed_gptq_qweight = pack_fp8_to_int32(layer.weight)
+
+    # Repack weights to marlin format
+    marlin_qweight = ops.gptq_marlin_repack(
+        b_q_weight=packed_gptq_qweight,
+        perm=torch.empty(0, dtype=torch.int, device=device),
+        size_k=part_size_k,
+        size_n=part_size_n,
+        num_bits=8,
+    )
+    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
+
+    # WEIGHT SCALES
+    # Currently Marlin doesn't support per-tensor scales, so we
+    # expand it to channelwise
+    scales = layer.weight_scale.repeat(1, part_size_n).to(
+        layer.orig_dtype).to(device)
+    # Permute scales
+    num_bits = 8
+    marlin_scales = marlin_permute_scales(
+        s=scales,
+        size_k=part_size_k,
+        size_n=part_size_n,
+        group_size=-1,
+        scale_perm=marlin_scale_perm[num_bits],
+        scale_perm_single=marlin_scale_perm_single[num_bits])
+    layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False)
+
+    # Allocate marlin workspace
+    max_workspace_size = (part_size_n //
+                          GPTQ_MARLIN_MIN_THREAD_N) * GPTQ_MARLIN_MAX_PARALLEL
+    workspace = torch.zeros(max_workspace_size,
+                            dtype=torch.int,
+                            device=device,
+                            requires_grad=False)
+
+    layer.workspace = workspace
+
+
+def marlin_permute_weights(q_w, size_k, size_n, perm, tile=GPTQ_MARLIN_TILE):
     assert q_w.shape == (size_k, size_n)
     assert size_k % tile == 0, f"size_k = {size_k}, tile = {tile}"
     assert size_n % tile == 0, f"size_k = {size_n}, tile = {tile}"
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
new file mode 100644
index 000000000..81b7fdb78
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -0,0 +1,163 @@
+from typing import List, Optional, Tuple, Union
+
+import torch
+from torch.nn import Parameter
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
+
+
+def cutlass_fp8_supported() -> bool:
+    capability = current_platform.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+
+    return ops.cutlass_scaled_mm_supports_fp8(capability)
+
+
+def per_tensor_dequantize(
+        tensor: torch.Tensor, inv_scale: Union[float,
+                                               torch.Tensor]) -> torch.Tensor:
+    fake_qweight = tensor.to(torch.float16)
+    dq_weight = fake_qweight * inv_scale
+    return dq_weight
+
+
+def all_close_1d(x: torch.Tensor) -> bool:
+    assert len(x.shape) == 1
+    return all(torch.allclose(x[0], x[i]) for i in range(x.shape[0]))
+
+
+def create_per_tensor_scale_param(
+    output_partition_sizes: List[int],
+    **extra_weight_attrs,
+) -> Parameter:
+    scale = Parameter(torch.empty(len(output_partition_sizes),
+                                  dtype=torch.float32),
+                      requires_grad=False)
+    scale[:] = torch.finfo(torch.float32).min
+    set_weight_attrs(scale, {
+        "needs_scalar_to_array": True,
+        **extra_weight_attrs
+    })
+    return scale
+
+
+def create_per_channel_scale_param(output_partition_sizes: List[int],
+                                   **extra_weight_attrs) -> Parameter:
+    scale = Parameter(torch.empty((sum(output_partition_sizes), 1),
+                                  dtype=torch.float32),
+                      requires_grad=False)
+    scale[:] = torch.finfo(torch.float32).min
+    set_weight_attrs(scale, {"output_dim": 0, **extra_weight_attrs})
+    return scale
+
+
+def convert_to_channelwise(
+        weight_scale: torch.Tensor,
+        logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]:
+    # Create channelwise buffer
+    weight_scale_channel = torch.empty((sum(logical_widths), 1),
+                                       dtype=torch.float32,
+                                       device=weight_scale.device)
+
+    # Expand each scale to match the size of each logical matrix.
+    start = 0
+    for idx, logical_width in enumerate(logical_widths):
+        end = start + logical_width
+        weight_scale_channel[start:end, :] = weight_scale[idx]
+        start = end
+
+    return weight_scale_channel
+
+
+def requantize_with_max_scale(
+        weight: torch.Tensor, weight_scale: torch.Tensor,
+        logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]:
+    # Max scale to be used for requanitzation.
+    max_w_scale = weight_scale.max()
+
+    # QKV / MLP is fused in the on disk checkpoint if any of the
+    # weight scales are still set to the default since we initialize
+    # N weight scales for N shards but we only load 1 weight scale
+    # from disk in this case. Skip requantization in this case (since)
+    # we already are quantized with the single scale.
+    # * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8
+    unfused_module_in_checkpoint = (weight_scale[-1] > torch.finfo(
+        torch.float8_e4m3fn).min)
+
+    # If unfused checkpoint, need requanize with the single scale.
+    if unfused_module_in_checkpoint:
+        start = 0
+        for idx, logical_width in enumerate(logical_widths):
+            end = start + logical_width
+            weight_dq = per_tensor_dequantize(weight[start:end, :],
+                                              weight_scale[idx])
+            weight[start:end, :], _ = ops.scaled_fp8_quant(
+                weight_dq, max_w_scale)
+            start = end
+
+    return max_w_scale, weight
+
+
+def apply_fp8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    input_scale: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    cutlass_fp8_supported: bool = True,
+) -> torch.Tensor:
+    # ops.scaled_fp8_quant supports both dynamic and static quant.
+    #   If dynamic, layer.input_scale is None and x_scale computed from x.
+    #   If static, layer.input_scale is scalar and x_scale is input_scale.
+
+    if bias is None and cutlass_fp8_supported:
+        qinput, x_scale = ops.scaled_fp8_quant(input, input_scale)
+
+        # Fused GEMM_DQ
+        output = ops.cutlass_scaled_mm(qinput,
+                                       weight,
+                                       out_dtype=input.dtype,
+                                       scale_a=x_scale,
+                                       scale_b=weight_scale)
+
+    else:
+        qinput, x_scale = ops.scaled_fp8_quant(input,
+                                               input_scale,
+                                               batch_dim_padding=17)
+
+        # Fused GEMM_DQ -- note we padded the input above because
+        # torch._scaled_mm is more performant for matrices with
+        # batch dimension > 16. Note that this could change
+        # in the future.
+        output, _ = torch._scaled_mm(qinput,
+                                     weight,
+                                     out_dtype=input.dtype,
+                                     scale_a=x_scale,
+                                     scale_b=weight_scale,
+                                     bias=bias)
+
+    return torch.narrow(output, 0, 0, input.shape[0])
+
+
+def apply_int8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    input_scale: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+):
+    if bias is not None:
+        raise NotImplementedError("W8A8 with int8 does not yet support bias.")
+
+    # ops.scaled_int8_quant supports both dynamic and static quant.
+    # * dynamic, layer.input_scale is None and x_scale computed from x.
+    # * static, layer.input_scale is scalar and x_scale is input_scale.
+    x_q, x_scale = ops.scaled_int8_quant(input, input_scale)
+
+    return ops.cutlass_scaled_mm(x_q,
+                                 weight,
+                                 scale_a=x_scale,
+                                 scale_b=weight_scale,
+                                 out_dtype=input.dtype)
-- 
GitLab


From 3b08fe2b13ced7fe76abe17c99614dd36e4b4788 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 7 Jul 2024 15:11:12 -0700
Subject: [PATCH 281/376] [misc][frontend] log all available endpoints (#6195)

Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
---
 vllm/entrypoints/api_server.py        | 11 +++++++++++
 vllm/entrypoints/openai/api_server.py |  8 ++++++++
 2 files changed, 19 insertions(+)

diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py
index feb904c5a..66941442c 100644
--- a/vllm/entrypoints/api_server.py
+++ b/vllm/entrypoints/api_server.py
@@ -16,10 +16,13 @@ from fastapi.responses import JSONResponse, Response, StreamingResponse
 
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.logger import init_logger
 from vllm.sampling_params import SamplingParams
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser, random_uuid
 
+logger = init_logger("vllm.entrypoints.api_server")
+
 TIMEOUT_KEEP_ALIVE = 5  # seconds.
 app = FastAPI()
 engine = None
@@ -107,6 +110,14 @@ if __name__ == "__main__":
         engine_args, usage_context=UsageContext.API_SERVER)
 
     app.root_path = args.root_path
+
+    logger.info("Available routes are:")
+    for route in app.routes:
+        if not hasattr(route, 'methods'):
+            continue
+        methods = ', '.join(route.methods)
+        logger.info("Route: %s, Methods: %s", route.path, methods)
+
     uvicorn.run(app,
                 host=args.host,
                 port=args.port,
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 76879c96c..d3ed1ec7a 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -240,6 +240,14 @@ if __name__ == "__main__":
     openai_serving_embedding = OpenAIServingEmbedding(engine, model_config,
                                                       served_model_names)
     app.root_path = args.root_path
+
+    logger.info("Available routes are:")
+    for route in app.routes:
+        if not hasattr(route, 'methods'):
+            continue
+        methods = ', '.join(route.methods)
+        logger.info("Route: %s, Methods: %s", route.path, methods)
+
     uvicorn.run(app,
                 host=args.host,
                 port=args.port,
-- 
GitLab


From 16620f439db1f2cc91b5582b59fc8845cbb02881 Mon Sep 17 00:00:00 2001
From: kczimm <4733573+kczimm@users.noreply.github.com>
Date: Sun, 7 Jul 2024 21:32:57 -0500
Subject: [PATCH 282/376] do not exclude `object` field in
 CompletionStreamResponse (#6196)

---
 vllm/entrypoints/openai/serving_completion.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 55cd01579..9c719d634 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -301,7 +301,7 @@ class OpenAIServingCompletion(OpenAIServing):
                         else:
                             chunk.usage = None
 
-                    response_json = chunk.model_dump_json(exclude_unset=True)
+                    response_json = chunk.model_dump_json(exclude_unset=False)
                     yield f"data: {response_json}\n\n"
 
             if (request.stream_options
@@ -314,7 +314,7 @@ class OpenAIServingCompletion(OpenAIServing):
                     usage=usage,
                 )
                 final_usage_data = (final_usage_chunk.model_dump_json(
-                    exclude_unset=True, exclude_none=True))
+                    exclude_unset=False, exclude_none=True))
                 yield f"data: {final_usage_data}\n\n"
 
         except ValueError as e:
-- 
GitLab


From 717f4bcea036a049e86802b3a05dd6f7cd17efc8 Mon Sep 17 00:00:00 2001
From: Haichuan <1778876540@qq.com>
Date: Mon, 8 Jul 2024 15:52:06 +0800
Subject: [PATCH 283/376] Feature/add benchmark testing (#5947)

Co-authored-by: Roger Wang <ywang@roblox.com>
---
 benchmarks/benchmark_serving.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index dbcb9743b..7ba977141 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -483,10 +483,10 @@ def main(args: argparse.Namespace):
 
     elif args.dataset_name == "random":
         input_requests = sample_random_requests(
-            input_len=args.input_len,
-            output_len=args.output_len,
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
             num_prompts=args.num_prompts,
-            range_ratio=args.range_ratio,
+            range_ratio=args.random_range_ratio,
             tokenizer=tokenizer,
         )
 
-- 
GitLab


From f7a8fa39d828136d0b8bbdd20c262602e5543ffd Mon Sep 17 00:00:00 2001
From: Avshalom Manevich <12231371+avshalomman@users.noreply.github.com>
Date: Mon, 8 Jul 2024 18:00:38 +0300
Subject: [PATCH 284/376] [Kernel] reloading fused_moe config on the last chunk
 (#6210)

---
 .../layers/fused_moe/fused_moe.py             | 51 +++++++++++++------
 1 file changed, 36 insertions(+), 15 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 99a5c7d78..a29622b7d 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -332,6 +332,31 @@ def get_default_config(
     return config
 
 
+def try_get_optimal_moe_config(
+    w1_shape: Tuple[int, ...],
+    w2_shape: Tuple[int, ...],
+    top_k: int,
+    dtype: Optional[str],
+    M: int,
+    override_config: Optional[Dict[str, Any]] = None,
+):
+    if override_config:
+        config = override_config
+    else:
+        # First try to load optimal config from the file
+        E, _, N = w2_shape
+        configs = get_moe_configs(E, N, dtype)
+
+        if configs:
+            # If an optimal configuration map has been found, look up the
+            # optimal config
+            config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
+        else:
+            # Else use the default config
+            config = get_default_config(M, E, N, w1_shape[2], top_k, dtype)
+    return config
+
+
 def fused_topk(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
@@ -428,22 +453,16 @@ def fused_experts(hidden_states: torch.Tensor,
     CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE
     M = min(num_tokens, CHUNK_SIZE)
 
-    if override_config:
-        config = override_config
-    else:
-        # First try to load optimal config from the file
-        configs = get_moe_configs(E, w2.shape[2],
-                                  "float8" if use_fp8 else None)
+    get_config_func = functools.partial(
+        try_get_optimal_moe_config,
+        w1.shape,
+        w2.shape,
+        topk_ids.shape[1],
+        "float8" if use_fp8 else None,
+        override_config=override_config,
+    )
 
-        if configs:
-            # If an optimal configuration map has been found, look up the
-            # optimal config
-            config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
-        else:
-            # Else use the default config
-            config = get_default_config(M, E, N, w1.shape[2],
-                                        topk_ids.shape[1],
-                                        "float8" if use_fp8 else None)
+    config = get_config_func(M)
 
     intermediate_cache1 = torch.empty((M, topk_ids.shape[1], N),
                                       device=hidden_states.device,
@@ -478,6 +497,8 @@ def fused_experts(hidden_states: torch.Tensor,
             intermediate_cache1 = intermediate_cache1[:tokens_in_chunk]
             intermediate_cache2 = intermediate_cache2[:tokens_in_chunk]
             intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]
+            # reload config to get better performance on the last chunk
+            config = get_config_func(tokens_in_chunk)
 
         curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx]
         curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx]
-- 
GitLab


From 543aa4857362ff385af8a9f496392b8831c42833 Mon Sep 17 00:00:00 2001
From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com>
Date: Mon, 8 Jul 2024 13:12:15 -0400
Subject: [PATCH 285/376] [Kernel] Correctly invoke prefill & decode kernels
 for cross-attention (towards eventual encoder/decoder model support) (#4888)

Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 tests/kernels/test_attention_selector.py    |  14 +-
 tests/kernels/test_encoder_decoder_attn.py  | 953 ++++++++++++++++++++
 tests/kernels/utils.py                      | 920 +++++++++++++++++++
 vllm/attention/backends/abstract.py         |   8 +
 vllm/attention/backends/blocksparse_attn.py |   9 +-
 vllm/attention/backends/flash_attn.py       |   9 +-
 vllm/attention/backends/flashinfer.py       |   8 +-
 vllm/attention/backends/ipex_attn.py        |   8 +-
 vllm/attention/backends/pallas.py           |   8 +-
 vllm/attention/backends/rocm_flash_attn.py  |   9 +-
 vllm/attention/backends/torch_sdpa.py       |   8 +-
 vllm/attention/backends/utils.py            |   7 +
 vllm/attention/backends/xformers.py         | 472 ++++++++--
 vllm/attention/layer.py                     |  13 +-
 14 files changed, 2351 insertions(+), 95 deletions(-)
 create mode 100644 tests/kernels/test_encoder_decoder_attn.py
 create mode 100644 vllm/attention/backends/utils.py

diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py
index 8e6c50666..d9404e644 100644
--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -47,32 +47,32 @@ def test_flash_attn(monkeypatch):
     # Unsupported CUDA arch
     with patch("torch.cuda.get_device_capability", return_value=[7, 5]):
         backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16)
-        assert backend.name != "FLASH_ATTN"
+        assert backend.name != STR_FLASH_ATTN_VAL
 
     # Unsupported data type
     backend = which_attn_to_use(8, 16, 8, None, torch.float8_e4m3fn, None, 16)
-    assert backend.name != "FLASH_ATTN"
+    assert backend.name != STR_FLASH_ATTN_VAL
 
     # Unsupported kv cache data type
     backend = which_attn_to_use(8, 16, 8, None, torch.float16, "fp8", 16)
-    assert backend.name != "FLASH_ATTN"
+    assert backend.name != STR_FLASH_ATTN_VAL
 
     # Unsupported block size
     backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 8)
-    assert backend.name != "FLASH_ATTN"
+    assert backend.name != STR_FLASH_ATTN_VAL
 
     # Unsupported sliding window
     backend = which_attn_to_use(8, 16, 8, 1, torch.float16, None, 16)
-    assert backend.name != "FLASH_ATTN"
+    assert backend.name != STR_FLASH_ATTN_VAL
 
     # flash-attn is not installed
     with patch.dict('sys.modules', {'vllm_flash_attn': None}):
         backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16)
-        assert backend.name != "FLASH_ATTN"
+        assert backend.name != STR_FLASH_ATTN_VAL
 
     # Unsupported head size
     backend = which_attn_to_use(8, 17, 8, None, torch.float16, None, 16)
-    assert backend.name != "FLASH_ATTN"
+    assert backend.name != STR_FLASH_ATTN_VAL
 
 
 def test_invalid_env(monkeypatch):
diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py
new file mode 100644
index 000000000..f25e7d480
--- /dev/null
+++ b/tests/kernels/test_encoder_decoder_attn.py
@@ -0,0 +1,953 @@
+"""
+Tests:
+
+* E2E test of Encoder attention + Decoder self-attention +
+      Encoder/decoder cross-attention (collectively
+      "encoder/decoder attention")
+* Confirm enc/dec models will fail for chunked prefill
+* Confirm enc/dec models will fail for prefix caching
+
+"""
+
+from typing import NamedTuple, Optional
+
+import pytest
+import torch
+
+from tests.kernels.utils import *
+from tests.kernels.utils import make_causal_mask, maybe_make_long_tensor
+from vllm.attention import Attention, AttentionMetadata
+from vllm.attention.backends.abstract import AttentionBackend, AttentionType
+from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP
+from vllm.utils import is_hip
+
+HEAD_SIZES = [64, 256]
+
+NUM_HEADS = [1, 16]
+
+BATCH_SIZES = [1, 16]
+BLOCK_SIZES = [16]
+BACKEND_NAMES = [STR_XFORMERS_ATTN_VAL]
+CUDA_DEVICE = "cuda:0"
+
+MAX_DEC_SEQ_LENS = [128]
+MAX_ENC_SEQ_LENS = [128]
+
+# Narrow teest-cases for unsupported-scenario
+# tests
+HEAD_SIZES_FOR_UNSUPP = [HEAD_SIZES[0]]
+
+
+class TestPoint(NamedTuple):
+    """
+    Encapsulates the attributes which define a single invocation
+    of the test_e2e_enc_dec_attn() test
+
+    Attributes:
+        num_heads: The number of heads in the model.
+        head_size: Head dimension
+        backend_name: Name of the backend framework used.
+        batch_size: Number of samples per batch.
+        block_size: Size of each block of data processed.
+        max_dec_seq_len: Maximum sequence length for the decoder.
+        max_enc_seq_len: Maximum sequence length for the encoder.
+        num_blocks: Number of blocks in the model.
+    """
+
+    num_heads: int
+    head_size: int
+    backend_name: str
+    batch_size: int
+    block_size: int
+    max_dec_seq_len: int
+    max_enc_seq_len: int
+    num_blocks: int
+
+
+class TestResources(NamedTuple):
+    '''
+    Encapsulates key components for performing an
+    encoder/decoder attention test
+
+    Note that
+    (1) attn automatically selects an attention backend
+        based on platform info & a set of canned
+        heuristics
+    (2) attn_backend is thus *not the same backend
+        instance* used by attn, but rather it is
+        intended to be a
+        *different instance* of the *same backend class*;
+        it is assumed that the user of TestResources
+        will leverage attn_backend for the purpose of
+        constructing backend-compatible attention
+        metadata instances
+   
+    Attributes:
+
+    * scale: 1/sqrt(d) scale factor for attn
+    * attn_backend: implementatino of abstraction
+                    attention interface using
+                    a particular kernel library
+                    i.e. XFormers
+    * attn: Attention layer instance
+    * kv_cache: shared key/value cache for all attention
+    '''
+
+    scale: float
+    attn_backend: AttentionBackend
+    attn: Attention
+    kv_cache: torch.Tensor
+
+
+def _make_test_resources(test_pt: TestPoint, ) -> TestResources:
+    '''
+    Build key components for performing encoder/decoder attention test.
+
+    Note that
+    (1) The Attention instance constructed here, automatically selects 
+        an attention backend class based on platform info & a set of canned
+        heuristics, so
+    (2) The attention backend instance constructed here is thus *not 
+        the same backend instance* used by attn, but rather it is
+        intended to be a *different instance* of the *same backend class*;
+        therefore,
+    (3) This function requires that test_pt.backend_name matches the backend
+        class that Attention will automatically select when it is constructed.
+
+
+    Arguments:
+
+    * test_pt: TestPoint data structure; this function relies on the
+               following fields: num_heads, head_size, num_blocks,
+               block_size, backend_name
+
+    Returns:
+
+    * TestResources data structure.
+    '''
+
+    scale = float(1.0 / (test_pt.head_size**0.5))
+    attn_backend = make_backend(test_pt.backend_name)
+    attn = Attention(
+        test_pt.num_heads,
+        test_pt.head_size,
+        scale=scale,
+    )
+    if test_pt.num_blocks is None or test_pt.num_heads is None:
+        # Caller does not require a KV cache
+        return TestResources(scale, attn_backend, attn, None)
+
+    # Construct KV cache
+    kv_cache = make_kv_cache(test_pt.num_blocks,
+                             test_pt.num_heads,
+                             test_pt.head_size,
+                             test_pt.block_size,
+                             device=CUDA_DEVICE)
+    return TestResources(scale, attn_backend, attn, kv_cache)
+
+
+def _encoder_attn_setup(
+    test_pt: TestPoint,
+    test_rsrcs: TestResources,
+) -> PhaseTestParameters:
+    '''
+    Set up test vectors & data structures for encoder attention test.
+
+    A triplet of synthetic query/key/value tensors are constructed. 
+    Given this is an encoder attention test, the key & value
+    sequences will have the same length as the corresponding queries.
+
+    The query/key/value tensors are passed to an ideal reference
+    self-attention implementation to generate an ideal output tensor.
+
+    Encoder inference does not populate the KV cache, therefore
+    no KV cache memory mapping is constructed
+
+    Arguments:
+
+    * test_pt: TestPoint data structure; this function relies on the
+               following fields: batch_size, num_heads, head_size, 
+               block_size, max_q_seq_len
+    * test_rsrcs: TestResources data structure; this function relies on the
+                  scale field
+
+    
+    Returns:
+    
+    * PhaseTestParameters data structure comprising (1) packed query/key/value
+      tensors, (2) the ideal output of attention computed using a naive
+      implementation, and (3) KVCache field set to None
+    '''
+
+    (
+        num_heads,
+        head_size,
+        _,
+        batch_size,
+        _,
+        _,
+        max_q_seq_len,
+        _,
+    ) = test_pt
+
+    scale = test_rsrcs.scale
+
+    max_kv_seq_len = max_q_seq_len
+
+    # Make test tensors
+
+    qkv_in, _, _ = make_qkv(batch_size,
+                            max_q_seq_len,
+                            max_kv_seq_len,
+                            num_heads,
+                            head_size,
+                            attn_type=AttentionType.ENCODER,
+                            device=CUDA_DEVICE)
+
+    # Compute correct answer using naive non-causal attention
+    # implementation
+
+    ideal_output = ref_masked_attention(qkv_in.query,
+                                        qkv_in.key,
+                                        qkv_in.value,
+                                        scale=scale,
+                                        q_seq_lens=qkv_in.q_seq_lens,
+                                        kv_seq_lens=qkv_in.kv_seq_lens)
+
+    packed_ideal_output, _ = pack_tensor(ideal_output,
+                                         qkv_in.q_seq_lens,
+                                         device=CUDA_DEVICE)
+
+    packed_qkv = pack_qkv(qkv_in, device=CUDA_DEVICE)
+
+    return PhaseTestParameters(
+        PackedQKVO(packed_qkv, packed_ideal_output),
+        None  # No KV cache
+    )
+
+
+def _decoder_attn_setup(
+    test_pt: TestPoint,
+    test_rsrcs: TestResources,
+    block_base_addr: int = 0,
+) -> Tuple[QKVInputs, PhaseTestParameters, PhaseTestParameters, int]:
+    '''
+    Set up test vectors & data structures for self-attention test.
+
+    A triplet of synthetic query/key/value tensors are constructed ("baseline"
+    query/key/value). Given this is a self-attention test, the key & value
+    sequences will have the same length as the corresponding queries.
+
+    "Prefill" query/key/value tensors are derived by masking out the last value
+    in each baseline query/key/value. These tensors are used to test prefill &
+    populate KV cache for a subsequent decode test.
+
+    "Decode" query/key/value tensors are derived by extracting *only* the last
+    value from each baseline query/key/value (i.e. complement of the prefill
+    tensors.) These tensors are used to test decode, conditional on the kv cache
+    being populated during the prefill test.
+
+    The baseline query/key/value tensors are passed to an ideal reference
+    self-attention implementation to generate a "Baseline" ideal output tensor.
+    This tensor is split into the "Prefill" ideal output tensor (all but the
+    last element of each output sequence) and the "Decode" ideal output tensor
+    (*only* the last element of each output sequence); the "Prefill" and
+    "Decode" ideal output tensors can be used to validate the prefill and decode
+    test results, respectively.
+
+    This function also constructs the self-attention KV cache memory mapping
+    (slot mapping and block table), ensuring that the block table starts at
+    block_base_addr
+
+    Arguments:
+
+    * test_pt: TestPoint data structure; this function relies on the
+               following fields: batch_size, num_heads, head_size, 
+               block_size, max_q_seq_len
+    * test_rsrcs: TestResources data structure; this function relies on the
+                  scale field
+    * block_base_addr: decoder self-attention block-table base address
+
+    Returns:
+    * qkv: Unpacked (batch_size x padded_seq_len x num_heads x
+           head_size) query/key/value tensors
+    * Prefill-phase decoder self-attention PhaseTestParameters data structure,
+      including (1) packed (number_of_tokens x num_heads x head_size) 
+      query/key/value tensors along with (2) ideal attention output
+      computed using a naive implementation, and (3) memory-mapping data 
+      structures appropriate for prefill phase.
+    * Decode-phase decoder self-attention PhaseTestParameters data structure, 
+      including (1) packed (number_of_tokens x num_heads x head_size) 
+      query/key/value tensors along with (2) ideal attention output 
+      computed using a naive implementation, and (3) memory-mapping data 
+      structures appropriate for decode phase.
+    * max_block_idx: max physical address in decoder self-attention block-table
+                     (intended to be used as the base address for the encoder/
+                      decoder cross-attention block-table, which is not
+                      constructed in this function)
+    '''
+
+    (
+        num_heads,
+        head_size,
+        _,
+        batch_size,
+        block_size,
+        max_q_seq_len,
+        _,
+        _,
+    ) = test_pt
+
+    scale = test_rsrcs.scale
+
+    max_kv_seq_len = max_q_seq_len
+
+    # Build test tensors
+
+    (
+        qkv,
+        prefill_qkv,
+        decode_qkv,
+    ) = make_qkv(batch_size,
+                 max_q_seq_len,
+                 max_kv_seq_len,
+                 num_heads,
+                 head_size,
+                 attn_type=AttentionType.DECODER,
+                 device=CUDA_DEVICE)
+
+    # Compute correct answer using naive attention implementation
+    # with causal attention mask
+
+    causal_mask = make_causal_mask(max_q_seq_len,
+                                   max_kv_seq_len).to(CUDA_DEVICE)
+
+    ideal_output = ref_masked_attention(qkv.query,
+                                        qkv.key,
+                                        qkv.value,
+                                        scale=scale,
+                                        custom_mask=causal_mask,
+                                        q_seq_lens=qkv.q_seq_lens,
+                                        kv_seq_lens=qkv.kv_seq_lens)
+
+    # Split out the prefill- & decode-phase ideal answers & pack them
+
+    prefill_ideal_output = torch.zeros_like(ideal_output)
+    decode_ideal_output = torch.zeros_like(ideal_output[:, 0:1])
+    for bdx, prefill_q_seq_len in enumerate(prefill_qkv.q_seq_lens):
+        prefill_ideal_output[bdx, :prefill_q_seq_len] = ideal_output[
+            bdx, :prefill_q_seq_len]
+        decode_ideal_output[bdx, :] = ideal_output[bdx, prefill_q_seq_len:(
+            prefill_q_seq_len + 1)]
+
+    prefill_packed_ideal_output, _ = pack_tensor(prefill_ideal_output,
+                                                 prefill_qkv.q_seq_lens,
+                                                 device=CUDA_DEVICE)
+    decode_packed_ideal_output, _ = pack_tensor(decode_ideal_output,
+                                                [1 for _ in range(batch_size)],
+                                                device=CUDA_DEVICE)
+
+    # Build prefill- & decode-phase data structures
+    # for decoder self-attention. Block tables and
+    # slot mapping must be in a format compatible
+    # with KV caching & attention kernels
+    #
+    # Prefill-phase:
+    #
+    # * Empty block-tables tensor
+    # * Slot-mapping with entries for prompt tokens
+    #
+    # Decode-phase:
+    # * Block-tables tensor with minimum number of blocks
+    #   required by total num. tokens in the entirety of all sequences
+    #   (including both prefill & decode)
+    # * Slot-mapping with entries for tokens that will be decoded in the
+    #   current decode iteration
+    #
+    #  Note: the format described above is simply mirroring what ModelRunner
+    #        produces
+
+    prefill_block_tables = make_empty_block_tables_tensor(device=CUDA_DEVICE)
+
+    (
+        decode_block_tables,
+        slot_mapping_list,
+        max_block_idx,
+    ) = make_block_tables_slot_mapping(block_size,
+                                       qkv.q_seq_lens,
+                                       device=CUDA_DEVICE,
+                                       block_base_addr=block_base_addr)
+
+    (
+        prefill_slot_mapping,
+        decode_slot_mapping,
+    ) = split_slot_mapping(slot_mapping_list,
+                           qkv.q_seq_lens,
+                           device=CUDA_DEVICE)
+
+    prefill_pckd_qkv = pack_qkv(prefill_qkv, device=CUDA_DEVICE)
+
+    decode_pckd_qkv = pack_qkv(decode_qkv, device=CUDA_DEVICE)
+
+    return (
+        qkv,
+        PhaseTestParameters(  # Prefill test params
+            PackedQKVO(prefill_pckd_qkv, prefill_packed_ideal_output),
+            KVMemoryMap(prefill_block_tables, prefill_slot_mapping)),
+        PhaseTestParameters(  # Decode test params
+            PackedQKVO(decode_pckd_qkv, decode_packed_ideal_output),
+            KVMemoryMap(decode_block_tables, decode_slot_mapping)),
+        max_block_idx)
+
+
+def _enc_dec_cross_attn_setup_reuses_query(
+    decoder_qkv: QKVInputs,
+    encoder_test_params: PhaseTestParameters,
+    prefill_decoder_phase_test_params: PhaseTestParameters,
+    test_pt: TestPoint,
+    test_rsrcs: TestResources,
+    block_base_addr: int = 0,
+) -> Tuple[PhaseTestParameters, PhaseTestParameters]:
+    '''
+    Set up test vectors & data structures for cross-attention test.
+
+    A triplet of synthetic cross-attention key/value tensors are constructed
+    ("baseline" key/value). Given this is a cross-attention test, we assume
+    query tensors were already synthesized for a prior self-attention test and
+    will be reused for cross-attention. The key & value sequences generated here
+    may have a different length than the corresponding queries (as is often
+    the case for cross-attention between decoder and encoder sequences.)
+
+    Cross attention key & value tensors do not grow during autoregressive
+    inference; thus this function obtains a single key/value pair suitable for
+    both prefill and decode.
+
+    The "baseline" query tensor is received as an argument. The "baseline"
+    query/key/value tensors are passed to an ideal reference cross-attention
+    implementation to generate a "baseline" ideal output tensor. This tensor is
+    split into the "Prefill" ideal output tensor (all but the last element of
+    each output sequence) and the "Decode" ideal output tensor (*only* the last
+    element of each output sequence); the "Prefill" and "Decode" ideal output
+    tensors can be used to validate the prefill and decode test results,
+    respectively.
+
+    This function also constructs the cross-attention KV cache memory mapping
+    (slot mapping and block table), ensuring that the block table starts at
+    block_base_addr. 
+
+    Arguments:
+
+    * decoder_qkv: pre-existing unpacked (batch_size x padded_seq_len x
+                   num_heads x head_size) decoder self-attention inputs; 
+                   this function relies on the query and q_seq_lens
+                   fields
+    * encoder_test_params: PhaseTestParameters data structure which was
+                           used for encoder inference; KV cache field
+                           is not used by this function
+    * prefill_decoder_phase_test_params: PhaseTestParameters data structure
+                                         used for prefill-phase decoder
+                                         self-attention; all fields
+                                         including KV cache required
+    * test_pt: TestPoint data structure; this function relies on the
+               following fields: batch_size, num_heads, head_size, 
+               block_size, max_q_seq_len
+    * test_rsrcs: TestResources data structure; this function relies on the
+                  scale field
+    * block_base_addr: decoder self-attention block-table base address
+
+    Returns:
+
+    * Prefill-phase encoder/decoder cross-attention PhaseTestParameters data 
+      structure, including (1) packed 
+      (number_of_tokens x num_heads x head_size) query/key/value tensors
+      along with (2) ideal attention output computed using a 
+      naive implementation, and (3) memory-mapping data structures appropriate
+      for prefill phase.
+    * Decode-phase encoder/decoder cross-attention PhaseTestParameters data 
+      structure, including (1) packed
+      (number_of_tokens x num_heads x head_size) query/key/value tensors
+      along with (2) ideal attention output computed using a 
+      naive implementation, and (3) memory-mapping data structures appropriate
+      for decode phase.
+    '''
+
+    assert encoder_test_params.packed_qkvo.packed_qkv is not None
+    assert prefill_decoder_phase_test_params.packed_qkvo.packed_qkv is not None
+
+    (
+        num_heads,
+        head_size,
+        _,
+        batch_size,
+        block_size,
+        max_decoder_seq_len,
+        max_encoder_seq_len,
+        _,
+    ) = test_pt
+
+    scale = test_rsrcs.scale
+
+    decoder_query = decoder_qkv.query
+    decoder_seq_lens = decoder_qkv.q_seq_lens
+    encoder_seq_lens = encoder_test_params.packed_qkvo.packed_qkv.q_seq_lens
+    prefill_q_seq_lens = (
+        prefill_decoder_phase_test_params.packed_qkvo.packed_qkv.q_seq_lens)
+
+    assert prefill_q_seq_lens is not None
+
+    (
+        cross_kv,
+        _,
+        _,
+    ) = make_qkv(batch_size,
+                 max_decoder_seq_len,
+                 max_encoder_seq_len,
+                 num_heads,
+                 head_size,
+                 force_kv_seq_lens=encoder_seq_lens,
+                 attn_type=AttentionType.ENCODER_DECODER,
+                 device=CUDA_DEVICE)
+
+    ideal_output = ref_masked_attention(decoder_query,
+                                        cross_kv.key,
+                                        cross_kv.value,
+                                        scale=scale,
+                                        q_seq_lens=decoder_seq_lens,
+                                        kv_seq_lens=cross_kv.kv_seq_lens)
+
+    prefill_ideal_output = torch.zeros_like(ideal_output)
+    decode_ideal_output = torch.zeros_like(ideal_output[:, 0:1])
+    for bdx, prefill_q_seq_len in enumerate(prefill_q_seq_lens):
+        prefill_ideal_output[bdx, :prefill_q_seq_len] = ideal_output[
+            bdx, :prefill_q_seq_len]
+        decode_ideal_output[bdx, :] = ideal_output[bdx, prefill_q_seq_len:(
+            prefill_q_seq_len + 1)]
+
+    prefill_packed_ideal_output, _ = pack_tensor(prefill_ideal_output,
+                                                 prefill_q_seq_lens,
+                                                 device=CUDA_DEVICE)
+    decode_packed_ideal_output, _ = pack_tensor(decode_ideal_output,
+                                                [1 for _ in range(batch_size)],
+                                                device=CUDA_DEVICE)
+
+    # Build prefill- & decode-phase data structures
+    # for encoder/decoder cross-attention. Block tables and
+    # slot mapping must be in a format compatible
+    # with KV caching & attention kernels
+    #
+    # Whereas decoder self-attention extracts relationships between
+    # equal-length Q/K/V sequences, which mutually grow in length
+    # with each decoded token, cross-attention relates the Q sequence
+    # - which grows with each new decoded token - to fixed-length
+    # K and V sequences derived from the encoder hidden states.
+    #
+    # Prefill-phase:
+    #
+    # * Empty block-tables tensor
+    # * Slot-mapping with as many entries as there are tokens in the encoder
+    #   prompt.
+    #
+    # Decode-phase:
+    # * Block-tables tensor with minimum number of blocks to
+    #   accommodate K & V tensors which are equal in lnegth
+    #   to the encoder prompt length
+    # * Empty slot-mapping tensor (since K & V are fixed in size,
+    #   new decoded tokens are not KV-cached and require no slot-
+    #   mapping)
+    #
+    # Note: the format above is simply an extension of what ModelRunner
+    #       produces for decoder-only models
+
+    prefill_block_tables = make_empty_block_tables_tensor(device=CUDA_DEVICE)
+    decode_slot_mapping = make_empty_slot_mapping_tensor(device=CUDA_DEVICE)
+
+    (
+        decode_block_tables,
+        prefill_slot_mapping_list,
+        _,
+    ) = make_block_tables_slot_mapping(block_size,
+                                       cross_kv.kv_seq_lens,
+                                       block_base_addr=block_base_addr,
+                                       device=CUDA_DEVICE)
+
+    prefill_slot_mapping = maybe_make_long_tensor(prefill_slot_mapping_list,
+                                                  device=CUDA_DEVICE)
+
+    # Packed key/value (query is already provided)
+    packed_cross_kv = pack_qkv(cross_kv, device=CUDA_DEVICE)
+
+    return (
+        PhaseTestParameters(  # Prefill-phase test params
+            PackedQKVO(packed_cross_kv, prefill_packed_ideal_output),
+            KVMemoryMap(prefill_block_tables, prefill_slot_mapping)),
+        PhaseTestParameters(  # Decode-phase test params
+            PackedQKVO(None, decode_packed_ideal_output),
+            KVMemoryMap(decode_block_tables, decode_slot_mapping)))
+
+
+def _run_encoder_attention_test(
+    attn: Attention,
+    encoder_test_params: PhaseTestParameters,
+    attn_metadata: AttentionMetadata,
+) -> torch.Tensor:
+    '''
+    Run encoder attention.
+
+    attn.forward() is passed attn_type=AttentionType.ENCODER in order 
+    to configure the kernel invocation for encoder attention
+
+    Requires attn_metadata.num_decode_tokens == 0
+    (There is no encoder execution in the decode-phase)
+
+    Arguments:
+
+    * attn: Attention wrapper instance
+    * encoder_test_params: encoder PhaseTestParameters data structure;
+                           this function relies on the packed
+                           (number_of_tokens x num_heads x head_size) 
+                           query/key/value fields
+    * attn_metadata: attention metadata for encoder/decoder-self attention
+
+    Returns:
+    * Attention.forward() applied to packed {query,key,value} and
+      & attn_metadata
+    '''
+    assert attn_metadata.num_decode_tokens == 0
+    attn_type = AttentionType.ENCODER
+    packed_qkv = encoder_test_params.packed_qkvo.packed_qkv
+    assert packed_qkv is not None
+    return attn.forward(packed_qkv.query,
+                        packed_qkv.key,
+                        packed_qkv.value,
+                        None,
+                        attn_metadata,
+                        attn_type=attn_type)
+
+
+def _run_decoder_self_attention_test(
+    test_rsrcs: TestResources,
+    decoder_test_params: PhaseTestParameters,
+    attn_metadata: AttentionMetadata,
+) -> torch.Tensor:
+    '''
+    Run decoder self-attention test.
+
+    attn.forward() is passed attn_type=AttentionType.DECODER
+    in order to configure the kernel invocation for decoder self-attention.
+
+    Arguments:
+
+    * test_rsrcs: TestResources instance; this function relies on the kv_cache
+                  and attn (Attention wrapper instance) fields
+    * decoder_test_params: decoder PhaseTestParameters data structure;
+                           this function relies on the packed
+                           (number_of_tokens x num_heads x head_size) 
+                           query/key/value fields
+    * attn_metadata: attention metadata for decoder-self attention
+                     (contains KV cache memory-mapping)
+
+    Returns:
+    * Attention.forward() applied to packed_{query,key,value}, kv_cache
+      & attn_metadata
+    '''
+    attn_type = AttentionType.DECODER
+    attn = test_rsrcs.attn
+    kv_cache = test_rsrcs.kv_cache
+    packed_qkv = decoder_test_params.packed_qkvo.packed_qkv
+    assert packed_qkv is not None
+    return attn.forward(packed_qkv.query,
+                        packed_qkv.key,
+                        packed_qkv.value,
+                        kv_cache,
+                        attn_metadata,
+                        attn_type=attn_type)
+
+
+def _run_encoder_decoder_cross_attention_test(
+    test_rsrcs: TestResources,
+    decoder_test_params: PhaseTestParameters,
+    cross_test_params: Optional[PhaseTestParameters],
+    attn_metadata: AttentionMetadata,
+) -> torch.Tensor:
+    '''
+    Run encoder/decoder cross-attention test.
+
+    Via PhaseTestParameters data structures, consumes the same query utilized
+    for decoder self-attention, plus a key/value specific to cross-attention.
+
+    if cross_test_params is None or cross_test_params.packed_qkvo.packed_qkv
+    is None, this reflects that in decode-phase cross attention there
+    is no growth in the key and value tensors.
+
+    attn.forward() is passed attn_type=AttentionType.ENCODER_DECODER
+    in order to configure the kernel invocation for encoder/decoder cross-
+    attention.
+
+    Arguments:
+
+    * test_rsrcs: TestResources instance; this function relies on the kv_cache
+                  and attn (Attention wrapper instance) fields
+    * decoder_test_params: decoder PhaseTestParameters data structure;
+                           this function relies on the packed
+                           (number_of_tokens x num_heads x head_size) 
+                           query field
+    * cross_test_params: encoder/decoder PhaseTestParameters data structure;
+                         this function relies on the packed
+                         (number_of_tokens x num_heads x head_size) 
+                         key/value fields
+    * attn_metadata: attention metadata for encoder/decoder-self attention
+
+    Returns:
+    * Attention.forward() applied to packed_{query,key,value}, kv_cache
+      & attn_metadata
+    '''
+    assert decoder_test_params.packed_qkvo.packed_qkv is not None
+
+    attn_type = AttentionType.ENCODER_DECODER
+    attn = test_rsrcs.attn
+    kv_cache = test_rsrcs.kv_cache
+    if cross_test_params is None:
+        key = None
+        value = None
+    else:
+        cross_pckd_qkv = cross_test_params.packed_qkvo.packed_qkv
+        key = (None if cross_pckd_qkv is None else cross_pckd_qkv.key)
+        value = (None if cross_pckd_qkv is None else cross_pckd_qkv.value)
+    return attn.forward(decoder_test_params.packed_qkvo.packed_qkv.query,
+                        key,
+                        value,
+                        kv_cache,
+                        attn_metadata,
+                        attn_type=attn_type)
+
+
+@pytest.mark.skipif(is_hip(), reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("backend_name", BACKEND_NAMES)
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("max_dec_seq_len", MAX_DEC_SEQ_LENS)
+@pytest.mark.parametrize("max_enc_seq_len", MAX_ENC_SEQ_LENS)
+def test_encoder_only(num_heads: int, head_size: int, backend_name: str,
+                      batch_size: int, block_size: int, max_dec_seq_len: int,
+                      max_enc_seq_len: int, monkeypatch):
+
+    # Force Attention wrapper backend
+    override_backend_env_variable(monkeypatch, backend_name)
+
+    # Note: KV cache size of 4096 is arbitrary & chosen intentionally
+    # to be more than necessary, since exceeding the kv cache size
+    # is not part of this test
+    test_pt = TestPoint(num_heads, head_size, backend_name, batch_size,
+                        block_size, max_dec_seq_len, max_enc_seq_len, 4096)
+
+    # Attention scale factor, attention backend instance, attention wrapper
+    # instance, KV cache init
+    test_rsrcs = _make_test_resources(test_pt)
+
+    # Construct encoder attention test params (only used
+    # during prefill)
+
+    enc_test_params = _encoder_attn_setup(test_pt, test_rsrcs)
+
+    # Shared prefill metadata structure
+
+    prephase_attn_metadata: AttentionMetadata = make_test_metadata(
+        test_rsrcs.attn_backend,
+        True,
+        None,
+        decoder_test_params=None,
+        encoder_test_params=enc_test_params,
+        cross_test_params=None,
+        device=CUDA_DEVICE)
+
+    # PREFILL: encoder attention
+
+    enc_pckd_act_out: torch.Tensor = (_run_encoder_attention_test(
+        test_rsrcs.attn, enc_test_params, prephase_attn_metadata))
+
+    # - Is encoder attention result correct?
+    assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out)
+
+
+@pytest.mark.skipif(is_hip(), reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("backend_name", BACKEND_NAMES)
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("max_dec_seq_len", MAX_DEC_SEQ_LENS)
+@pytest.mark.parametrize("max_enc_seq_len", MAX_ENC_SEQ_LENS)
+def test_e2e_enc_dec_attn(
+    num_heads: int,
+    head_size: int,
+    backend_name: str,
+    batch_size: int,
+    block_size: int,
+    max_dec_seq_len: int,
+    max_enc_seq_len: int,
+    monkeypatch,
+) -> None:
+    '''
+    End-to-end encoder/decoder test:
+
+    * Construct fake test vectors for (1) encoder attention,
+      (2) decoder self-attention, and (3) encoder/decoder cross-attention
+    * Construct (1) attention metadata structure with self- and cross-attention
+      attributes for prefill-phase, and (2) an analogous attention metadata
+      structure but for decode-phase
+    * Test attention steps in the following order
+    
+        * Encoder attention
+        * Prefill self-attention
+        * Prefill cross-attention
+        * Decode self-attention
+        * Decode cross-attention
+        * Besides being reflective of realistic use-cases, this order would 
+          exacerbate any accidental overlap in the self-/cross-attention 
+          block tables, which one hopes to avoid
+
+
+    * Validate output correctness against ideal reference attention
+      implementation
+
+    Block tables are constructed such that cross-attention KV cache is in a
+    higher, non-intersecting address-space than self-attention KV cache.
+
+    Self- and cross-attention share the same query tensor but not the K/V
+    tensors. Self-attention K/Vs must have the same seq len as Q while
+    cross-attention K/Vs are allowed to differ in seq len, as is often the case
+    for cross-attention.
+
+    This test utilizes PyTest monkey patching to force the attention backend
+    via an environment variable.
+
+    Note on ROCm/HIP: currently encoder/decoder models are not supported on
+    AMD GPUs, therefore this test simply is skipped if is_hip(). 
+
+    Note on metadata: there is a single attention metadata structure shared by
+    all prefill-phase attention operations (encoder, decoder, enc/dec cross), 
+    and a single one shared by all decode-phase attention operations
+    (decoder & enc/dec cross.) This is intended to reflect the behavior
+    of ModelRunner, which constructs a single attention metadata structure for
+    each prefill or decode run. A realistic scenario would rely on the
+    attention backend to utilize the appropriate attention metadata fields
+    according to the value of attn_metadata.attention_type. Thus, this test is
+    organized so as to confirm that the backend-under-test can handle a
+    shared prefill attention metadata structure & a shared decode attention
+    metadata structure.
+    '''
+
+    # Force Attention wrapper backend
+    override_backend_env_variable(monkeypatch, backend_name)
+
+    # Note: KV cache size of 4096 is arbitrary & chosen intentionally
+    # to be more than necessary, since exceeding the kv cache size
+    # is not part of this test
+    test_pt = TestPoint(num_heads, head_size, backend_name, batch_size,
+                        block_size, max_dec_seq_len, max_enc_seq_len, 4096)
+
+    # Attention scale factor, attention backend instance, attention wrapper
+    # instance, KV cache init
+    test_rsrcs = _make_test_resources(test_pt)
+
+    # Construct encoder attention test params (only used
+    # during prefill)
+
+    enc_test_params = _encoder_attn_setup(test_pt, test_rsrcs)
+
+    # Construct Decoder self-attention prefill-phase & decode-phase
+    # test params, including query/key/value tensors, decoder self-attention
+    # memory-mapping. cross_block_base_addr is the uppermost address in the
+    # decoder self-attention block-table, i.e. a base address which the
+    # encoder/decoder cross-attention block-table may build downward toward.
+
+    (
+        dec_qkv,
+        prephase_dec_test_params,
+        decphase_dec_test_params,
+        cross_block_base_addr,
+    ) = _decoder_attn_setup(test_pt, test_rsrcs)
+
+    # Construct encoder/decoder cross-attention prefill-phase & decode-phase
+    # test params, including key/value tensors, cross-attention memory-mapping
+
+    (
+        prephase_cross_test_params,
+        decphase_cross_test_params,
+    ) = _enc_dec_cross_attn_setup_reuses_query(
+        dec_qkv,
+        enc_test_params,
+        prephase_dec_test_params,
+        test_pt,
+        test_rsrcs,
+        block_base_addr=cross_block_base_addr)
+
+    # Shared prefill metadata structure
+    assert prephase_dec_test_params.packed_qkvo.packed_qkv is not None
+    prephase_attn_metadata: AttentionMetadata = make_test_metadata(
+        test_rsrcs.attn_backend,
+        True,
+        prephase_dec_test_params.packed_qkvo.packed_qkv.q_seq_lens,
+        decoder_test_params=prephase_dec_test_params,
+        encoder_test_params=enc_test_params,
+        cross_test_params=prephase_cross_test_params,
+        device=CUDA_DEVICE)
+
+    # PREFILL: encoder attention
+
+    enc_pckd_act_out = _run_encoder_attention_test(test_rsrcs.attn,
+                                                   enc_test_params,
+                                                   prephase_attn_metadata)
+
+    # - Is encoder attention result correct?
+    assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out)
+
+    # PREFILL: decoder self-attention test
+
+    prephase_dec_pckd_act_out = _run_decoder_self_attention_test(
+        test_rsrcs, prephase_dec_test_params, prephase_attn_metadata)
+
+    # - Is prefill decoder self-attention correct?
+    assert_actual_matches_ideal(prephase_dec_test_params,
+                                prephase_dec_pckd_act_out)
+
+    # PREFILL: encoder/decoder cross-attention test
+
+    prephase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test(
+        test_rsrcs, prephase_dec_test_params, prephase_cross_test_params,
+        prephase_attn_metadata)
+
+    # - Is prefill encoder/decoder cross-attention correct?
+    assert_actual_matches_ideal(prephase_cross_test_params,
+                                prephase_cross_pckd_act_out)
+
+    # DECODE: build decode-phase attention metadata
+
+    decphase_attn_metadata: AttentionMetadata = make_test_metadata(
+        test_rsrcs.attn_backend,
+        False,
+        dec_qkv.q_seq_lens,
+        decoder_test_params=decphase_dec_test_params,
+        encoder_test_params=enc_test_params,
+        cross_test_params=decphase_cross_test_params,
+        device=CUDA_DEVICE)
+
+    # DECODE: decoder self-attention test
+
+    decphase_dec_pckd_act_out = _run_decoder_self_attention_test(
+        test_rsrcs, decphase_dec_test_params, decphase_attn_metadata)
+
+    # - Is decode-phase decoder self-attention correct?
+    assert_actual_matches_ideal(decphase_dec_test_params,
+                                decphase_dec_pckd_act_out)
+
+    # DECODE: encoder/decoder cross-attention test
+
+    decphase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test(
+        test_rsrcs, decphase_dec_test_params, None, decphase_attn_metadata)
+
+    # - Is decode-phase encoder/decoder cross-attention correct?
+    assert_actual_matches_ideal(decphase_cross_test_params,
+                                decphase_cross_pckd_act_out)
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index b401eb87d..23d627820 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -1,12 +1,211 @@
 """Kernel test utils"""
 
+import itertools
+import random
+from numbers import Number
+from typing import Any, List, NamedTuple, Optional, Tuple, Union
+
 import pytest
+import torch
+
+from vllm.attention.backends.abstract import (AttentionBackend,
+                                              AttentionMetadata, AttentionType)
+from vllm.attention.backends.xformers import XFormersBackend
+from vllm.utils import make_tensor_with_pad
 
+# String name of register which may be set in order to
+# force auto-selection of attention backend by Attention
+# wrapper
 STR_BACKEND_ENV_VAR: str = "VLLM_ATTENTION_BACKEND"
+
+# Possible string values of STR_BACKEND_ENV_VAR
+# register, corresponding to possible backends
+STR_FLASHINFER_ATTN_VAL: str = "FLASHINFER"
+STR_TORCH_SDPA_ATTN_VAL: str = "TORCH_SDPA"
+STR_ROCM_FLASH_ATTN_VAL: str = "ROCM_FLASH"
+STR_XFORMERS_ATTN_VAL: str = "XFORMERS"
 STR_FLASH_ATTN_VAL: str = "FLASH_ATTN"
 STR_INVALID_VAL: str = "INVALID"
 
 
+class QKVInputs(NamedTuple):
+    '''
+    Data structure for representing unpacked attention inputs, 
+    query/key/values and their sequence lengths.
+
+    Attributes:
+
+        * {query,key,value}: unpacked (batch_size x padded_seq_len x 
+                             num_heads x head_size) attention inputs
+        * q_seq_lens: query sequence lengths list
+        * kv_seq_lens: shared key/value sequence lengths list
+    '''
+
+    query: torch.Tensor
+    key: torch.Tensor
+    value: torch.Tensor
+    q_seq_lens: List[int]
+    kv_seq_lens: List[int]
+
+
+class QKVO(NamedTuple):
+    '''
+    Data structure for representing unpacked attention inputs, 
+    alongside unpacked known-correct attention output
+
+    Attributes:
+
+        * qkv: unpacked (batch_size x padded_seq_len x 
+                             num_heads x head_size) attention inputs
+        * ideal_output: unpacked (batch_size x padded_seq_len x 
+                        num_heads x head_size) known-correct attention output
+    '''
+
+    qkv: QKVInputs
+    ideal_output: torch.Tensor
+
+
+class PackedQKVInputs(NamedTuple):
+    '''
+    Data structure for representing packed attention inputs
+
+    Attributes:
+
+        * {query,key,value}: packed (number_of_tokens x num_heads 
+                             x head_size) attention inputs
+        * q_start_loc_list: list of query start locations within packed tensor
+        * kv_start_loc_list: shared list of key/value start locations within
+                             packed tensor
+        * q_seq_lens: query sequence lengths list
+        * kv_seq_lens: shared key/value sequence lengths list
+    '''
+
+    query: torch.Tensor
+    key: torch.Tensor
+    value: torch.Tensor
+    q_start_loc_list: Optional[List[int]]
+    kv_start_loc_list: Optional[List[int]]
+    q_seq_lens: Optional[List[int]]
+    kv_seq_lens: Optional[List[int]]
+
+
+class PackedQKVO(NamedTuple):
+    '''
+    Data structure for representing packed attention inputs, 
+    alongside packed known-correct attention output
+
+    Attributes:
+
+        * packed_qkv: packed (number_of_tokens x num_heads 
+                      x head_size) attention inputs
+        * ideal_output: packed (number_of_tokens x num_heads 
+                        x head_size) known-correct attention output
+    '''
+
+    packed_qkv: Optional[PackedQKVInputs]
+    ideal_output: torch.Tensor
+
+
+class KVMemoryMap(NamedTuple):
+    '''
+    Data structure for encapsulating KV cache memory mapping.
+
+    Attributes:
+
+        * block_tables: KV cache block tables
+        * slot_mapping: mapping of sequence offset to physical address
+    '''
+
+    block_tables: torch.Tensor
+    slot_mapping: torch.Tensor
+
+
+class PhaseTestParameters(NamedTuple):
+    '''
+    Data structure for encapsulating the test parameters
+    for a given test "phase" (prefill or decode phase) and attention
+    scenario (encoder, decoder-self, encoder/decoder-cross)
+
+    Attributes:
+
+        * packed_qkvo: packed (number_of_tokens x num_heads 
+                       x head_size) attention inputs & known-correct
+                       output
+        * kv_mmap: KV cache memory mapping, specific to this test phase &
+                   attention scenario
+    '''
+
+    packed_qkvo: PackedQKVO
+    kv_mmap: Optional[KVMemoryMap]
+
+
+def maybe_make_int_tensor(
+    _list: Optional[List[int]],
+    device: Union[torch.device, str],
+) -> torch.Tensor:
+    '''
+    Convert Python int list to a 1D int torch.Tensor on `device`
+
+    Returns:
+
+    * If _list is not None: 1D int torch.Tensor on `device`
+    * None otherwise
+    '''
+    return None if _list is None else torch.tensor(
+        _list, dtype=torch.int, device=device)
+
+
+def maybe_make_long_tensor(
+    _list: Optional[List[int]],
+    device: Union[torch.device, str],
+) -> torch.Tensor:
+    '''
+    Convert Python int list to a 1D long torch.Tensor on `device`
+
+    Returns:
+
+    * If _list is not None: 1D long torch.Tensor on `device`
+    * None otherwise
+    '''
+    return None if _list is None else torch.tensor(
+        _list, dtype=torch.long, device=device)
+
+
+def maybe_max(_list: Optional[List]) -> Optional[Number]:
+    '''
+    Returns:
+
+    * If _list is not None: max(_list)
+    * None otherwise
+    '''
+    return None if _list is None else max(_list)
+
+
+def make_causal_mask(
+    q_max_seq_len: int,
+    kv_max_seq_len: int,
+) -> torch.Tensor:
+    '''
+    Create a q_max_seq_len x kv_max_seq_len causal mask
+
+    Arguments:
+    
+    * q_max_seq_len: query max seq len
+    * kv_max_seq_len: key/value max seq len
+
+    Returns:
+
+    * 2D tensor, q_max_seq_len x kv_max_seq_len
+    '''
+
+    # Create a matrix where entry (i, j) is True if i >= j
+    mask = torch.triu(torch.ones(q_max_seq_len, kv_max_seq_len), diagonal=1)
+    # Replace True with float('-inf') and False with 0
+    mask = mask.masked_fill(mask == 1,
+                            float('-inf')).masked_fill(mask == 0, 0.0)
+    return mask
+
+
 def override_backend_env_variable(mpatch: pytest.MonkeyPatch,
                                   backend_name: str) -> None:
     '''
@@ -20,3 +219,724 @@ def override_backend_env_variable(mpatch: pytest.MonkeyPatch,
     * backend_name: attention backend name to force
     '''
     mpatch.setenv(STR_BACKEND_ENV_VAR, backend_name)
+
+
+def ref_masked_attention(query: torch.Tensor,
+                         key: torch.Tensor,
+                         value: torch.Tensor,
+                         scale: float,
+                         custom_mask: Optional[torch.Tensor] = None,
+                         q_seq_lens: Optional[List] = None,
+                         kv_seq_lens: Optional[List] = None) -> torch.Tensor:
+    '''
+    "Golden" masked attention reference. Supports two types of masking:
+
+    * Basic attention mask, utilizing {q,kv}_seq_lens args to mask out
+      padding elements
+    * Custom attention mask, which can force an arbitrary mask tensor, i.e.
+      causal
+
+    Arguments:
+
+    * query: batch_size x q_padded_seq_len x num_heads x head_size
+    * key: batch_size x kv_padded_seq_len x num_heads x head_size
+    * value: batch_size x kv_padded_seq_len x num_heads x head_size
+    * scale: Attention scale factor
+    * custom_mask: custom attention mask; good place to inject a causal
+      attention mask
+    * q_seq_lens: list of unpadded query seq_lens for each batch index
+    * kv_seq_lens: list of unpadded key/value seq_lens for each batch index
+
+    Returns:
+
+    * Attention result, batch_size x q_padded_seq_len x num_heads x head_size
+    '''
+
+    assert q_seq_lens is not None
+    assert kv_seq_lens is not None
+
+    batch_size = query.shape[0]
+    assert (len(q_seq_lens) == batch_size)
+    assert (len(kv_seq_lens) == batch_size)
+
+    attn_weights = scale * torch.einsum("bqhd,bkhd->bhqk", query, key).float()
+
+    # Basic attention mask, derived from seq lens
+    if (q_seq_lens is not None) or (kv_seq_lens is not None):
+        attn_mask = torch.zeros_like(attn_weights)
+        if q_seq_lens is not None:
+            for bdx, plen in enumerate(q_seq_lens):
+                attn_mask[bdx, :, plen:, :] = -torch.inf
+        if kv_seq_lens is not None:
+            for bdx, plen in enumerate(kv_seq_lens):
+                attn_mask[bdx, :, :, plen:] = -torch.inf
+
+        attn_weights = attn_weights + attn_mask.float()
+
+    # Custom attention mask
+    if custom_mask is not None:
+        attn_weights = attn_weights + custom_mask.float()
+
+    attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype)
+    out = torch.einsum("bhqk,bkhd->bqhd", attn_weights, value)
+    return out
+
+
+def make_qkv(
+    batch_size: int,
+    max_q_seq_len: int,
+    max_kv_seq_len: Optional[int],
+    num_heads: int,
+    head_size: int,
+    device: Union[torch.device, str],
+    force_kv_seq_lens: Optional[List[int]] = None,
+    attn_type: AttentionType = AttentionType.ENCODER_DECODER,
+    force_max_len: bool = False,
+) -> Tuple[QKVInputs, QKVInputs, QKVInputs]:
+    '''
+    Construct QKV test tensors for self- and cross-attention.
+
+    Generates three query/key/value triplets:
+
+    * "Baseline" query/key/value (for input to reference attention function)
+    * "Prefill" query/key/value (last sequence offset zero'd out, for use as
+      input to prefill kernel)
+    * "Decode" query/key/value (only the last sequence offset  from baseline,
+      for use as input to decode kernel)
+
+    Each Q/K/V triplet is associated with a list of q seqlens and a list of k/v
+    seqlens
+
+    Arguments:
+
+    * batch_size
+    * max_q_seq_len: max query seq len
+    * max_kv_seq_len: max key/value seq len
+    * num_heads
+    * head_size
+    * is_encoder_decoder_attn: if True, query seqlen may differ from 
+      key/value seqlen (as is often the case for cross-attention); 
+      o/w, query/key/value seqlens match at each batch index 
+      (max_kv_seq_len is unused)
+    * force_kv_seq_lens: if not None, overrides kv sequence lengths
+    * attn_type: encoder, decoder self, or enc/dec cross attention
+    * force_max_len: if True, all query seqlens are max_q_seq_len; o/w query
+      seqlens are random in [2,max_q_seq_lens]. Same for key/value seqlens
+      and max_kv_seq_len, unless forced by is_encoder_decoder_attn=False
+    * device: CPU or CUDA device
+
+    Returns:
+
+    * Overall QKVInputs structure (containing full unpacked Q/K/V tensors)
+    * Prefill QKVInputs structure (containing all but the last sequence offset)
+    * Decode QKVInputs structure (containing all only the last sequence offset)
+    '''
+
+    if force_max_len:
+        q_seq_lens = [max_q_seq_len for _ in range(batch_size)]
+    else:
+        q_seq_lens = [
+            random.randint(2, max_q_seq_len) for _ in range(batch_size)
+        ]
+    kv_seq_lens = None
+    if force_kv_seq_lens is not None:
+        kv_seq_lens = force_kv_seq_lens
+    elif attn_type != AttentionType.ENCODER_DECODER:
+        # K,V seq lens match Q for self-attention
+        kv_seq_lens = q_seq_lens
+    else:
+        # K,V seq lens are distinct from Q seq lens & random
+        assert max_kv_seq_len is not None
+        if force_max_len:
+            kv_seq_lens = [max_kv_seq_len] * batch_size
+        else:
+            kv_seq_lens = [
+                random.randint(2, max_kv_seq_len) for _ in range(batch_size)
+            ]
+
+    query = torch.rand(
+        (batch_size, max_q_seq_len, num_heads, head_size)).to(device)
+    key = torch.rand(
+        (batch_size, max_kv_seq_len, num_heads, head_size)).to(device)
+    value = torch.rand(
+        (batch_size, max_kv_seq_len, num_heads, head_size)).to(device)
+
+    prefill_query = torch.zeros(
+        (batch_size, max_q_seq_len, num_heads, head_size)).to(device)
+    prefill_key = torch.zeros(
+        (batch_size, max_kv_seq_len, num_heads, head_size)).to(device)
+    prefill_value = torch.zeros(
+        (batch_size, max_kv_seq_len, num_heads, head_size)).to(device)
+
+    decode_query = torch.zeros(
+        (batch_size, 1, num_heads, head_size)).to(device)
+    decode_key = torch.zeros((batch_size, 1, num_heads, head_size)).to(device)
+    decode_value = torch.zeros(
+        (batch_size, 1, num_heads, head_size)).to(device)
+
+    for bdx, (q_seq_len, kv_seq_len) in enumerate(zip(q_seq_lens,
+                                                      kv_seq_lens)):
+        query[bdx, q_seq_len:, :, :] = 0
+        key[bdx, kv_seq_len:, :, :] = 0
+        value[bdx, kv_seq_len:, :, :] = 0
+
+        prefill_query[bdx,
+                      0:(q_seq_len - 1), :, :] = query[bdx,
+                                                       0:(q_seq_len - 1), :, :]
+        prefill_key[bdx,
+                    0:(kv_seq_len - 1), :, :] = key[bdx,
+                                                    0:(kv_seq_len - 1), :, :]
+        prefill_value[bdx, 0:(kv_seq_len -
+                              1), :, :] = value[bdx, 0:(kv_seq_len - 1), :, :]
+
+        decode_query[bdx, :, :, :] = query[bdx,
+                                           (q_seq_len - 1):q_seq_len, :, :]
+        decode_key[bdx, :, :, :] = key[bdx, (kv_seq_len - 1):kv_seq_len, :, :]
+        decode_value[bdx, :, :, :] = value[bdx,
+                                           (kv_seq_len - 1):kv_seq_len, :, :]
+
+    prefill_q_seq_lens = [plen - 1 for plen in q_seq_lens]
+    prefill_kv_seq_lens = [plen - 1 for plen in kv_seq_lens]
+
+    decode_q_seq_lens = [1 for _ in q_seq_lens]
+    decode_kv_seq_lens = [1 for _ in kv_seq_lens]
+
+    return (
+        QKVInputs(
+            query,  # Overall QKV inputs
+            key,
+            value,
+            q_seq_lens,
+            kv_seq_lens),
+        QKVInputs(
+            prefill_query,  # Prefill subset of QKV sequences
+            prefill_key,
+            prefill_value,
+            prefill_q_seq_lens,
+            prefill_kv_seq_lens),
+        QKVInputs(
+            decode_query,  # Decode subset of KV sequences
+            decode_key,
+            decode_value,
+            decode_q_seq_lens,
+            decode_kv_seq_lens))
+
+
+def pack_tensor(
+        unpacked_tensor: torch.Tensor, seq_lens: List[int],
+        device: Union[torch.device, str]) -> Tuple[torch.Tensor, List[int]]:
+    '''
+    Pack a batch_size x padded_seq_len x num_heads x head_size tensor into an
+    unpadded number_of_tokens x num_heads x head_size tensor, where
+    number_of_tokens = sum(seq_lens)
+
+    Arguments:
+
+    * unpacked_tensor: batch_size x padded_seq_len x num_heads x head_size
+    * seq_lens: list of token counts for each seq
+    * device: CPU or CUDA device
+
+    Returns
+
+    * packed_tensor: number_of_tokens x num_heads x head_size
+    * start_loc_list: start idx of each batch elt in packed_tensor; [0] +
+      list(itertools.accumulate(seq_lens))
+    '''
+
+    num_tok = sum(seq_lens)
+    num_heads = unpacked_tensor.shape[-2]
+    head_size = unpacked_tensor.shape[-1]
+    start_loc_list = [0] + list(itertools.accumulate(seq_lens))
+    packed_tensor = torch.zeros((num_tok, num_heads, head_size), device=device)
+
+    for bdx, (seq_len, start_loc) in enumerate(zip(seq_lens, start_loc_list)):
+
+        packed_tensor[start_loc:(
+            start_loc + seq_len), :, :] = unpacked_tensor[bdx, :seq_len, :, :]
+
+    return packed_tensor, start_loc_list
+
+
+def pack_qkv(qkv: QKVInputs, device: Union[torch.device,
+                                           str]) -> PackedQKVInputs:
+    '''
+    Individually pack each of Q, K and V, each with dimensions batch_size x
+    padded_seq_len x num_heads x head_size, into respective number_of_tokens x
+    num_heads x head_size tensors.
+    
+    For Q, number_of_tokens = sum(q_seq_lens).
+
+    For K and V, number_of_tokens = sum(kv_seq_lens)
+
+    Arguments:
+
+    * qkv: Unpacked (batch_size x padded_seq_len x num_heads x head_size)
+           attention inputs
+    * device: CPU or CUDA device
+
+    Returns
+
+    * Packed (number_of_tokens x num_heads x head_size) QKV inputs
+      derived from unpacked inputs
+    '''
+
+    if qkv.query is None:
+        packed_query = None
+        q_start_loc_list = None
+    else:
+        packed_query, q_start_loc_list = pack_tensor(qkv.query,
+                                                     qkv.q_seq_lens,
+                                                     device=device)
+    packed_key, kv_start_loc_list = pack_tensor(qkv.key,
+                                                qkv.kv_seq_lens,
+                                                device=device)
+    packed_value, _ = pack_tensor(qkv.value, qkv.kv_seq_lens, device=device)
+    return PackedQKVInputs(
+        packed_query, packed_key, packed_value, q_start_loc_list,
+        kv_start_loc_list,
+        (None if q_start_loc_list is None else qkv.q_seq_lens),
+        qkv.kv_seq_lens)
+
+
+def make_backend(backend_name: str) -> AttentionBackend:
+    '''
+    Construct the backend instance determined by the backend_name string
+    argument.
+
+    "XFORMERS" -> construct xformers backend
+
+    TODO: other backends
+
+    Note: at time of writing the Attention wrapper automatically selects
+    its own backend for Attention.forward(); so the backend instance which
+    you generate with this function is not meant to be used for *running*
+    inference, but rather for generating compatible metadata structures
+    using backend.make_metadata()
+
+
+    Returns:
+
+    * Backend instance
+    '''
+    if backend_name == STR_XFORMERS_ATTN_VAL:
+        return XFormersBackend()
+    raise AssertionError(
+        f"Unrecognized backend_name {backend_name} for unit test")
+
+
+def _make_metadata_tensors(
+    seq_lens: Optional[List[int]], context_lens: Optional[List[int]],
+    encoder_seq_lens: Optional[List[int]], device: Union[torch.device, str]
+) -> Tuple[torch.Tensor, torch.Tensor, Any, Any, Optional[List[int]],
+           torch.Tensor, Optional[int]]:
+    '''
+    Build scalar & tensor values required to build attention metadata structure.
+
+    Arguments:
+
+    * seq_lens: list of token-counts for each decoder input seq
+    * context_lens: list of context length values for each seq
+    * encoder_seq_lens: list of token-counts for each encoder input seq
+    * device: CPU or CUDA device
+
+    Returns:
+
+    * seq_lens_tensor: decoder seq_lens list, as tensor
+    * context_lens_tensor: context_lens list, as tensor
+    * max_context_len: max(context_lens)
+    * max_seq_len: max(seq_lens)
+    * seq_start_loc: start idx of each sequence
+    * max_encoder_seq_len: encoder seq_lens list, as tensor
+    '''
+    seq_lens_tensor = maybe_make_int_tensor(seq_lens, device)
+    context_lens_tensor = maybe_make_int_tensor(context_lens, device)
+    max_context_len = maybe_max(context_lens)
+    max_seq_len = maybe_max(seq_lens)
+
+    encoder_seq_lens_tensor = maybe_make_int_tensor(encoder_seq_lens, device)
+    max_encoder_seq_len = (None if encoder_seq_lens is None else
+                           max(encoder_seq_lens))
+
+    seq_start_loc = None
+
+    return (seq_lens_tensor, context_lens_tensor, max_context_len, max_seq_len,
+            seq_start_loc, encoder_seq_lens_tensor, max_encoder_seq_len)
+
+
+def make_kv_cache(num_blocks: int,
+                  num_heads: int,
+                  head_size: int,
+                  block_size: int,
+                  device: Union[torch.device, str],
+                  default_val: float = 0.0) -> torch.Tensor:
+    '''
+    Create a fake KV cache.
+
+    Arguments:
+
+    * num_blocks: number of blocks in the KV cache
+    * num_heads: number of attention heads
+    * head_size: head dimension
+    * block_size: number of offsets within a block
+    * device: CPU or CUDA device
+    * default_val: initialization value for KV cache elements
+
+    Returns:
+
+    * kv_cache: 2 x num_blocks x (block_size * num_heads * head_size)
+    '''
+
+    kv_cache = torch.rand(
+        (2, num_blocks, block_size * num_heads * head_size)).to(device)
+    if default_val is not None:
+        kv_cache[:, :, :] = default_val
+    return kv_cache
+
+
+def _num_tokens_to_min_blocks(num_tokens: int, block_size: int) -> int:
+    '''
+    Compute the minimum number of blocks required to hold num_tokens tokens,
+    given block_size
+    '''
+    return (num_tokens + block_size) // block_size
+
+
+def make_empty_slot_mapping_tensor(device: Union[torch.device, str]):
+    return maybe_make_long_tensor([], device)
+
+
+def make_empty_block_tables_tensor(device: Union[torch.device, str]):
+    return torch.tensor([], device=device)
+
+
+def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: List[int],
+                       device: Union[torch.device, str]):
+    '''
+    Split a slot mapping into valid prefill- and decode-phase slot mappings.
+
+    Context:
+    * Your goal is to test (1) prefill of N prompts, with prompt-lengths
+      {K_i \\forall i \\in [0,N)}, followed by (2) decoding of a single token
+      for all N prompts (N tokens total); the resultant sequence lengths 
+      after decode would be {K_i + 1 for i \\in [0,N)}
+    * The test you want to do requires (1) having the prefill slot mapping 
+      for all tokens present during prefill, the number of which is 
+      M = \\sum_i{K_i}, and (2) having the decode slot mapping for all N 
+      decoded tokens
+    
+    This function consumes a single 1D slot mapping, which is the 
+    concatenation of N slot mappings each of length K_i + 1 (corresponding
+    to the  sequence lengths after decode), with a total length of
+    P = \\sum_i{K_i + 1} = M + N
+
+    The prefill-phase slot mapping results from excising the (K_i + 1)-th entry
+    from each of the N subsequences in the slot mapping (i.e. omitting the 
+    decoded token's mapping.)
+
+    The N excised entries are appended to obtain the decode-phase slot mapping
+
+    Arguments:
+
+    * slot_mapping_list: Length-P 1D slot mapping (as List) reflecting all N
+      post-decode sequences
+    * seq_lens: List of N post-decode sequence lengths (K_i + 1 in the 
+      description above)
+    * device: cuda, cpu, etc.
+
+    Returns:
+
+    * prefill_slot_mapping: Length-M 1D slot mapping (as Tensor) 
+      reflecting all N prefill prompts
+    * decode_slot_mapping: Length-N 1D slot mapping (as Tensor) reflecting 
+      all N decoded tokens
+    '''
+
+    prefill_slot_mapping = []
+    decode_slot_mapping = []
+
+    base_idx = 0
+    for seq_len in seq_lens:
+        prefill_slot_mapping.extend(slot_mapping_list[base_idx:(base_idx +
+                                                                seq_len - 1)])
+        decode_slot_mapping.append(slot_mapping_list[base_idx + seq_len - 1])
+        base_idx += seq_len
+
+    return (maybe_make_long_tensor(prefill_slot_mapping, device),
+            maybe_make_long_tensor(decode_slot_mapping, device))
+
+
+def make_block_tables_slot_mapping(
+        block_size: int,
+        seq_lens: List[int],
+        device: Union[torch.device, str],
+        block_base_addr: int = 0) -> Tuple[torch.Tensor, List[int], int]:
+    '''
+    Construct fake block tables & slot mappings.
+
+    For a sequence with num_tokens tokens the minimum number
+    of required KV cache blocks is
+
+    num_blocks = (num_tokens + block_size) // block_size
+
+    Then the minimum KV cache size in blocks is
+
+    total_cache_blocks = sum(num_blocks for all seqs) 
+
+    Then, the blocktable mapping counts downward from
+
+    block_base_addr + total_cache_blocks
+
+    to
+
+    block_base_addr
+    
+
+    The constructed block-tables and slot-mapping are sized to the
+    lengths of the sequences in their entirety (as reflected by seq_lens),
+    i.e. the total of prefill prompt tokens + decoded tokens.
+
+    Arguments:
+
+    * block_size: number of offsets per block
+    * seq_lens: list of token-counts for each sequence
+    * block_base_addr: the block table base address
+    * device: CPU or CUDA device
+
+    Return:
+
+    * block_tables_tensor: block table for sequence   
+    * slot_mapping_list: slot mapping for sequence
+    * max_block_idx: the highest block address within this block table
+    '''
+
+    # Provision minimum number of KV cache blocks
+    num_blocks_list = [
+        _num_tokens_to_min_blocks(num_tokens, block_size)
+        for num_tokens in seq_lens
+    ]
+    max_block_table_len = max(num_blocks_list)
+    block_table_pad_tokens = 10
+
+    block_tables = []
+    slot_mapping_list = []
+    # Compute uppermost address of block table
+    total_cache_blocks = sum(num_blocks_list)
+    block_base_idx = block_base_addr + total_cache_blocks
+    max_block_idx = block_base_idx
+    for sdx, num_tokens in enumerate(seq_lens):
+        num_blocks = num_blocks_list[sdx]
+        block_table = list(
+            range(block_base_idx, block_base_idx - num_blocks, -1))
+        for idx in range(num_tokens):
+            mapping_value = (
+                idx % block_size) + block_table[idx // block_size] * block_size
+            slot_mapping_list.append(mapping_value)
+
+        block_base_idx -= num_blocks
+        block_tables.append(block_table)
+
+    block_tables_tensor = make_tensor_with_pad(
+        block_tables,
+        max_len=max_block_table_len + block_table_pad_tokens,
+        pad=0,
+        dtype=torch.int,
+        device=device,
+    )
+
+    return (block_tables_tensor, slot_mapping_list, max_block_idx)
+
+
+def make_test_metadata(
+    attn_backend: AttentionBackend,
+    is_prompt: bool,
+    seq_lens: Optional[List[int]],
+    decoder_test_params: Optional[PhaseTestParameters],
+    device: Union[torch.device, str],
+    encoder_test_params: Optional[PhaseTestParameters] = None,
+    cross_test_params: Optional[PhaseTestParameters] = None
+) -> AttentionMetadata:
+    '''
+    Construct fake attention metadata for a given test phase
+    (prefill-phase or decode-phase).
+
+    encoder_test_params and cross_test_params arguments allow encoder
+    attention and enc/dec cross-attention (respectively) to use distinct
+    metadata values from decoder self-attention (decoder_test_params.)
+    
+    if encoder_test_params and cross_test_params are None, the attention
+    metadata will support decoder-only scenario.
+
+    Assumptions:
+
+    * No chunked prefill -> a batch is 100% prefill or 100% decode, never both
+
+    Arguments:
+
+    * attn_backend: Backend for sourcing attention kernels
+    * is_prompt: prefill if True, o/w decode
+    * seq_lens: list of token counts for each sequence
+    * decoder_test_params: decoder self-attention test params; 
+                           this function requires
+                           kv_mmap (memory mapping) field
+    * device: CPU or CUDA device
+    * encoder_test_params: encoder attention test params;
+                           this function requires encoder query
+                           sequence lengths field. If None,
+                           encoder query sequence lengths are
+                           treated as None
+    * cross_test_params: enc/dec cross-attention test params;
+                         this function requires kv_mmap field.
+                         If None, KV cache memory map data
+                         structures are treated as None
+
+    Return:
+
+    * AttentionMetadata structure
+    '''
+
+    # Decoder self-attention memory mapping
+    # decoder_test_params is None signals encoder-only
+    # scenario, so kv_mmap is None
+    kv_mmap = (None
+               if decoder_test_params is None else decoder_test_params.kv_mmap)
+
+    # This function constructs metadata assuming no chunked prefill,
+    # i.e. 100% prefill tokens or 100% decode tokens
+    #
+    # - If is_prompt, num_prefills_or_decodes is the number of prefills
+    #   and num_prefill_or_decode_tokens is the number of prefill tokens
+    # - If not is_prompt, num_prefills_or_decodes is the number of decodes
+    #   and num_prefill_or_decode_tokens is the number of decode tokens
+    #
+    # seq_lens is None signals encoder-only
+    # scenario, in which case num_prefills_or_decodes and
+    # num_prefill_or_decode_tokens are unused
+    num_prefills_or_decodes = (None if seq_lens is None else len(seq_lens))
+
+    num_prefill_or_decode_tokens = (None if seq_lens is None else (
+        sum(seq_lens) if is_prompt else len(seq_lens)))
+
+    # Seems for non-prefix-caching scenarios context_lens
+    # is never needed
+    context_lens = None
+
+    if encoder_test_params is None:
+        encoder_seq_lens = None
+        num_encoder_tokens = None
+    else:
+        # Encoder/decoder or encoder-only models only:
+        # * Extract encoder input sequence lengths
+        assert encoder_test_params.packed_qkvo.packed_qkv is not None
+        encoder_seq_lens = encoder_test_params.packed_qkvo.packed_qkv.q_seq_lens
+        num_encoder_tokens = (None if encoder_seq_lens is None else
+                              (sum(encoder_seq_lens)))
+
+    if cross_test_params is None:
+        cross_kv_mmap = None
+    else:
+        # Encoder/decoder or encoder-only models only:
+        # * Extract *cross-attention* slot_mapping and block table
+        #   (kv_mmap)
+        cross_kv_mmap = cross_test_params.kv_mmap
+
+    if is_prompt:
+        # Prefill-phase scenario
+
+        num_prefills = num_prefills_or_decodes
+        num_prefill_tokens = num_prefill_or_decode_tokens
+        num_decode_tokens = 0
+
+        (
+            seq_lens_tensor,
+            context_lens_tensor,
+            _,
+            _,
+            _,
+            encoder_seq_lens_tensor,
+            max_encoder_seq_len,
+        ) = _make_metadata_tensors(seq_lens,
+                                   context_lens,
+                                   encoder_seq_lens,
+                                   device=device)
+
+        return attn_backend.make_metadata(
+            num_prefills=num_prefills,
+            slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping),
+            num_prefill_tokens=num_prefill_tokens,
+            num_decode_tokens=num_decode_tokens,
+            seq_lens=seq_lens,
+            seq_lens_tensor=seq_lens_tensor,
+            max_prefill_seq_len=None if seq_lens is None else max(seq_lens),
+            max_decode_seq_len=0,
+            context_lens_tensor=context_lens_tensor,
+            block_tables=(None if kv_mmap is None else kv_mmap.block_tables),
+            use_cuda_graph=False,
+            num_encoder_tokens=num_encoder_tokens,
+            encoder_seq_lens=encoder_seq_lens,
+            encoder_seq_lens_tensor=encoder_seq_lens_tensor,
+            max_encoder_seq_len=max_encoder_seq_len,
+            cross_slot_mapping=(None if cross_kv_mmap is None else
+                                cross_kv_mmap.slot_mapping),
+            cross_block_tables=(None if cross_kv_mmap is None else
+                                cross_kv_mmap.block_tables))
+
+    else:  # not is_prompt
+        # Decode-phase scenario
+
+        assert kv_mmap is not None
+        assert num_prefill_or_decode_tokens is not None
+        assert seq_lens is not None
+
+        num_prefills = 0
+        num_prefill_tokens = 0
+        num_decode_tokens = num_prefill_or_decode_tokens
+
+        (
+            seq_lens_tensor,
+            context_lens_tensor,
+            _,
+            _,
+            _,
+            encoder_seq_lens_tensor,
+            max_encoder_seq_len,
+        ) = _make_metadata_tensors(seq_lens,
+                                   context_lens,
+                                   encoder_seq_lens,
+                                   device=device)
+
+        return attn_backend.make_metadata(
+            num_prefills=num_prefills,
+            slot_mapping=kv_mmap.slot_mapping,
+            num_prefill_tokens=num_prefill_tokens,
+            num_decode_tokens=num_decode_tokens,
+            seq_lens=seq_lens,
+            seq_lens_tensor=seq_lens_tensor,
+            max_prefill_seq_len=0,
+            max_decode_seq_len=max(seq_lens),
+            context_lens_tensor=context_lens_tensor,
+            block_tables=kv_mmap.block_tables,
+            use_cuda_graph=False,
+            num_encoder_tokens=num_encoder_tokens,
+            encoder_seq_lens=encoder_seq_lens,
+            encoder_seq_lens_tensor=encoder_seq_lens_tensor,
+            max_encoder_seq_len=max_encoder_seq_len,
+            cross_slot_mapping=(None if cross_kv_mmap is None else
+                                cross_kv_mmap.slot_mapping),
+            cross_block_tables=(None if cross_kv_mmap is None else
+                                cross_kv_mmap.block_tables))
+
+
+def assert_actual_matches_ideal(test_params: PhaseTestParameters,
+                                output_under_test: torch.Tensor) -> None:
+    '''
+    Assert that observed output matches the ideal output
+    contained in the test parameters data structure.
+
+    Arguments:
+
+    * test_params: Test parameters including packed ideal output
+    * output_under_test: actually observed output value
+    '''
+    ideal_output = test_params.packed_qkvo.ideal_output
+    assert torch.allclose(ideal_output,
+                          output_under_test.view_as(ideal_output))
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index 40768532f..adb832516 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -1,11 +1,18 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, fields
+from enum import Enum, auto
 from typing import (Any, Dict, Generic, List, Optional, Set, Tuple, Type,
                     TypeVar)
 
 import torch
 
 
+class AttentionType(Enum):
+    DECODER = auto()  # Decoder attention between previous layer Q/K/V
+    ENCODER = auto()  # Encoder attention between previous layer Q/K/V
+    ENCODER_DECODER = auto()  # Attention between dec. Q and enc. K/V
+
+
 class AttentionBackend(ABC):
     """Abstract class for attention backends."""
 
@@ -128,5 +135,6 @@ class AttentionImpl(ABC, Generic[T]):
         kv_cache: torch.Tensor,
         attn_metadata: T,
         kv_scale: float = 1.0,
+        attn_type: AttentionType = AttentionType.DECODER,
     ) -> torch.Tensor:
         raise NotImplementedError
diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py
index 7b4578fcd..fe4c4a45d 100644
--- a/vllm/attention/backends/blocksparse_attn.py
+++ b/vllm/attention/backends/blocksparse_attn.py
@@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple, Type
 import torch
 
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionMetadata)
+                                              AttentionMetadata, AttentionType)
 from vllm.attention.ops.blocksparse_attention.interface import (
     LocalStridedBlockSparseAttn, get_head_sliding_step)
 from vllm.attention.ops.paged_attn import PagedAttention
@@ -328,6 +328,7 @@ class BlocksparseFlashAttentionImpl(AttentionImpl):
         kv_cache: torch.Tensor,
         attn_metadata: BlocksparseFlashAttentionMetadata,
         kv_scale: float = 1.0,
+        attn_type: AttentionType = AttentionType.DECODER,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention and PagedAttention.
 
@@ -340,6 +341,12 @@ class BlocksparseFlashAttentionImpl(AttentionImpl):
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "BlocksparseFlashAttentionImpl")
+
         num_tokens, hidden_size = query.shape
         # Reshape the query, key, and value tensors.
         query = query.view(-1, self.num_heads, self.head_size)
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 8cb5c3101..048abed48 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -7,7 +7,7 @@ from vllm_flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache
 
 from vllm import _custom_ops as ops
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionMetadata)
+                                              AttentionMetadata, AttentionType)
 
 
 class FlashAttentionBackend(AttentionBackend):
@@ -257,6 +257,7 @@ class FlashAttentionImpl(AttentionImpl):
         kv_cache: torch.Tensor,
         attn_metadata: FlashAttentionMetadata,
         kv_scale: float = 1.0,
+        attn_type: AttentionType = AttentionType.DECODER,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention.
 
@@ -269,6 +270,12 @@ class FlashAttentionImpl(AttentionImpl):
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "FlashAttentionImpl")
+
         # NOTE(woosuk): FlashAttention does not support FP8 KV cache.
         assert kv_scale == 1.0, "kv_scale is not supported in FlashAttention."
 
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index a9ab23130..b27e3e40f 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -14,7 +14,7 @@ import torch
 
 from vllm import _custom_ops as ops
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionMetadata)
+                                              AttentionMetadata, AttentionType)
 
 
 class FlashInferBackend(AttentionBackend):
@@ -224,8 +224,14 @@ class FlashInferImpl(AttentionImpl):
         kv_cache: Optional[torch.Tensor],
         attn_metadata: FlashInferMetadata,
         kv_scale: float = 1.0,
+        attn_type: AttentionType = AttentionType.DECODER,
     ) -> torch.Tensor:
         assert kv_scale == 1.0
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "FlashInferImpl")
         num_tokens, hidden_size = query.shape
         query = query.view(-1, self.num_heads, self.head_size)
         key = key.view(-1, self.num_kv_heads, self.head_size)
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index 5114bfa6e..6a1295b10 100644
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
@@ -7,7 +7,7 @@ import torch
 
 from vllm._ipex_ops import ipex_ops
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionMetadata)
+                                              AttentionMetadata, AttentionType)
 from vllm.attention.ops.paged_attn import (PagedAttention,
                                            PagedAttentionMetadata)
 
@@ -157,6 +157,7 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
         kv_cache: Optional[torch.Tensor],
         attn_metadata: IpexAttnMetadata,  # type: ignore
         kv_scale: float = 1.0,
+        attn_type: AttentionType = AttentionType.DECODER,
     ) -> torch.Tensor:
         """Forward pass with IPEX varlen_attention and PagedAttention.
 
@@ -170,6 +171,11 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
             shape = [num_tokens, num_heads * head_size]
         """
         assert kv_scale == 1.0
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "IpexAttnBackendImpl")
         num_tokens, hidden_size = query.shape
         # Reshape the query, key, and value tensors.
         query = query.view(-1, self.num_heads, self.head_size)
diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index 22cb1a1bd..7a6954ceb 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -6,7 +6,7 @@ import torch_xla.experimental.custom_kernel  # Required to register custom ops.
 import torch_xla.experimental.dynamo_set_buffer_donor
 
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionMetadata)
+                                              AttentionMetadata, AttentionType)
 
 
 class PallasAttentionBackend(AttentionBackend):
@@ -132,6 +132,7 @@ class PallasAttentionBackendImpl(AttentionImpl):
         kv_cache: Tuple[Optional[torch.Tensor], Optional[torch.Tensor]],
         attn_metadata: PallasMetadata,
         kv_scale: float = 1.0,
+        attn_type: AttentionType = AttentionType.DECODER,
     ) -> torch.Tensor:
         """Forward pass with Pallas attention.
 
@@ -146,6 +147,11 @@ class PallasAttentionBackendImpl(AttentionImpl):
             shape = [batch_size, seq_len, num_heads * head_size]
         """
         assert kv_scale == 1.0
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "PallasAttentionBackendImpl")
         batch_size, seq_len, hidden_size = query.shape
         query = query.view(batch_size, seq_len, self.num_heads, self.head_size)
         key = key.view(batch_size, seq_len, self.num_kv_heads, self.head_size)
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 31ae07514..81b546c65 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -6,7 +6,7 @@ import torch
 
 import vllm.envs as envs
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionMetadata)
+                                              AttentionMetadata, AttentionType)
 from vllm.attention.ops.paged_attn import (PagedAttention,
                                            PagedAttentionMetadata)
 from vllm.logger import init_logger
@@ -297,6 +297,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
         kv_cache: torch.Tensor,
         attn_metadata: ROCmFlashAttentionMetadata,
         kv_scale: float = 1.0,
+        attn_type: AttentionType = AttentionType.DECODER,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention and PagedAttention.
 
@@ -309,6 +310,12 @@ class ROCmFlashAttentionImpl(AttentionImpl):
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "ROCmFlashAttentionImpl")
+
         num_tokens, hidden_size = query.shape
         # Reshape the query, key, and value tensors.
         query = query.view(-1, self.num_heads, self.head_size)
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index 63f8466da..48418f248 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -7,7 +7,7 @@ import torch
 from torch.nn.functional import scaled_dot_product_attention
 
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionMetadata)
+                                              AttentionMetadata, AttentionType)
 from vllm.attention.ops.paged_attn import PagedAttentionMetadata
 from vllm.utils import is_cpu
 
@@ -145,6 +145,7 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
         kv_cache: Optional[torch.Tensor],
         attn_metadata: TorchSDPAMetadata,  # type: ignore
         kv_scale: float = 1.0,
+        attn_type: AttentionType = AttentionType.DECODER,
     ) -> torch.Tensor:
         """Forward pass with torch SDPA and PagedAttention.
 
@@ -158,6 +159,11 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
             shape = [num_tokens, num_heads * head_size]
         """
         assert kv_scale == 1.0
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "TorchSDPABackendImpl")
         num_tokens, hidden_size = query.shape
         # Reshape the query, key, and value tensors.
         query = query.view(-1, self.num_heads, self.head_size)
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
new file mode 100644
index 000000000..a3cfc6e20
--- /dev/null
+++ b/vllm/attention/backends/utils.py
@@ -0,0 +1,7 @@
+"""Attention backend utils"""
+
+# Error string(s) for encoder/decoder
+# unsupported attention scenarios
+
+STR_NOT_IMPL_ENC_DEC_ROCM_HIP = ("ROCm/HIP is not currently supported "
+                                 "with encoder/decoder models.")
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index ff449c3ff..6cc5f1d14 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -6,10 +6,11 @@ import torch
 from xformers import ops as xops
 from xformers.ops.fmha.attn_bias import (AttentionBias,
                                          BlockDiagonalCausalMask,
+                                         BlockDiagonalMask,
                                          LowerTriangularMaskWithTensorBias)
 
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionMetadata)
+                                              AttentionMetadata, AttentionType)
 from vllm.attention.ops.paged_attn import (PagedAttention,
                                            PagedAttentionMetadata)
 from vllm.logger import init_logger
@@ -66,11 +67,6 @@ class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata):
     dynamically, it should be stored in tensor. The tensor has to be
     updated from `CUDAGraphRunner.forward` API.
     """
-    # (batch_size,). The sequence length per sequence. Sequence length means
-    # the computed tokens + new tokens None if it is a decoding.
-    seq_lens: Optional[List[int]]
-    # seq_lens stored as a tensor.
-    seq_lens_tensor: Optional[torch.Tensor]
 
     # |---------- N-1 iteration --------|
     # |---------------- N iteration ---------------------|
@@ -79,8 +75,9 @@ class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata):
     # |-------------------- seq_len ----------------------|
     #                                   |-- query_len ---|
 
-    # Maximum query length in the batch. None for decoding.
-    max_query_len: Optional[int]
+    # seq_lens stored as a tensor.
+    seq_lens_tensor: Optional[torch.Tensor]
+
     # FIXME: It is for flash attn.
     # Maximum sequence length among prefill batch. 0 if there are decoding
     # requests only.
@@ -88,26 +85,55 @@ class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata):
     # Maximum sequence length among decode batch. 0 if there are prefill
     # requests only.
     max_decode_seq_len: int
-    # (batch_size + 1,). The cumulative subquery lengths of the sequences in
-    # the batch, used to index into subquery. E.g., if the subquery length
-    # is [4, 6], it is [0, 4, 10].
-    query_start_loc: Optional[torch.Tensor]
+
+    # Whether or not if cuda graph is enabled.
+    # Cuda-graph is currently enabled for decoding only.
+    # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention.
+    use_cuda_graph: bool
+
+    # (batch_size,). The sequence length per sequence. Sequence length means
+    # the computed tokens + new tokens None if it is a decoding.
+    seq_lens: Optional[List[int]] = None
+
     # FIXME: It is for flash attn.
     # (batch_size + 1,). The cumulative sequence lengths of the sequences in
     # the batch, used to index into sequence. E.g., if the sequence length is
     # [4, 6], it is [0, 4, 10].
-    seq_start_loc: Optional[torch.Tensor]
+    seq_start_loc: Optional[torch.Tensor] = None
+
     # (batch_size,) A tensor of context lengths (tokens that are computed
     # so far).
-    context_lens_tensor: Optional[torch.Tensor]
+    context_lens_tensor: Optional[torch.Tensor] = None
 
-    # Whether or not if cuda graph is enabled.
-    # Cuda-graph is currently enabled for decoding only.
-    # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention.
-    use_cuda_graph: bool
+    # Maximum query length in the batch. None for decoding.
+    max_query_len: Optional[int] = None
+
+    # (batch_size + 1,). The cumulative subquery lengths of the sequences in
+    # the batch, used to index into subquery. E.g., if the subquery length
+    # is [4, 6], it is [0, 4, 10].
+    query_start_loc: Optional[torch.Tensor] = None
+
+    # Self-attention prefill/decode metadata cache
     _cached_prefill_metadata: Optional["XFormersMetadata"] = None
     _cached_decode_metadata: Optional["XFormersMetadata"] = None
 
+    # Begin encoder attn & enc/dec cross-attn fields...
+
+    # Encoder sequence lengths representation
+    encoder_seq_lens: Optional[List[int]] = None
+    encoder_seq_lens_tensor: Optional[torch.Tensor] = None
+
+    # Maximum sequence length among encoder sequences
+    max_encoder_seq_len: Optional[int] = None
+
+    # Number of tokens input to encoder
+    num_encoder_tokens: Optional[int] = None
+
+    # Cross-attention memory-mapping data structures: slot mapping
+    # and block tables
+    cross_slot_mapping: Optional[torch.Tensor] = None
+    cross_block_tables: Optional[torch.Tensor] = None
+
     def __post_init__(self):
         # Set during the execution of the first attention op.
         # It is a list because it is needed to set per prompt
@@ -115,6 +141,28 @@ class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata):
         # from xformer API.
         # will not appear in the __repr__ and __init__
         self.attn_bias: Optional[List[AttentionBias]] = None
+        self.encoder_attn_bias: Optional[List[AttentionBias]] = None
+        self.cross_attn_bias: Optional[List[AttentionBias]] = None
+
+    @property
+    def is_all_encoder_attn_metadata_set(self):
+        '''
+        All attention metadata required for encoder attention is set.
+        '''
+        return ((self.encoder_seq_lens is not None)
+                and (self.encoder_seq_lens_tensor is not None)
+                and (self.max_encoder_seq_len is not None))
+
+    @property
+    def is_all_cross_attn_metadata_set(self):
+        '''
+        All attention metadata required for enc/dec cross-attention is set.
+
+        Superset of encoder attention required metadata.
+        '''
+        return (self.is_all_encoder_attn_metadata_set
+                and (self.cross_slot_mapping is not None)
+                and (self.cross_block_tables is not None))
 
     @property
     def prefill_metadata(self) -> Optional["XFormersMetadata"]:
@@ -122,30 +170,50 @@ class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata):
             return None
 
         if self._cached_prefill_metadata is not None:
+            # Recover cached prefill-phase attention
+            # metadata structure
             return self._cached_prefill_metadata
 
-        assert self.seq_lens is not None
-        assert self.seq_lens_tensor is not None
-        assert self.query_start_loc is not None
-        assert self.context_lens_tensor is not None
-        assert self.block_tables is not None
-
+        assert ((self.seq_lens is not None)
+                or (self.encoder_seq_lens is not None))
+        assert ((self.seq_lens_tensor is not None)
+                or (self.encoder_seq_lens_tensor is not None))
+
+        # Compute some attn_metadata fields which default to None
+        query_start_loc = (None if self.query_start_loc is None else
+                           self.query_start_loc[:self.num_prefills + 1])
+        slot_mapping = (None if self.slot_mapping is None else
+                        self.slot_mapping[:self.num_prefill_tokens])
+        seq_lens = (None if self.seq_lens is None else
+                    self.seq_lens[:self.num_prefills])
+        seq_lens_tensor = (None if self.seq_lens_tensor is None else
+                           self.seq_lens_tensor[:self.num_prefills])
+        context_lens_tensor = (None if self.context_lens_tensor is None else
+                               self.context_lens_tensor[:self.num_prefills])
+        block_tables = (None if self.block_tables is None else
+                        self.block_tables[:self.num_prefills])
+
+        # Construct & cache prefill-phase attention metadata structure
         self._cached_prefill_metadata = XFormersMetadata(
             num_prefills=self.num_prefills,
             num_prefill_tokens=self.num_prefill_tokens,
             num_decode_tokens=0,
-            slot_mapping=self.slot_mapping[:self.num_prefill_tokens],
-            seq_lens=self.seq_lens[:self.num_prefills],
-            seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills],
+            slot_mapping=slot_mapping,
+            seq_lens=seq_lens,
+            seq_lens_tensor=seq_lens_tensor,
             max_query_len=self.max_query_len,
             max_prefill_seq_len=self.max_prefill_seq_len,
             max_decode_seq_len=0,
-            query_start_loc=self.query_start_loc[:self.num_prefills + 1],
-            seq_start_loc=None,
-            context_lens_tensor=self.context_lens_tensor[:self.num_prefills],
-            block_tables=self.block_tables[:self.num_prefills],
+            query_start_loc=query_start_loc,
+            context_lens_tensor=context_lens_tensor,
+            block_tables=block_tables,
             use_cuda_graph=False,
-        )
+            # Begin encoder & cross attn fields below...
+            encoder_seq_lens=self.encoder_seq_lens,
+            encoder_seq_lens_tensor=self.encoder_seq_lens_tensor,
+            max_encoder_seq_len=self.max_encoder_seq_len,
+            cross_slot_mapping=self.cross_slot_mapping,
+            cross_block_tables=self.cross_block_tables)
         return self._cached_prefill_metadata
 
     @property
@@ -154,29 +222,146 @@ class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata):
             return None
 
         if self._cached_decode_metadata is not None:
+            # Recover cached decode-phase attention
+            # metadata structure
             return self._cached_decode_metadata
-        assert self.block_tables is not None
-        assert self.seq_lens_tensor is not None
-
+        assert ((self.seq_lens_tensor is not None)
+                or (self.encoder_seq_lens_tensor is not None))
+
+        # Compute some attn_metadata fields which default to None
+        slot_mapping = (None if self.slot_mapping is None else
+                        self.slot_mapping[self.num_prefill_tokens:])
+        seq_lens_tensor = (None if self.seq_lens_tensor is None else
+                           self.seq_lens_tensor[self.num_prefills:])
+        block_tables = (None if self.block_tables is None else
+                        self.block_tables[self.num_prefills:])
+
+        # Construct & cache decode-phase attention metadata structure
         self._cached_decode_metadata = XFormersMetadata(
             num_prefills=0,
             num_prefill_tokens=0,
             num_decode_tokens=self.num_decode_tokens,
-            slot_mapping=self.slot_mapping[self.num_prefill_tokens:],
-            seq_lens=None,
-            seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:],
-            max_query_len=None,
+            slot_mapping=slot_mapping,
+            seq_lens_tensor=seq_lens_tensor,
             max_prefill_seq_len=0,
             max_decode_seq_len=self.max_decode_seq_len,
-            query_start_loc=None,
-            seq_start_loc=None,
-            context_lens_tensor=None,
-            block_tables=self.block_tables[self.num_prefills:],
+            block_tables=block_tables,
             use_cuda_graph=self.use_cuda_graph,
-        )
+            # Begin encoder & cross attn fields below...
+            encoder_seq_lens=self.encoder_seq_lens,
+            encoder_seq_lens_tensor=self.encoder_seq_lens_tensor,
+            max_encoder_seq_len=self.max_encoder_seq_len,
+            cross_slot_mapping=self.cross_slot_mapping,
+            cross_block_tables=self.cross_block_tables)
         return self._cached_decode_metadata
 
 
+def _get_attn_bias(
+    attn_metadata: XFormersMetadata,
+    attn_type: AttentionType,
+) -> Optional[AttentionBias]:
+    '''
+    Extract appropriate attention bias from attention metadata
+    according to attention type.
+
+    Arguments:
+
+    * attn_metadata: Attention metadata structure associated with attention
+    * attn_type: encoder attention, decoder self-attention,
+                 encoder/decoder cross-attention
+
+    Returns:
+    * Appropriate attention bias value given the attention type
+    '''
+
+    if attn_type == AttentionType.DECODER:
+        return attn_metadata.attn_bias
+    elif attn_type == AttentionType.ENCODER:
+        return attn_metadata.encoder_attn_bias
+    else:
+        # attn_type == AttentionType.ENCODER_DECODER
+        return attn_metadata.cross_attn_bias
+
+
+def _set_attn_bias(
+    attn_metadata: XFormersMetadata,
+    attn_bias: List[Optional[AttentionBias]],
+    attn_type: AttentionType,
+) -> None:
+    '''
+    Update appropriate attention bias field of attention metadata,
+    according to attention type.
+
+    Arguments:
+
+    * attn_metadata: Attention metadata structure associated with attention
+    * attn_bias: The desired attention bias value
+    * attn_type: encoder attention, decoder self-attention,
+                 encoder/decoder cross-attention
+    '''
+
+    if attn_type == AttentionType.DECODER:
+        attn_metadata.attn_bias = attn_bias
+    elif attn_type == AttentionType.ENCODER:
+        attn_metadata.encoder_attn_bias = attn_bias
+    elif attn_type == AttentionType.ENCODER_DECODER:
+        attn_metadata.cross_attn_bias = attn_bias
+    else:
+        raise AttributeError(f"Invalid attention type {str(attn_type)}")
+
+
+def _get_seq_len_block_table_args(
+    attn_metadata: XFormersMetadata,
+    is_prompt: bool,
+    attn_type: AttentionType,
+) -> tuple:
+    '''
+    The particular choice of sequence-length- and block-table-related
+    attributes which should be extracted from attn_metadata is dependent
+    on the type of attention operation.
+
+    Decoder attn -> select entirely decoder self-attention-related fields
+    Encoder/decoder cross-attn -> select encoder sequence lengths & 
+                                  cross-attn block-tables fields
+    Encoder attn -> select encoder sequence lengths fields & no block tables
+    
+    Arguments:
+
+    * attn_metadata: Attention metadata structure associated with attention op
+    * is_prompt: True if prefill, False otherwise
+    * attn_type: encoder attention, decoder self-attention,
+                 encoder/decoder cross-attention
+
+    Returns:
+
+    * Appropriate sequence-lengths tensor
+    * Appropriate max sequence-length scalar
+    * Appropriate block tables (or None)
+    '''
+
+    if attn_type == AttentionType.DECODER:
+        # Decoder self-attention
+        # Choose max_seq_len based on whether we are in prompt_run
+        if is_prompt:
+            max_seq_len = attn_metadata.max_prefill_seq_len
+        else:
+            max_seq_len = attn_metadata.max_decode_seq_len
+        return (attn_metadata.seq_lens_tensor, max_seq_len,
+                attn_metadata.block_tables)
+    elif attn_type == AttentionType.ENCODER_DECODER:
+        # Enc/dec cross-attention KVs match encoder sequence length;
+        # cross-attention utilizes special "cross" block tables
+        return (attn_metadata.encoder_seq_lens_tensor,
+                attn_metadata.max_encoder_seq_len,
+                attn_metadata.cross_block_tables)
+    elif attn_type == AttentionType.ENCODER:
+        # No block tables associated with encoder attention
+        return (attn_metadata.encoder_seq_lens_tensor,
+                attn_metadata.max_encoder_seq_len, None)
+    else:
+        raise AttributeError(f"Invalid attention type {str(attn_type)}")
+
+
 class XFormersImpl(AttentionImpl[XFormersMetadata]):
     """
     If the input tensors contain prompt tokens, the layout is as follows:
@@ -238,51 +423,144 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
     def forward(
         self,
         query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
+        key: Optional[torch.Tensor],
+        value: Optional[torch.Tensor],
         kv_cache: Optional[torch.Tensor],
         attn_metadata: "XFormersMetadata",
         kv_scale: float = 1.0,
+        attn_type: AttentionType = AttentionType.DECODER,
     ) -> torch.Tensor:
         """Forward pass with xFormers and PagedAttention.
 
+        For decoder-only models: query, key and value must be non-None.
+
+        For encoder/decoder models:
+        * XFormersImpl.forward() may be invoked for both self- and cross-
+          attention layers.
+        * For self-attention: query, key and value must be non-None.
+        * For cross-attention:
+            * Query must be non-None
+            * During prefill, key and value must be non-None; key and value
+              get cached for use during decode.
+            * During decode, key and value may be None, since:
+              (1) key and value tensors were cached during prefill, and
+              (2) cross-attention key and value tensors do not grow during
+                  decode
+        
+        A note on how the attn_type (attention type enum) argument impacts
+        attention forward() behavior:
+    
+            * DECODER: normal decoder-only behavior;
+                use decoder self-attention block table
+            * ENCODER: no KV caching; pass encoder sequence
+                attributes (encoder_seq_lens/encoder_seq_lens_tensor/
+                max_encoder_seq_len) to kernel, in lieu of decoder
+                sequence attributes (seq_lens/seq_lens_tensor/max_seq_len)
+            * ENCODER_DECODER: cross-attention behavior;
+                use cross-attention block table for caching KVs derived
+                from encoder hidden states; since KV sequence lengths
+                will match encoder sequence lengths, pass encoder sequence
+                attributes to kernel (encoder_seq_lens/encoder_seq_lens_tensor/
+                max_encoder_seq_len)
+    
         Args:
             query: shape = [num_tokens, num_heads * head_size]
             key: shape = [num_tokens, num_kv_heads * head_size]
             value: shape = [num_tokens, num_kv_heads * head_size]
             kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
             attn_metadata: Metadata for attention.
+            attn_type: Select attention type, between encoder attention,
+                       decoder self-attention, or encoder/decoder cross-
+                       attention. Defaults to decoder self-attention,
+                       which is the vLLM default generally
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        query = query.view(-1, self.num_heads, self.head_size)
-        key = key.view(-1, self.num_kv_heads, self.head_size)
-        value = value.view(-1, self.num_kv_heads, self.head_size)
 
-        if kv_cache is not None:
+        # Check that appropriate attention metadata attributes are
+        # selected for the desired attention type
+        if (attn_type == AttentionType.ENCODER
+                and (not attn_metadata.is_all_encoder_attn_metadata_set)):
+            raise AttributeError("Encoder attention requires setting "
+                                 "encoder metadata attributes.")
+        elif (attn_type == AttentionType.ENCODER_DECODER
+              and (not attn_metadata.is_all_cross_attn_metadata_set)):
+            raise AttributeError("Encoder/decoder cross-attention "
+                                 "requires setting cross-attention "
+                                 "metadata attributes.")
+
+        query = query.view(-1, self.num_heads, self.head_size)
+        if key is not None:
+            assert value is not None
+            key = key.view(-1, self.num_kv_heads, self.head_size)
+            value = value.view(-1, self.num_kv_heads, self.head_size)
+        else:
+            assert value is None
+
+        # Self-attention vs. cross-attention will impact
+        # which KV cache memory-mapping & which
+        # seqlen datastructures we utilize
+
+        if (attn_type != AttentionType.ENCODER and kv_cache is not None):
+            # KV-cache during decoder-self- or
+            # encoder-decoder-cross-attention, but not
+            # during encoder attention.
+            #
+            # Even if there are no new key/value pairs to cache,
+            # we still need to break out key_cache and value_cache
+            # i.e. for later use by paged attention
             key_cache, value_cache = PagedAttention.split_kv_cache(
                 kv_cache, self.num_kv_heads, self.head_size)
 
-            # Reshape the input keys and values and store them in the cache.
-            # If kv_cache is not provided, the new key and value tensors are
-            # not cached. This happens during the initial memory profiling run.
-            PagedAttention.write_to_paged_cache(key, value, key_cache,
-                                                value_cache,
-                                                attn_metadata.slot_mapping,
-                                                self.kv_cache_dtype, kv_scale)
-
-        num_prefill_tokens = attn_metadata.num_prefill_tokens
-        num_decode_tokens = attn_metadata.num_decode_tokens
-        assert key.shape[0] == num_prefill_tokens + num_decode_tokens
-        assert value.shape[0] == num_prefill_tokens + num_decode_tokens
+            if (key is not None) and (value is not None):
+
+                if attn_type == AttentionType.ENCODER_DECODER:
+                    # Update cross-attention KV cache (prefill-only)
+                    # During cross-attention decode, key & value will be None,
+                    # preventing this IF-statement branch from running
+                    updated_slot_mapping = attn_metadata.cross_slot_mapping
+                else:
+                    # Update self-attention KV cache (prefill/decode)
+                    updated_slot_mapping = attn_metadata.slot_mapping
+
+                # Reshape the input keys and values and store them in the cache.
+                # If kv_cache is not provided, the new key and value tensors are
+                # not cached. This happens during the initial memory
+                # profiling run.
+                PagedAttention.write_to_paged_cache(key, value, key_cache,
+                                                    value_cache,
+                                                    updated_slot_mapping,
+                                                    self.kv_cache_dtype,
+                                                    kv_scale)
+
+        if attn_type != AttentionType.ENCODER:
+            # Decoder self-attention supports chunked prefill.
+            # Encoder/decoder cross-attention requires no chunked
+            # prefill (100% prefill or 100% decode tokens, no mix)
+            num_prefill_tokens = attn_metadata.num_prefill_tokens
+            num_decode_tokens = attn_metadata.num_decode_tokens
+        else:
+            # Encoder attention - chunked prefill is not applicable;
+            # derive token-count from query shape & and treat them
+            # as 100% prefill tokens
+            assert attn_metadata.num_encoder_tokens is not None
+            num_prefill_tokens = attn_metadata.num_encoder_tokens
+            num_decode_tokens = 0
+
+        if attn_type == AttentionType.DECODER:
+            # Only enforce this shape-constraint for decoder
+            # self-attention
+            assert key.shape[0] == num_prefill_tokens + num_decode_tokens
+            assert value.shape[0] == num_prefill_tokens + num_decode_tokens
 
         output = torch.empty_like(query)
         # Query for decode. KV is not needed because it is already cached.
         decode_query = query[num_prefill_tokens:]
         # QKV for prefill.
         query = query[:num_prefill_tokens]
-        key = key[:num_prefill_tokens]
-        value = value[:num_prefill_tokens]
+        if key is not None and value is not None:
+            key = key[:num_prefill_tokens]
+            value = value[:num_prefill_tokens]
 
         assert query.shape[0] == num_prefill_tokens
         assert decode_query.shape[0] == num_decode_tokens
@@ -294,10 +572,14 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
                 # block tables are empty if the prompt does not have a cached
                 # prefix.
                 out = self._run_memory_efficient_xformers_forward(
-                    query, key, value, prefill_meta)
+                    query, key, value, prefill_meta, attn_type=attn_type)
                 assert out.shape == output[:num_prefill_tokens].shape
                 output[:num_prefill_tokens] = out
             else:
+
+                assert prefill_meta.query_start_loc is not None
+                assert prefill_meta.max_query_len is not None
+
                 # prefix-enabled attention
                 # TODO(Hai) this triton kernel has regression issue (broke) to
                 # deal with different data types between KV and FP8 KV cache,
@@ -320,13 +602,20 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
                 output[:num_prefill_tokens] = out
 
         if decode_meta := attn_metadata.decode_metadata:
+
+            (
+                seq_lens_arg,
+                max_seq_len_arg,
+                block_tables_arg,
+            ) = _get_seq_len_block_table_args(decode_meta, False, attn_type)
+
             output[num_prefill_tokens:] = PagedAttention.forward_decode(
                 decode_query,
                 key_cache,
                 value_cache,
-                decode_meta.block_tables,
-                decode_meta.seq_lens_tensor,
-                decode_meta.max_decode_seq_len,
+                block_tables_arg,
+                seq_lens_arg,
+                max_seq_len_arg,
                 self.kv_cache_dtype,
                 self.num_kv_heads,
                 self.scale,
@@ -343,6 +632,7 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
         key: torch.Tensor,
         value: torch.Tensor,
         attn_metadata: XFormersMetadata,
+        attn_type: AttentionType = AttentionType.DECODER,
     ) -> torch.Tensor:
         """Attention for 1D query of multiple prompts. Multiple prompt
         tokens are flattened in to `query` input.
@@ -356,8 +646,12 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
             key: shape = [num_prefill_tokens, num_kv_heads, head_size]
             value: shape = [num_prefill_tokens, num_kv_heads, head_size]
             attn_metadata: Metadata for attention.
+            attn_type: Select attention type, between encoder attention,
+                       decoder self-attention, or encoder/decoder cross-
+                       attention. Defaults to decoder self-attention,
+                       which is the vLLM default generally
         """
-        assert attn_metadata.seq_lens is not None
+
         original_query = query
         if self.num_kv_heads != self.num_heads:
             # GQA/MQA requires the shape [B, M, G, H, K].
@@ -375,18 +669,39 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
         # Set attention bias if not provided. This typically happens at
         # the very attention layer of every iteration.
         # FIXME(woosuk): This is a hack.
-        if attn_metadata.attn_bias is None:
+        attn_bias = _get_attn_bias(attn_metadata, attn_type)
+        if attn_bias is None:
             if self.alibi_slopes is None:
-                attn_bias = BlockDiagonalCausalMask.from_seqlens(
-                    attn_metadata.seq_lens)
+                if (attn_type == AttentionType.ENCODER_DECODER):
+                    assert attn_metadata.seq_lens is not None
+                    assert attn_metadata.encoder_seq_lens is not None
+
+                    # Default enc/dec cross-attention mask is non-causal
+                    attn_bias = BlockDiagonalMask.from_seqlens(
+                        attn_metadata.seq_lens, attn_metadata.encoder_seq_lens)
+                elif attn_type == AttentionType.ENCODER:
+                    assert attn_metadata.encoder_seq_lens is not None
+
+                    # Default encoder self-attention mask is non-causal
+                    attn_bias = BlockDiagonalMask.from_seqlens(
+                        attn_metadata.encoder_seq_lens)
+                else:
+                    assert attn_metadata.seq_lens is not None
+
+                    # Default decoder self-attention mask is causal
+                    attn_bias = BlockDiagonalCausalMask.from_seqlens(
+                        attn_metadata.seq_lens)
                 if self.sliding_window is not None:
                     attn_bias = attn_bias.make_local_attention(
                         self.sliding_window)
-                attn_metadata.attn_bias = [attn_bias]
+                attn_bias = [attn_bias]
             else:
-                attn_metadata.attn_bias = _make_alibi_bias(
-                    self.alibi_slopes, self.num_kv_heads, query.dtype,
-                    attn_metadata.seq_lens)
+                assert attn_metadata.seq_lens is not None
+                attn_bias = _make_alibi_bias(self.alibi_slopes,
+                                             self.num_kv_heads, query.dtype,
+                                             attn_metadata.seq_lens)
+
+            _set_attn_bias(attn_metadata, attn_bias, attn_type)
 
         # No alibi slopes.
         # TODO(woosuk): Too many view operations. Let's try to reduce
@@ -400,7 +715,7 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
                 query,
                 key,
                 value,
-                attn_bias=attn_metadata.attn_bias[0],
+                attn_bias=attn_bias[0],
                 p=0.0,
                 scale=self.scale)
             return out.view_as(original_query)
@@ -409,6 +724,7 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
         # FIXME(woosuk): Because xformers does not support dynamic sequence
         # lengths with custom attention bias, we process each prompt one by
         # one. This is inefficient, especially when we have many short prompts.
+        assert attn_metadata.seq_lens is not None
         output = torch.empty_like(original_query)
         start = 0
         for i, seq_len in enumerate(attn_metadata.seq_lens):
@@ -417,7 +733,7 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
                 query[None, start:end],
                 key[None, start:end],
                 value[None, start:end],
-                attn_bias=attn_metadata.attn_bias[i],
+                attn_bias=attn_bias[i],
                 p=0.0,
                 scale=self.scale)
             # TODO(woosuk): Unnecessary copy. Optimize.
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index dfe93be46..b8cc87be8 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional
 import torch
 import torch.nn as nn
 
-from vllm.attention.backends.abstract import AttentionMetadata
+from vllm.attention.backends.abstract import AttentionMetadata, AttentionType
 from vllm.attention.selector import get_attn_backend
 from vllm.config import CacheConfig
 from vllm.model_executor.layers.quantization.base_config import (
@@ -90,9 +90,16 @@ class Attention(nn.Module):
         value: torch.Tensor,
         kv_cache: Optional[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        attn_type: AttentionType = AttentionType.DECODER,
     ) -> torch.Tensor:
-        return self.impl.forward(query, key, value, kv_cache, attn_metadata,
-                                 self._kv_scale)
+
+        return self.impl.forward(query,
+                                 key,
+                                 value,
+                                 kv_cache,
+                                 attn_metadata,
+                                 self._kv_scale,
+                                 attn_type=attn_type)
 
     def extra_repr(self) -> str:
         s = f"head_size={self.impl.head_size}"  # type: ignore
-- 
GitLab


From 185ad31f37541ac205b55f446bfd71542f83075a Mon Sep 17 00:00:00 2001
From: Eric <ericperfectttt@gmail.com>
Date: Tue, 9 Jul 2024 02:23:24 +0800
Subject: [PATCH 286/376] [Bugfix] use diskcache in outlines _get_guide  #5436 
 (#6203)

---
 .../guided_decoding/outlines_logits_processors.py            | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
index 1618705ff..1c8f6cccb 100644
--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -21,6 +21,7 @@ from functools import lru_cache
 from typing import Callable, DefaultDict, Dict, List, Union
 
 import torch
+from outlines.caching import cache
 from outlines.fsm.guide import CFGGuide, Generate, Guide, RegexGuide, Write
 from outlines.fsm.json_schema import build_regex_from_schema
 from pydantic import BaseModel
@@ -67,7 +68,7 @@ class BaseLogitsProcessor:
 class RegexLogitsProcessor(BaseLogitsProcessor):
 
     @classmethod
-    @lru_cache(maxsize=32)
+    @cache()
     def _get_guide(cls, regex_string: str,
                    tokenizer: PreTrainedTokenizerBase) -> Guide:
         tokenizer = _adapt_tokenizer(tokenizer)
@@ -126,7 +127,7 @@ class JSONLogitsProcessor(RegexLogitsProcessor):
 class CFGLogitsProcessor(BaseLogitsProcessor):
 
     @classmethod
-    @lru_cache(maxsize=32)
+    @cache()
     def _get_guide(cls, cfg: str, tokenizer: PreTrainedTokenizerBase) -> Guide:
         tokenizer = _adapt_tokenizer(tokenizer)
         return CFGGuide(cfg, tokenizer)
-- 
GitLab


From ddc369fba147046f5044aaddbb867b5333f7068c Mon Sep 17 00:00:00 2001
From: tomeras91 <57313761+tomeras91@users.noreply.github.com>
Date: Mon, 8 Jul 2024 21:25:51 +0300
Subject: [PATCH 287/376] [Bugfix] Mamba cache Cuda Graph padding (#6214)

---
 tests/models/test_jamba.py          | 28 ++++++++++++++++++++++++++++
 vllm/model_executor/models/jamba.py |  4 ++--
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/tests/models/test_jamba.py b/tests/models/test_jamba.py
index d7e3a2fc4..0a5fe19f8 100644
--- a/tests/models/test_jamba.py
+++ b/tests/models/test_jamba.py
@@ -1,5 +1,7 @@
 import pytest
 
+from vllm.worker.model_runner import _get_graph_batch_size
+
 MODELS = ["ai21labs/Jamba-tiny-random"]
 
 
@@ -32,6 +34,32 @@ def test_models(
             f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
 
 
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [20])
+def test_mamba_cache_cg_padding(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    # This test is for verifying that mamba cache is padded to CG captured
+    # batch size. If it's not, a torch RuntimeError will be raised because
+    # tensor dimensions aren't compatible
+    while len(example_prompts) == _get_graph_batch_size(len(example_prompts)):
+        example_prompts.append(example_prompts[0])
+
+    try:
+        with vllm_runner(model, dtype=dtype) as vllm_model:
+            vllm_model.generate_greedy(example_prompts, max_tokens)
+    except RuntimeError:
+        pytest.fail(
+            "Couldn't run batch size which is not equal to a Cuda Graph "
+            "captured batch size. "
+            "Could be related to mamba cache not padded correctly")
+
+
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["float"])
 def test_state_cleanup(
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index bf330c777..4524d8df8 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -788,12 +788,12 @@ class JambaForCausalLM(nn.Module):
             key in kwargs
             for key in ["request_ids_to_seq_ids", "finished_requests_ids"])
         request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"]
-        batch_size = len(request_ids_to_seq_ids)
+        cg_batch_size = input_buffers['input_ids'].shape[0]
         (
             current_mamba_cache,
             indices,
         ) = self._prepare_current_run_mamba_cache(request_ids_to_seq_ids,
-                                                  batch_size)
+                                                  cg_batch_size)
         self.current_indices = indices
         finished_requests_ids = kwargs["finished_requests_ids"]
         self._release_mamba_cache(finished_requests_ids)
-- 
GitLab


From 4f0e0ea131ef40654faa26fa21196031754df53a Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Mon, 8 Jul 2024 13:38:03 -0700
Subject: [PATCH 288/376] Add FlashInfer to default Dockerfile (#6172)

---
 Dockerfile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index feb004513..67198e8fd 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -167,6 +167,9 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamba \
     --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
 #################### vLLM installation IMAGE ####################
 
 
-- 
GitLab


From a3c9435d93fb7609977da5d90f839b9987c8b264 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 8 Jul 2024 20:02:15 -0700
Subject: [PATCH 289/376] [hardware][cuda] use device id under
 CUDA_VISIBLE_DEVICES for get_device_capability (#6216)

---
 vllm/platforms/cuda.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index b2ca75813..2d482010c 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -2,6 +2,7 @@
 pynvml. However, it should not initialize cuda context.
 """
 
+import os
 from functools import lru_cache, wraps
 from typing import Tuple
 
@@ -23,12 +24,27 @@ def with_nvml_context(fn):
     return wrapper
 
 
+@lru_cache(maxsize=8)
+@with_nvml_context
+def get_physical_device_capability(device_id: int = 0) -> Tuple[int, int]:
+    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
+    return pynvml.nvmlDeviceGetCudaComputeCapability(handle)
+
+
+def device_id_to_physical_device_id(device_id: int) -> int:
+    if "CUDA_VISIBLE_DEVICES" in os.environ:
+        device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
+        device_ids = [int(device_id) for device_id in device_ids]
+        physical_device_id = device_ids[device_id]
+    else:
+        physical_device_id = device_id
+    return physical_device_id
+
+
 class CudaPlatform(Platform):
     _enum = PlatformEnum.CUDA
 
     @staticmethod
-    @lru_cache(maxsize=8)
-    @with_nvml_context
     def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
-        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
-        return pynvml.nvmlDeviceGetCudaComputeCapability(handle)
+        physical_device_id = device_id_to_physical_device_id(device_id)
+        return get_physical_device_capability(physical_device_id)
-- 
GitLab


From 70c232f85a9e83421a4d9ca95e6384364271f2bc Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 8 Jul 2024 21:31:44 -0700
Subject: [PATCH 290/376] [core][distributed] fix ray worker rank assignment
 (#6235)

---
 vllm/executor/ray_gpu_executor.py | 34 +++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index bc7ef9cc7..6e13264ab 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -134,11 +134,32 @@ class RayGPUExecutor(DistributedGPUExecutor):
         worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
                                                     use_dummy_driver=True)
 
-        node_workers = defaultdict(list)
-        node_gpus = defaultdict(list)
-
-        for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
-            node_workers[node_id].append(i)
+        # the order in `worker_node_and_gpu_ids` does not necessarily match
+        # the machine boundaries. We need to make sure that workers in the
+        # same node are assigned consecutive ranks.
+        # examples:
+        # [('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [0]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [0]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [1]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [2]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [3]), ('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [1]), ('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [2]), ('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [3])] # noqa
+
+        # initialize worker ranks with -1 (unassigned)
+        worker_ranks = [-1 for x in worker_node_and_gpu_ids]
+        current_rank = 0
+        while -1 in worker_ranks:
+            # whenever we find an unassigned worker, find the node
+            index = worker_ranks.index(-1)
+            current_node_id = worker_node_and_gpu_ids[index][0]
+            # assign ranks to all workers in the same node
+            for i, (node_id, _) in enumerate(worker_node_and_gpu_ids):
+                if node_id == current_node_id:
+                    worker_ranks[i] = current_rank
+                    current_rank += 1
+        # with the above example, worker_ranks will be [0, 4, 5, 6, 7, 1, 2, 3]
+
+        node_workers = defaultdict(list)  # node id -> list of worker ranks
+        node_gpus = defaultdict(list)  # node id -> list of gpu ids
+
+        for worker_rank, (node_id, gpu_ids) in zip(worker_ranks,
+                                                   worker_node_and_gpu_ids):
+            node_workers[node_id].append(worker_rank)
             # `gpu_ids` can be a list of strings or integers.
             # convert them to integers for consistency.
             # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs),
@@ -184,7 +205,8 @@ class RayGPUExecutor(DistributedGPUExecutor):
                 local_rank=node_workers[node_id].index(rank),
                 rank=rank,
                 distributed_init_method=distributed_init_method,
-            ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
+            ) for rank, (node_id,
+                         _) in zip(worker_ranks, worker_node_and_gpu_ids)
         ]
         self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
 
-- 
GitLab


From 5d5b4c5fe524c3b62453bba7ad4434a27c81317a Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Tue, 9 Jul 2024 00:21:37 -0700
Subject: [PATCH 291/376] [Bugfix][TPU] Add missing None to model input (#6245)

---
 vllm/worker/tpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index e4a96c073..9b00a60ac 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -154,7 +154,7 @@ class TPUModelRunner:
         # Dummy run.
         num_samples = _MAX_NUM_SAMPLES if is_prompt else 1
         self.model(token_ids, position_ids, kv_caches, attn_metadata,
-                   input_lens, t, p, num_samples)
+                   input_lens, None, t, p, num_samples)
 
     def warmup_model(
         self,
-- 
GitLab


From 08c5bdecae5c5186c39a1d1ff444c3bf01f7fa0e Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Tue, 9 Jul 2024 02:56:06 -0700
Subject: [PATCH 292/376] [Bugfix][TPU] Fix outlines installation in TPU
 Dockerfile (#6256)

---
 Dockerfile.tpu | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/Dockerfile.tpu b/Dockerfile.tpu
index 931c844c0..23bb78682 100644
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
@@ -2,11 +2,8 @@ ARG NIGHTLY_DATE="20240601"
 ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
 
 FROM $BASE_IMAGE
-
 WORKDIR /workspace
-COPY . /workspace/vllm
 
-ENV VLLM_TARGET_DEVICE="tpu"
 # Install aiohttp separately to avoid build errors.
 RUN pip install aiohttp
 # Install the TPU and Pallas dependencies.
@@ -14,6 +11,13 @@ RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases
 RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
 
 # Build vLLM.
+COPY . /workspace/vllm
+ENV VLLM_TARGET_DEVICE="tpu"
 RUN cd /workspace/vllm && python setup.py develop
 
+# Re-install outlines to avoid dependency errors.
+# The outlines version must follow requirements-common.txt.
+RUN pip uninstall outlines -y
+RUN pip install "outlines>=0.0.43"
+
 CMD ["/bin/bash"]
-- 
GitLab


From a0550cbc80f504aa2da80b573c22204f686a0389 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Tue, 9 Jul 2024 12:56:56 -0700
Subject: [PATCH 293/376] Add support for multi-node on CI (#5955)

Signed-off-by: kevin <kevin@anyscale.com>
---
 .buildkite/run-multi-node-test.sh | 77 +++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100755 .buildkite/run-multi-node-test.sh

diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh
new file mode 100755
index 000000000..0d94b2555
--- /dev/null
+++ b/.buildkite/run-multi-node-test.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+
+set -euox pipefail
+
+if [[ $# -lt 3 ]]; then
+    echo "Please provide the number of nodes and GPU per node."
+    exit 1
+fi
+
+NUM_NODES=$1
+NUM_GPUS=$2
+DOCKER_IMAGE=$3
+
+shift 3
+COMMANDS=("$@")
+if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
+    echo "The number of commands must be equal to the number of nodes."
+    echo "Number of nodes: $NUM_NODES"
+    echo "Number of commands: ${#COMMANDS[@]}"
+    exit 1
+fi
+
+echo "List of commands"
+for command in "${COMMANDS[@]}"; do
+    echo $command
+done
+
+start_network() {
+    docker network create --subnet=192.168.10.0/24 docker-net
+}
+
+start_nodes() {
+    for node in $(seq 0 $(($NUM_NODES-1))); do
+        GPU_DEVICES='"device='
+        for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
+            DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
+            GPU_DEVICES+=$(($DEVICE_NUM))
+            if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
+                GPU_DEVICES+=','
+            fi
+        done
+        GPU_DEVICES+='"'
+        # echo "Starting node$node with GPU devices: $GPU_DEVICES"
+        docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE tail -f /dev/null
+    done
+}
+
+run_nodes() {
+    for node in $(seq 0 $(($NUM_NODES-1))); do
+        GPU_DEVICES='"device='
+        for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
+            DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
+            GPU_DEVICES+=$(($DEVICE_NUM))
+            if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
+                GPU_DEVICES+=','
+            fi
+        done
+        GPU_DEVICES+='"'
+        echo "Running node$node with GPU devices: $GPU_DEVICES"
+        if [ $node -lt $(($NUM_NODES - 1)) ]; then
+            docker exec -d node$node /bin/bash -c "${COMMANDS[$node]}"
+        else
+            docker exec node$node /bin/bash -c "${COMMANDS[$node]}"
+        fi
+    done
+}
+cleanup() {
+    for node in $(seq 0 $(($NUM_NODES-1))); do
+        docker stop node$node
+    done
+    docker network rm docker-net
+}
+trap cleanup EXIT
+start_network
+start_nodes
+run_nodes
+
-- 
GitLab


From 4d6ada947c7e6379b6857bc9a9a1203679d32039 Mon Sep 17 00:00:00 2001
From: Swapnil Parekh <swapnilbp100@gmail.com>
Date: Tue, 9 Jul 2024 16:26:36 -0400
Subject: [PATCH 294/376] [CORE] Adding support for insertion of soft-tuned
 prompts (#4645)

Co-authored-by: Swapnil Parekh <swapnilp@ibm.com>
Co-authored-by: Joe G <joseph.granados@h2o.ai>
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
---
 format.sh                                     |   1 +
 tests/lora/test_long_context.py               |   9 +-
 tests/lora/test_lora_manager.py               | 326 ++++++++--------
 tests/prompt_adapter/test_bloom.py            |  45 +++
 .../test_multi_adapter_inference.py           |  53 +++
 tests/prompt_adapter/test_pa_lora.py          |  61 +++
 tests/spec_decode/e2e/conftest.py             |   2 +
 tests/worker/test_model_runner.py             |   1 +
 vllm/adapter_commons/__init__.py              |   0
 vllm/adapter_commons/layers.py                |  14 +
 vllm/adapter_commons/models.py                | 104 +++++
 vllm/adapter_commons/request.py               |  25 ++
 vllm/adapter_commons/utils.py                 |  90 +++++
 vllm/adapter_commons/worker_manager.py        |  36 ++
 vllm/config.py                                |  37 ++
 vllm/core/scheduler.py                        |  12 +
 vllm/engine/arg_utils.py                      |  24 +-
 vllm/engine/async_llm_engine.py               |  38 +-
 vllm/engine/llm_engine.py                     |  65 +++-
 vllm/entrypoints/llm.py                       |  33 +-
 vllm/entrypoints/openai/api_server.py         |   5 +-
 vllm/entrypoints/openai/cli_args.py           |  21 +-
 vllm/entrypoints/openai/serving_chat.py       |   2 +-
 vllm/entrypoints/openai/serving_completion.py |  17 +-
 vllm/entrypoints/openai/serving_engine.py     |  61 ++-
 vllm/executor/cpu_executor.py                 |  15 +
 vllm/executor/executor_base.py                |  27 +-
 vllm/executor/gpu_executor.py                 |  21 ++
 vllm/executor/ray_xpu_executor.py             |   5 +-
 vllm/executor/xpu_executor.py                 |   5 +-
 vllm/lora/layers.py                           |  12 +-
 vllm/lora/models.py                           | 175 ++++-----
 vllm/lora/request.py                          |  25 +-
 vllm/lora/worker_manager.py                   | 215 ++++-------
 vllm/prompt_adapter/__init__.py               |   0
 vllm/prompt_adapter/layers.py                 |  80 ++++
 vllm/prompt_adapter/models.py                 | 355 ++++++++++++++++++
 vllm/prompt_adapter/request.py                |  30 ++
 vllm/prompt_adapter/worker_manager.py         | 176 +++++++++
 vllm/sequence.py                              |  48 ++-
 vllm/spec_decode/draft_model_runner.py        |  11 +-
 vllm/worker/cpu_model_runner.py               |   4 +-
 vllm/worker/cpu_worker.py                     |   5 +-
 vllm/worker/embedding_model_runner.py         |  11 +-
 vllm/worker/model_runner.py                   | 138 ++++++-
 vllm/worker/worker.py                         |  20 +-
 vllm/worker/xpu_model_runner.py               |   4 +-
 vllm/worker/xpu_worker.py                     |   5 +-
 48 files changed, 1951 insertions(+), 518 deletions(-)
 create mode 100644 tests/prompt_adapter/test_bloom.py
 create mode 100644 tests/prompt_adapter/test_multi_adapter_inference.py
 create mode 100644 tests/prompt_adapter/test_pa_lora.py
 create mode 100644 vllm/adapter_commons/__init__.py
 create mode 100644 vllm/adapter_commons/layers.py
 create mode 100644 vllm/adapter_commons/models.py
 create mode 100644 vllm/adapter_commons/request.py
 create mode 100644 vllm/adapter_commons/utils.py
 create mode 100644 vllm/adapter_commons/worker_manager.py
 create mode 100644 vllm/prompt_adapter/__init__.py
 create mode 100644 vllm/prompt_adapter/layers.py
 create mode 100644 vllm/prompt_adapter/models.py
 create mode 100644 vllm/prompt_adapter/request.py
 create mode 100644 vllm/prompt_adapter/worker_manager.py

diff --git a/format.sh b/format.sh
index 8c54b5630..5edc868f9 100755
--- a/format.sh
+++ b/format.sh
@@ -111,6 +111,7 @@ mypy vllm/spec_decode --config-file pyproject.toml
 mypy vllm/model_executor  --config-file pyproject.toml
 mypy vllm/lora --config-file pyproject.toml
 mypy vllm/logging --config-file pyproject.toml
+mypy vllm/prompt_adapter --config-file pyproject.toml
 mypy tests --config-file pyproject.toml
 
 
diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py
index b50784a20..853fd9fb3 100644
--- a/tests/lora/test_long_context.py
+++ b/tests/lora/test_long_context.py
@@ -92,11 +92,10 @@ def batched_generate(
     for input in inputs:
         prompt, sampling_param, lora_req = input
         # Add requests to the engine and run the engine
-        llm._validate_and_add_requests(
-            prompt,
-            sampling_param,
-            lora_request=lora_req,
-        )
+        llm._validate_and_add_requests(prompt,
+                                       sampling_param,
+                                       lora_request=lora_req,
+                                       prompt_adapter_request=None)
 
     outputs = llm._run_engine(use_tqdm=True)
     return [outputs[i].outputs[0].text.strip() for i in range(len(outputs))]
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index 2133bce14..7bff9e1fb 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -127,37 +127,37 @@ def test_lora_model_manager(dist_init, dummy_model):
         model, 2, 2, 2,
         LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2))
     assert all(x is None for x in manager.lora_index_to_id)
-    assert manager.add_lora(model_lora1)
-    assert manager.activate_lora(1)
+    assert manager.add_adapter(model_lora1)
+    assert manager.activate_adapter(1)
     assert manager.lora_index_to_id[0] == 1
-    assert not manager.add_lora(model_lora1)
-    assert not manager.activate_lora(1)
-    assert manager.add_lora(model_lora2)
-    assert manager.activate_lora(2)
+    assert not manager.add_adapter(model_lora1)
+    assert not manager.activate_adapter(1)
+    assert manager.add_adapter(model_lora2)
+    assert manager.activate_adapter(2)
     assert manager.lora_index_to_id[0] == 1
     assert manager.lora_index_to_id[1] == 2
-    assert not manager.add_lora(model_lora2)
-    assert not manager.activate_lora(2)
-    assert manager.add_lora(model_lora3)
+    assert not manager.add_adapter(model_lora2)
+    assert not manager.activate_adapter(2)
+    assert manager.add_adapter(model_lora3)
     assert manager.lora_index_to_id[0] == 1
     assert manager.lora_index_to_id[1] == 2
     with pytest.raises(ValueError):
-        assert manager.activate_lora(3)
+        assert manager.activate_adapter(3)
     assert manager.lora_index_to_id[0] == 1
     assert manager.lora_index_to_id[1] == 2
-    assert manager.remove_lora(model_lora2.id)
+    assert manager.remove_adapter(model_lora2.id)
     assert manager.lora_index_to_id[1] is None
-    assert not manager.remove_lora(model_lora2.id)
-    assert manager.remove_lora(model_lora1.id)
-    assert not manager.remove_lora(model_lora1.id)
-    assert manager.add_lora(model_lora1)
+    assert not manager.remove_adapter(model_lora2.id)
+    assert manager.remove_adapter(model_lora1.id)
+    assert not manager.remove_adapter(model_lora1.id)
+    assert manager.add_adapter(model_lora1)
     assert manager.lora_index_to_id[0] is None
     assert manager.lora_index_to_id[1] is None
-    assert manager.add_lora(model_lora2)
-    assert manager.activate_lora(3)
+    assert manager.add_adapter(model_lora2)
+    assert manager.activate_adapter(3)
     assert manager.lora_index_to_id[0] == 3
     assert manager.lora_index_to_id[1] is None
-    assert manager.activate_lora(2)
+    assert manager.activate_adapter(2)
     assert manager.lora_index_to_id[0] == 3
     assert manager.lora_index_to_id[1] == 2
 
@@ -173,70 +173,70 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model):
         model, 2, 2, 2,
         LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2))
     assert all(x is None for x in manager.lora_index_to_id)
-    assert manager.add_lora(model_lora1)
-    assert manager.activate_lora(1)
+    assert manager.add_adapter(model_lora1)
+    assert manager.activate_adapter(1)
     assert manager.lora_index_to_id[0] == 1
-    assert not manager.add_lora(model_lora1)
-    assert not manager.activate_lora(1)
-    assert manager.add_lora(model_lora2)
-    assert manager.activate_lora(2)
+    assert not manager.add_adapter(model_lora1)
+    assert not manager.activate_adapter(1)
+    assert manager.add_adapter(model_lora2)
+    assert manager.activate_adapter(2)
     assert manager.lora_index_to_id[0] == 1
     assert manager.lora_index_to_id[1] == 2
-    assert not manager.add_lora(model_lora2)
-    assert not manager.activate_lora(2)
-    assert manager.add_lora(model_lora3)
+    assert not manager.add_adapter(model_lora2)
+    assert not manager.activate_adapter(2)
+    assert manager.add_adapter(model_lora3)
     assert manager.lora_index_to_id[0] == 1
     assert manager.lora_index_to_id[1] == 2
-    assert manager.activate_lora(3)
+    assert manager.activate_adapter(3)
     assert manager.lora_index_to_id[0] == 3
     assert manager.lora_index_to_id[1] == 2
-    assert manager.remove_lora(model_lora2.id)
+    assert manager.remove_adapter(model_lora2.id)
     assert manager.lora_index_to_id[1] is None
-    assert not manager.remove_lora(model_lora2.id)
-    assert manager.remove_lora(model_lora1.id)
-    assert not manager.remove_lora(model_lora1.id)
-    assert manager.add_lora(model_lora1)
-    assert manager.activate_lora(1)
+    assert not manager.remove_adapter(model_lora2.id)
+    assert manager.remove_adapter(model_lora1.id)
+    assert not manager.remove_adapter(model_lora1.id)
+    assert manager.add_adapter(model_lora1)
+    assert manager.activate_adapter(1)
     assert manager.lora_index_to_id[0] == 3
     assert manager.lora_index_to_id[1] == 1
-    assert manager.add_lora(model_lora2)
-    assert manager.deactivate_lora(3)
+    assert manager.add_adapter(model_lora2)
+    assert manager.deactivate_adapter(3)
     assert manager.lora_index_to_id[0] is None
     assert manager.lora_index_to_id[1] == 1
-    assert manager.activate_lora(2)
+    assert manager.activate_adapter(2)
     assert manager.lora_index_to_id[0] == 2
     assert manager.lora_index_to_id[1] == 1
-    assert manager.activate_lora(3)
+    assert manager.activate_adapter(3)
     assert manager.lora_index_to_id[0] == 2
     assert manager.lora_index_to_id[1] == 3
-    assert manager.pin_lora(2)
+    assert manager.pin_adapter(2)
     assert manager.lora_index_to_id[0] == 2
     assert manager.lora_index_to_id[1] == 3
-    assert manager.activate_lora(1)
+    assert manager.activate_adapter(1)
     assert manager.lora_index_to_id[0] == 2
     assert manager.lora_index_to_id[1] == 1
-    assert manager.deactivate_lora(2)
+    assert manager.deactivate_adapter(2)
     assert manager.lora_index_to_id[0] is None
     assert manager.lora_index_to_id[1] == 1
-    assert manager.activate_lora(3)
+    assert manager.activate_adapter(3)
     assert manager.lora_index_to_id[0] == 3
     assert manager.lora_index_to_id[1] == 1
-    assert manager.pin_lora(3)
-    assert manager.pin_lora(1)
+    assert manager.pin_adapter(3)
+    assert manager.pin_adapter(1)
     with pytest.raises(RuntimeError):
-        assert manager.pin_lora(2)
+        assert manager.pin_adapter(2)
     assert manager.lora_index_to_id[0] == 3
     assert manager.lora_index_to_id[1] == 1
     with pytest.raises(RuntimeError):
-        assert manager.activate_lora(2)
+        assert manager.activate_adapter(2)
 
-    assert manager.deactivate_lora(3)
-    assert manager.pin_lora(2)
+    assert manager.deactivate_adapter(3)
+    assert manager.pin_adapter(2)
     assert manager.lora_index_to_id[0] == 2
     assert manager.lora_index_to_id[1] == 1
-    assert manager.remove_lora(3)
+    assert manager.remove_adapter(3)
     with pytest.raises(ValueError):
-        assert manager.pin_lora(3)
+        assert manager.pin_adapter(3)
 
 
 def test_lru_lora_model_manager(dist_init, dummy_model):
@@ -256,168 +256,169 @@ def test_lru_lora_model_manager(dist_init, dummy_model):
     assert all(x is None for x in manager.lora_index_to_id)
 
     # Add up to capacity
-    assert manager.add_lora(model_lora1)
-    assert manager.add_lora(model_lora2)
-    assert manager.activate_lora(1)
-    assert manager.activate_lora(2)
+    assert manager.add_adapter(model_lora1)
+    assert manager.add_adapter(model_lora2)
+    assert manager.activate_adapter(1)
+    assert manager.activate_adapter(2)
 
-    assert set(manager.list_loras()) == {1, 2}
+    assert set(manager.list_adapters()) == {1, 2}
     assert manager.lora_index_to_id[0] == 1
     assert manager.lora_index_to_id[1] == 2
 
     # Add over capacity
-    assert manager.add_lora(model_lora3)
-    assert manager.add_lora(model_lora4)
-    assert manager.activate_lora(3)
-    assert manager.activate_lora(4)
+    assert manager.add_adapter(model_lora3)
+    assert manager.add_adapter(model_lora4)
+    assert manager.activate_adapter(3)
+    assert manager.activate_adapter(4)
 
-    assert set(manager.list_loras()) == {3, 4}
+    assert set(manager.list_adapters()) == {3, 4}
     assert manager.lora_index_to_id[0] == 3
     assert manager.lora_index_to_id[1] == 4
 
     # Add 3 again to move it to the top and then add 2
     # should return false since it's in already
-    assert not manager.add_lora(model_lora3)
-    assert not manager.activate_lora(3)
-    assert manager.add_lora(model_lora2)
-    assert manager.activate_lora(2)
+    assert not manager.add_adapter(model_lora3)
+    assert not manager.activate_adapter(3)
+    assert manager.add_adapter(model_lora2)
+    assert manager.activate_adapter(2)
 
-    assert set(manager.list_loras()) == {3, 2}
+    assert set(manager.list_adapters()) == {3, 2}
     assert manager.lora_index_to_id[0] == 3
     assert manager.lora_index_to_id[1] == 2
 
     # Remove manually
-    assert manager.remove_lora(3)
-    assert not manager.remove_lora(3)
+    assert manager.remove_adapter(3)
+    assert not manager.remove_adapter(3)
 
-    assert set(manager.list_loras()) == {2}
+    assert set(manager.list_adapters()) == {2}
     assert manager.lora_index_to_id[0] is None
     assert manager.lora_index_to_id[1] == 2
 
-    assert manager.add_lora(model_lora3)
-    assert manager.activate_lora(3)
-    assert manager.add_lora(model_lora4)
-    assert manager.activate_lora(4)
+    assert manager.add_adapter(model_lora3)
+    assert manager.activate_adapter(3)
+    assert manager.add_adapter(model_lora4)
+    assert manager.activate_adapter(4)
 
-    assert set(manager.list_loras()) == {3, 4}
+    assert set(manager.list_adapters()) == {3, 4}
     assert manager.lora_index_to_id[0] == 3
     assert manager.lora_index_to_id[1] == 4
 
-    assert manager.remove_oldest_lora()
-    assert set(manager.list_loras()) == {4}
+    assert manager.remove_oldest_adapter()
+    assert set(manager.list_adapters()) == {4}
     assert manager.lora_index_to_id[0] is None
     assert manager.lora_index_to_id[1] == 4
 
-    assert manager.remove_oldest_lora()
-    assert set(manager.list_loras()) == set()
+    assert manager.remove_oldest_adapter()
+    assert set(manager.list_adapters()) == set()
     assert all(x is None for x in manager.lora_index_to_id)
 
-    assert not manager.remove_oldest_lora()
-    assert set(manager.list_loras()) == set()
+    assert not manager.remove_oldest_adapter()
+    assert set(manager.list_adapters()) == set()
     assert all(x is None for x in manager.lora_index_to_id)
 
     # pinning
-    assert manager.add_lora(model_lora3)
-    assert manager.activate_lora(3)
-    assert manager.add_lora(model_lora4)
-    assert manager.activate_lora(4)
-    assert set(manager.list_loras()) == {3, 4}
+    assert manager.add_adapter(model_lora3)
+    assert manager.activate_adapter(3)
+    assert manager.add_adapter(model_lora4)
+    assert manager.activate_adapter(4)
+    assert set(manager.list_adapters()) == {3, 4}
     with pytest.raises(ValueError):
-        assert manager.pin_lora(1)
-    assert manager.pin_lora(3)
+        assert manager.pin_adapter(1)
+    assert manager.pin_adapter(3)
     # Remove manually
-    assert manager.remove_lora(3)
-    assert not manager.remove_lora(3)
+    assert manager.remove_adapter(3)
+    assert not manager.remove_adapter(3)
 
-    assert set(manager.list_loras()) == {4}
+    assert set(manager.list_adapters()) == {4}
     assert manager.lora_index_to_id[0] is None
     assert manager.lora_index_to_id[1] == 4
 
-    assert manager.add_lora(model_lora1)
-    assert manager.pin_lora(1)
-    assert manager.add_lora(model_lora2)
-    assert manager.activate_lora(2)
+    assert manager.add_adapter(model_lora1)
+    assert manager.pin_adapter(1)
+    assert manager.add_adapter(model_lora2)
+    assert manager.activate_adapter(2)
 
-    assert set(manager.list_loras()) == {1, 2}
+    assert set(manager.list_adapters()) == {1, 2}
     assert manager.lora_index_to_id[0] == 1
     assert manager.lora_index_to_id[1] == 2
 
-    assert manager.remove_oldest_lora()
-    assert set(manager.list_loras()) == {1}
+    assert manager.remove_oldest_adapter()
+    assert set(manager.list_adapters()) == {1}
     assert manager.lora_index_to_id[0] == 1
     assert manager.lora_index_to_id[1] is None
 
     with pytest.raises(RuntimeError):
-        assert manager.remove_oldest_lora()
+        assert manager.remove_oldest_adapter()
 
-    assert set(manager.list_loras()) == {1}
+    assert set(manager.list_adapters()) == {1}
 
 
-def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings,
-                                       sql_lora_files):
+def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
+                                          sql_lora_files):
     lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
-    worker_lora_manager = LRUCacheWorkerLoRAManager(
+    worker_adapter_manager = LRUCacheWorkerLoRAManager(
         4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size -
         lora_config.lora_extra_vocab_size, lora_config, torch.device("cuda"),
         EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
-    worker_lora_manager.create_lora_manager(llama_2_7b_model_extra_embeddings)
+    worker_adapter_manager.create_lora_manager(
+        llama_2_7b_model_extra_embeddings)
 
     mapping = LoRAMapping([], [])
-    worker_lora_manager.set_active_loras([
+    worker_adapter_manager.set_active_adapters([
         LoRARequest("1", 1, sql_lora_files),
         LoRARequest("2", 2, sql_lora_files)
     ], mapping)
-    assert worker_lora_manager.list_loras() == {1, 2}
-    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
-    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
+    assert worker_adapter_manager.list_adapters() == {1, 2}
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
 
-    worker_lora_manager.set_active_loras([
+    worker_adapter_manager.set_active_adapters([
         LoRARequest("1", 1, sql_lora_files),
         LoRARequest("3", 3, sql_lora_files),
         LoRARequest("4", 4, sql_lora_files)
     ], mapping)
-    assert worker_lora_manager.list_loras() == {1, 2, 3, 4}
-    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
-    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
-    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 3
-    assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 4
+    assert worker_adapter_manager.list_adapters() == {1, 2, 3, 4}
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 3
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4
 
-    worker_lora_manager.set_active_loras([
+    worker_adapter_manager.set_active_adapters([
         LoRARequest("1", 1, sql_lora_files),
         LoRARequest("2", 2, sql_lora_files),
         LoRARequest("5", 5, sql_lora_files)
     ], mapping)
-    assert worker_lora_manager.list_loras() == {1, 2, 4, 5}
-    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
-    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
-    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 5
-    assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 4
+    assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5}
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4
 
-    worker_lora_manager.set_active_loras([
+    worker_adapter_manager.set_active_adapters([
         LoRARequest("1", 1, sql_lora_files),
         LoRARequest("1", 1, sql_lora_files),
         LoRARequest("1", 1, sql_lora_files)
     ], mapping)
-    assert worker_lora_manager.list_loras() == {1, 2, 4, 5}
-    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
-    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
-    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 5
-    assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 4
+    assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5}
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4
 
-    worker_lora_manager.set_active_loras([
+    worker_adapter_manager.set_active_adapters([
         LoRARequest("6", 6, sql_lora_files),
         LoRARequest("7", 7, sql_lora_files),
         LoRARequest("8", 8, sql_lora_files)
     ], mapping)
-    assert worker_lora_manager.list_loras() == {1, 6, 7, 8}
-    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
-    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 7
-    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 8
-    assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 6
+    assert worker_adapter_manager.list_adapters() == {1, 6, 7, 8}
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 7
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 8
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 6
 
     # Over capacity
     with pytest.raises(RuntimeError):
-        worker_lora_manager.set_active_loras([
+        worker_adapter_manager.set_active_adapters([
             LoRARequest("10", 10, sql_lora_files),
             LoRARequest("11", 11, sql_lora_files),
             LoRARequest("12", 12, sql_lora_files),
@@ -426,68 +427,69 @@ def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings,
         ], mapping)
 
 
-def test_worker_lora_manager(llama_2_7b_model_extra_embeddings,
-                             sql_lora_files):
+def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
+                                sql_lora_files):
     # Should remove every LoRA not specified in the request.
     lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
-    worker_lora_manager = WorkerLoRAManager(
+    worker_adapter_manager = WorkerLoRAManager(
         4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size -
         lora_config.lora_extra_vocab_size, lora_config, torch.device("cuda"),
         EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
-    worker_lora_manager.create_lora_manager(llama_2_7b_model_extra_embeddings)
+    worker_adapter_manager.create_lora_manager(
+        llama_2_7b_model_extra_embeddings)
 
     mapping = LoRAMapping([], [])
-    worker_lora_manager.set_active_loras([
+    worker_adapter_manager.set_active_adapters([
         LoRARequest("1", 1, sql_lora_files),
         LoRARequest("2", 2, sql_lora_files)
     ], mapping)
-    assert worker_lora_manager.list_loras() == {1, 2}
-    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
-    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
+    assert worker_adapter_manager.list_adapters() == {1, 2}
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
 
-    worker_lora_manager.set_active_loras([
+    worker_adapter_manager.set_active_adapters([
         LoRARequest("1", 1, sql_lora_files),
         LoRARequest("3", 3, sql_lora_files),
         LoRARequest("4", 4, sql_lora_files)
     ], mapping)
-    assert worker_lora_manager.list_loras() == {1, 3, 4}
-    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
-    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 3
-    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 4
+    assert worker_adapter_manager.list_adapters() == {1, 3, 4}
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 3
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 4
 
-    worker_lora_manager.set_active_loras([
+    worker_adapter_manager.set_active_adapters([
         LoRARequest("1", 1, sql_lora_files),
         LoRARequest("2", 2, sql_lora_files),
         LoRARequest("5", 5, sql_lora_files)
     ], mapping)
-    assert worker_lora_manager.list_loras() == {1, 2, 5}
-    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
-    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
-    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 5
+    assert worker_adapter_manager.list_adapters() == {1, 2, 5}
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5
 
-    worker_lora_manager.set_active_loras([
+    worker_adapter_manager.set_active_adapters([
         LoRARequest("1", 1, sql_lora_files),
         LoRARequest("1", 1, sql_lora_files),
         LoRARequest("1", 1, sql_lora_files)
     ], mapping)
-    assert worker_lora_manager.list_loras() == {1}
-    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
-    assert worker_lora_manager._lora_manager.lora_index_to_id[1] is None
-    assert worker_lora_manager._lora_manager.lora_index_to_id[2] is None
+    assert worker_adapter_manager.list_adapters() == {1}
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] is None
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] is None
 
-    worker_lora_manager.set_active_loras([
+    worker_adapter_manager.set_active_adapters([
         LoRARequest("6", 6, sql_lora_files),
         LoRARequest("7", 7, sql_lora_files),
         LoRARequest("8", 8, sql_lora_files)
     ], mapping)
-    assert worker_lora_manager.list_loras() == {6, 7, 8}
-    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 8
-    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 6
-    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 7
+    assert worker_adapter_manager.list_adapters() == {6, 7, 8}
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 8
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 6
+    assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 7
 
     # Over capacity
     with pytest.raises(RuntimeError):
-        worker_lora_manager.set_active_loras([
+        worker_adapter_manager.set_active_adapters([
             LoRARequest("10", 10, sql_lora_files),
             LoRARequest("11", 11, sql_lora_files),
             LoRARequest("12", 12, sql_lora_files),
@@ -525,8 +527,8 @@ def test_packed_loras(dist_init, dummy_model_gate_up):
 
     assert isinstance(model.get_submodule("gate_up_proj"),
                       MergedColumnParallelLinearWithLoRA)
-    assert manager.add_lora(model_lora)
-    assert manager.add_lora(model_lora1)
+    assert manager.add_adapter(model_lora)
+    assert manager.add_adapter(model_lora1)
 
     packed_lora = model_lora.get_lora("gate_up_proj")
     assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights)
diff --git a/tests/prompt_adapter/test_bloom.py b/tests/prompt_adapter/test_bloom.py
new file mode 100644
index 000000000..6528b3009
--- /dev/null
+++ b/tests/prompt_adapter/test_bloom.py
@@ -0,0 +1,45 @@
+import pytest
+
+import vllm
+from vllm.prompt_adapter.request import PromptAdapterRequest
+
+MODEL_PATH = "bigscience/bloomz-560m"
+PA_PATH = 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM'
+
+
+def do_sample(llm, pa_name: str, pa_id: int):
+
+    prompts = [
+        "Tweet text : @nationalgridus I have no water and the bill is \
+        current and paid. Can you do something about this? Label : ",
+        "Tweet text : @nationalgridus Looks good thanks! Label : "
+    ]
+    sampling_params = vllm.SamplingParams(temperature=0.0,
+                                          max_tokens=3,
+                                          stop_token_ids=[3])
+
+    outputs = llm.generate(prompts,
+                           sampling_params,
+                           prompt_adapter_request=PromptAdapterRequest(
+                               pa_name, pa_id, PA_PATH, 8) if pa_id else None)
+
+    # Print the outputs.
+    generated_texts = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text.strip()
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    return generated_texts
+
+
+@pytest.mark.parametrize("enforce_eager", [True, False])
+def test_twitter_prompt_adapter(enforce_eager: bool):
+    llm = vllm.LLM(MODEL_PATH,
+                   enforce_eager=enforce_eager,
+                   enable_prompt_adapter=True,
+                   max_prompt_adapter_token=8)
+
+    expected_output = ['complaint', 'no complaint']
+
+    assert do_sample(llm, "twitter_pa", pa_id=1) == expected_output
diff --git a/tests/prompt_adapter/test_multi_adapter_inference.py b/tests/prompt_adapter/test_multi_adapter_inference.py
new file mode 100644
index 000000000..39a79becd
--- /dev/null
+++ b/tests/prompt_adapter/test_multi_adapter_inference.py
@@ -0,0 +1,53 @@
+from vllm import EngineArgs, LLMEngine, SamplingParams
+from vllm.prompt_adapter.request import PromptAdapterRequest
+
+MODEL_PATH = "bigscience/bloomz-560m"
+pa_path = 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM'
+pa_path2 = 'swapnilbp/angry_tweet_ptune'
+
+
+def do_sample(engine):
+
+    prompts = [
+        ("Tweet text: I have complaints! Label: ",
+         SamplingParams(temperature=0.0, max_tokens=3, stop_token_ids=[3]),
+         PromptAdapterRequest("hate_speech", 1, pa_path2, 8)),
+        ("Tweet text: I have no problems Label: ",
+         SamplingParams(temperature=0.0, max_tokens=3, stop_token_ids=[3]),
+         PromptAdapterRequest("hate_speech2", 2, pa_path2, 8)),
+        ("Tweet text: I have complaints! Label: ",
+         SamplingParams(temperature=0.0, max_tokens=3), None),
+        ("Tweet text: I have no problems Label: ",
+         SamplingParams(temperature=0.0, max_tokens=3, stop_token_ids=[3]),
+         PromptAdapterRequest("complain", 3, pa_path, 8)),
+    ]
+
+    request_id = 0
+    results = set()
+    while prompts or engine.has_unfinished_requests():
+        if prompts:
+            prompt, sampling_params, pa_request = prompts.pop(0)
+            engine.add_request(str(request_id),
+                               prompt,
+                               sampling_params,
+                               prompt_adapter_request=pa_request)
+            request_id += 1
+
+        request_outputs = engine.step()
+
+        for request_output in request_outputs:
+            if request_output.finished:
+                results.add(request_output.outputs[0].text)
+    return results
+
+
+def test_multi_prompt_adapters():
+    engine_args = EngineArgs(model=MODEL_PATH,
+                             max_prompt_adapters=3,
+                             enable_prompt_adapter=True,
+                             max_prompt_adapter_token=8)
+    engine = LLMEngine.from_engine_args(engine_args)
+    expected_output = {
+        ' quot;I', 'hate speech', 'no complaint', 'not hate speech'
+    }
+    assert do_sample(engine) == expected_output
diff --git a/tests/prompt_adapter/test_pa_lora.py b/tests/prompt_adapter/test_pa_lora.py
new file mode 100644
index 000000000..2a5f23f7f
--- /dev/null
+++ b/tests/prompt_adapter/test_pa_lora.py
@@ -0,0 +1,61 @@
+from huggingface_hub import snapshot_download
+
+from vllm import EngineArgs, LLMEngine, SamplingParams
+from vllm.lora.request import LoRARequest
+from vllm.prompt_adapter.request import PromptAdapterRequest
+
+MODEL_PATH = "meta-llama/Llama-2-7b-hf"
+pa_path = snapshot_download(repo_id="swapnilbp/llama_tweet_ptune")
+lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
+
+
+def do_sample(engine):
+
+    prompt_text = "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]"  # noqa: E501
+
+    # first prompt with a prompt adapter and second without adapter
+    prompts = [
+        (prompt_text,
+         SamplingParams(temperature=0.0, max_tokens=100,
+                        stop=["[/assistant]"]),
+         PromptAdapterRequest("hate_speech", 1, pa_path,
+                              8), LoRARequest("sql_test", 1, lora_path)),
+        (prompt_text,
+         SamplingParams(temperature=0.0, max_tokens=100,
+                        stop=["[/assistant]"]), None,
+         LoRARequest("sql_test", 1, lora_path)),
+    ]
+
+    request_id = 0
+    results = set()
+    while prompts or engine.has_unfinished_requests():
+        if prompts:
+            prompt, sampling_params, pa_request, lora_request = prompts.pop(0)
+            engine.add_request(str(request_id),
+                               prompt,
+                               sampling_params,
+                               prompt_adapter_request=pa_request,
+                               lora_request=lora_request)
+            request_id += 1
+
+        request_outputs = engine.step()
+
+        for request_output in request_outputs:
+            if request_output.finished:
+                results.add(request_output.outputs[0].text)
+    return results
+
+
+def test_lora_prompt_adapter():
+    engine_args = EngineArgs(model=MODEL_PATH,
+                             enable_prompt_adapter=True,
+                             enable_lora=True,
+                             max_num_seqs=60,
+                             max_prompt_adapter_token=8)
+    engine = LLMEngine.from_engine_args(engine_args)
+    result = do_sample(engine)
+
+    expected_output = {
+        "  SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' "  # noqa: E501
+    }
+    assert result == expected_output
diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index 8ad8e9cb8..fb3415b5d 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -13,6 +13,7 @@ from vllm.lora.request import LoRARequest
 from vllm.model_executor.utils import set_random_seed
 from vllm.multimodal import MultiModalDataDict
 from vllm.outputs import RequestOutput
+from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import Logprob
 from vllm.usage.usage_lib import UsageContext
@@ -92,6 +93,7 @@ class AsyncLLM:
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
         multi_modal_data: Optional[MultiModalDataDict] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None
     ) -> List[RequestOutput]:
 
         if prompts is None:
diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py
index e1775790c..b5742c433 100644
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -23,6 +23,7 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:
         cache_config=engine_config.cache_config,
         load_config=engine_config.load_config,
         lora_config=engine_config.lora_config,
+        prompt_adapter_config=engine_config.prompt_adapter_config,
         is_driver_worker=True,
     )
     return model_runner
diff --git a/vllm/adapter_commons/__init__.py b/vllm/adapter_commons/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vllm/adapter_commons/layers.py b/vllm/adapter_commons/layers.py
new file mode 100644
index 000000000..3ed60678b
--- /dev/null
+++ b/vllm/adapter_commons/layers.py
@@ -0,0 +1,14 @@
+from dataclasses import dataclass
+from typing import Tuple
+
+
+@dataclass
+class AdapterMapping:
+    # Per every token in input_ids:
+    index_mapping: Tuple[int, ...]
+    # Per sampled token:
+    prompt_mapping: Tuple[int, ...]
+
+    def __post_init__(self):
+        self.index_mapping = tuple(self.index_mapping)
+        self.prompt_mapping = tuple(self.prompt_mapping)
\ No newline at end of file
diff --git a/vllm/adapter_commons/models.py b/vllm/adapter_commons/models.py
new file mode 100644
index 000000000..6939b1405
--- /dev/null
+++ b/vllm/adapter_commons/models.py
@@ -0,0 +1,104 @@
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, Hashable, Optional, TypeVar
+
+from torch import nn
+
+from vllm.logger import init_logger
+from vllm.utils import LRUCache
+
+logger = init_logger(__name__)
+
+
+class AdapterModel(ABC):
+
+    def __init__(self, model_id=None):
+        self.id = model_id
+
+    @abstractmethod
+    def from_local_checkpoint(cls, model_dir, model_id=None, **kwargs):
+        # Common initialization code
+        # Load weights or embeddings from local checkpoint
+        raise NotImplementedError("Subclasses must implement this method.")
+
+
+T = TypeVar('T')
+
+
+class AdapterLRUCache(LRUCache[T]):
+
+    def __init__(self, capacity: int, deactivate_fn: Callable[[Hashable],
+                                                              None]):
+        super().__init__(capacity)
+        self.deactivate_fn = deactivate_fn
+
+    def _on_remove(self, key: Hashable, value: T):
+        logger.debug("Removing adapter int id: %d", key)
+        self.deactivate_fn(key)
+        return super()._on_remove(key, value)
+
+
+class AdapterModelManager(ABC):
+
+    def __init__(
+        self,
+        model: nn.Module,
+    ):
+        """Create a AdapterModelManager and adapter for a given model.
+        Args:
+            model: the model to be adapted.
+        """
+        self.model: nn.Module = model
+        self._registered_adapters: Dict[int, Any] = {}
+        # Dict instead of a Set for compatibility with LRUCache.
+        self._active_adapters: Dict[int, None] = {}
+        self.adapter_type = 'Adapter'
+        self._last_mapping = None
+
+    def __len__(self) -> int:
+        return len(self._registered_adapters)
+
+    @property
+    @abstractmethod
+    def adapter_slots(self):
+        ...
+
+    @property
+    @abstractmethod
+    def capacity(self):
+        ...
+
+    @abstractmethod
+    def activate_adapter(self, adapter_id: int) -> bool:
+        ...
+
+    @abstractmethod
+    def deactivate_adapter(self, adapter_id: int) -> bool:
+        ...
+
+    @abstractmethod
+    def add_adapter(self, adapter: Any) -> bool:
+        ...
+
+    @abstractmethod
+    def set_adapter_mapping(self, mapping: Any) -> None:
+        ...
+
+    @abstractmethod
+    def remove_adapter(self, adapter_id: int) -> bool:
+        ...
+
+    @abstractmethod
+    def remove_all_adapters(self):
+        ...
+
+    @abstractmethod
+    def get_adapter(self, adapter_id: int) -> Optional[Any]:
+        ...
+
+    @abstractmethod
+    def list_adapters(self) -> Dict[int, Any]:
+        ...
+
+    @abstractmethod
+    def pin_adapter(self, adapter_id: int) -> bool:
+        ...
diff --git a/vllm/adapter_commons/request.py b/vllm/adapter_commons/request.py
new file mode 100644
index 000000000..69775ab7d
--- /dev/null
+++ b/vllm/adapter_commons/request.py
@@ -0,0 +1,25 @@
+from abc import abstractmethod
+from dataclasses import dataclass
+
+
+@dataclass
+class AdapterRequest:
+    """
+    Base class for adapter requests.
+    """
+
+    @property
+    @abstractmethod
+    def adapter_id(self):
+        ...
+
+    def __post_init__(self):
+        if self.adapter_id < 1:
+            raise ValueError(f"id must be > 0, got {self.adapter_id}")
+
+    def __eq__(self, value: object) -> bool:
+        return isinstance(
+            value, self.__class__) and self.adapter_id == value.adapter_id
+
+    def __hash__(self) -> int:
+        return hash(self.adapter_id)
diff --git a/vllm/adapter_commons/utils.py b/vllm/adapter_commons/utils.py
new file mode 100644
index 000000000..6c5411f7d
--- /dev/null
+++ b/vllm/adapter_commons/utils.py
@@ -0,0 +1,90 @@
+from typing import Any, Callable, Dict, Optional, Set
+
+
+## model functions
+def deactivate_adapter(adapter_id: int, active_adapters: Dict[int, None],
+                       deactivate_func: Callable) -> bool:
+    if adapter_id in active_adapters:
+        deactivate_func(adapter_id)
+        active_adapters.pop(adapter_id)
+        return True
+    return False
+
+
+def add_adapter(adapter: Any, registered_adapters: Dict[int, Any],
+                capacity: int, add_func: Callable) -> bool:
+    if adapter.id not in registered_adapters:
+        if len(registered_adapters) >= capacity:
+            raise RuntimeError('No free adapter slots.')
+        add_func(adapter)
+        registered_adapters[adapter.id] = adapter
+        return True
+    return False
+
+
+def set_adapter_mapping(mapping: Any, last_mapping: Any,
+                        set_mapping_func: Callable) -> Any:
+    if last_mapping != mapping:
+        set_mapping_func(mapping)
+        return mapping
+    return last_mapping
+
+
+def remove_adapter(adapter_id: int, registered_adapters: Dict[int, Any],
+                   deactivate_func: Callable) -> bool:
+    deactivate_func(adapter_id)
+    return bool(registered_adapters.pop(adapter_id, None))
+
+
+def list_adapters(registered_adapters: Dict[int, Any]) -> Dict[int, Any]:
+    return dict(registered_adapters)
+
+
+def get_adapter(adapter_id: int,
+                registered_adapters: Dict[int, Any]) -> Optional[Any]:
+    return registered_adapters.get(adapter_id, None)
+
+
+## worker functions
+def set_active_adapters_worker(requests: Set[Any], mapping: Optional[Any],
+                               apply_adapters_func,
+                               set_adapter_mapping_func) -> None:
+    apply_adapters_func(requests)
+    set_adapter_mapping_func(mapping)
+
+
+def add_adapter_worker(adapter_request: Any, list_adapters_func,
+                       load_adapter_func, add_adapter_func,
+                       activate_adapter_func) -> bool:
+    if adapter_request.adapter_id in list_adapters_func():
+        return False
+    loaded_adapter = load_adapter_func(adapter_request)
+    loaded = add_adapter_func(loaded_adapter)
+    activate_adapter_func(loaded_adapter.id)
+    return loaded
+
+
+def apply_adapters_worker(adapter_requests: Set[Any], list_adapters_func,
+                          adapter_slots: int, remove_adapter_func,
+                          add_adapter_func) -> None:
+    models_that_exist = list_adapters_func()
+    models_map = {
+        adapter_request.adapter_id: adapter_request
+        for adapter_request in adapter_requests if adapter_request
+    }
+    if len(models_map) > adapter_slots:
+        raise RuntimeError(
+            f"Number of requested models ({len(models_map)}) is greater "
+            f"than the number of GPU model slots "
+            f"({adapter_slots}).")
+    new_models = set(models_map)
+    models_to_add = new_models - models_that_exist
+    models_to_remove = models_that_exist - new_models
+    for adapter_id in models_to_remove:
+        remove_adapter_func(adapter_id)
+    for adapter_id in models_to_add:
+        add_adapter_func(models_map[adapter_id])
+
+
+def list_adapters_worker(adapter_manager_list_adapters_func) -> Set[int]:
+    return set(adapter_manager_list_adapters_func())
diff --git a/vllm/adapter_commons/worker_manager.py b/vllm/adapter_commons/worker_manager.py
new file mode 100644
index 000000000..acf18993a
--- /dev/null
+++ b/vllm/adapter_commons/worker_manager.py
@@ -0,0 +1,36 @@
+from abc import ABC, abstractmethod
+from typing import Any, Optional, Set
+
+import torch
+
+
+class AbstractWorkerManager(ABC):
+
+    def __init__(self, device: torch.device):
+        self.device = device
+
+    @property
+    @abstractmethod
+    def is_enabled(self) -> bool:
+        ...
+
+    @abstractmethod
+    def set_active_adapters(self, requests: Set[Any],
+                            mapping: Optional[Any]) -> None:
+        ...
+
+    @abstractmethod
+    def add_adapter(self, adapter_request: Any) -> bool:
+        ...
+
+    @abstractmethod
+    def remove_adapter(self, adapter_id: int) -> bool:
+        ...
+
+    @abstractmethod
+    def remove_all_adapters(self):
+        ...
+
+    @abstractmethod
+    def list_adapters(self) -> Set[int]:
+        ...
diff --git a/vllm/config.py b/vllm/config.py
index 1ea288879..68ca81a2e 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1285,6 +1285,39 @@ class LoRAConfig:
             raise ValueError("LoRA is not supported with chunked prefill yet.")
 
 
+@dataclass
+class PromptAdapterConfig:
+    max_prompt_adapters: int
+    max_prompt_adapter_token: int
+    max_cpu_prompt_adapters: Optional[int] = None
+    prompt_adapter_dtype: Optional[torch.dtype] = None
+
+    def __post_init__(self):
+        library_name = 'peft'
+        try:
+            __import__(library_name)
+        except ImportError as e:
+            raise ImportError(
+                f"'{library_name}' is not installed for prompt adapter support."
+                f"Please install it using 'pip install {library_name}'."
+            ) from e
+
+        if self.max_prompt_adapters < 1:
+            raise ValueError(f"max_prompt_adapters "
+                             f"({self.max_prompt_adapters}) must be >= 1.")
+        if self.max_prompt_adapter_token == 0:
+            raise ValueError("max_prompt_adapter_token must be set.")
+        if self.max_cpu_prompt_adapters is None:
+            self.max_cpu_prompt_adapters = self.max_prompt_adapters
+
+    def verify_with_model_config(self, model_config: ModelConfig):
+        if self.prompt_adapter_dtype in (None, "auto"):
+            self.prompt_adapter_dtype = model_config.dtype
+        elif isinstance(self.prompt_adapter_dtype, str):
+            self.prompt_adapter_dtype = getattr(torch,
+                                                self.prompt_adapter_dtype)
+
+
 @dataclass
 class MultiModalConfig:
     """Configs the input data format and how models should run for
@@ -1518,6 +1551,7 @@ class EngineConfig:
     speculative_config: Optional[SpeculativeConfig]
     decoding_config: Optional[DecodingConfig]
     observability_config: Optional[ObservabilityConfig]
+    prompt_adapter_config: Optional[PromptAdapterConfig]
 
     def __post_init__(self):
         """Verify configs are valid & consistent with each other.
@@ -1529,6 +1563,9 @@ class EngineConfig:
             self.lora_config.verify_with_model_config(self.model_config)
             self.lora_config.verify_with_scheduler_config(
                 self.scheduler_config)
+        if self.prompt_adapter_config:
+            self.prompt_adapter_config.verify_with_model_config(
+                self.model_config)
 
     def to_dict(self):
         """Return the configs as a dictionary, for use in **kwargs.
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 9e626b288..6bda18cd4 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -11,6 +11,7 @@ from vllm.core.interfaces import AllocStatus, BlockSpaceManager
 from vllm.core.policy import Policy, PolicyFactory
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
+from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
                            SequenceGroupMetadata, SequenceStatus)
 
@@ -139,6 +140,8 @@ class SchedulerOutputs:
         if self.num_loras > 0:
             self._sort_by_lora_ids()
 
+        self.num_prompt_adapters: int = len(self.prompt_adapter_requests)
+
     def is_empty(self) -> bool:
         # NOTE: We do not consider the ignored sequence groups.
         return (not self.scheduled_seq_groups and not self.blocks_to_swap_in
@@ -157,6 +160,14 @@ class SchedulerOutputs:
             if g.seq_group.lora_request is not None
         }
 
+    @property
+    def prompt_adapter_requests(self) -> Set[PromptAdapterRequest]:
+        return {
+            g.seq_group.prompt_adapter_request
+            for g in self.scheduled_seq_groups
+            if g.seq_group.prompt_adapter_request is not None
+        }
+
 
 @dataclass
 class SchedulerRunningOutputs:
@@ -1024,6 +1035,7 @@ class Scheduler:
                 # `multi_modal_data` will be None.
                 multi_modal_data=seq_group.multi_modal_data
                 if scheduler_outputs.num_prefill_groups > 0 else None,
+                prompt_adapter_request=seq_group.prompt_adapter_request,
             )
             seq_group_metadata_list.append(seq_group_metadata)
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index afa6892d4..b972573c0 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -7,8 +7,8 @@ from typing import List, Optional, Tuple, Union
 from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
                          EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
                          MultiModalConfig, ObservabilityConfig, ParallelConfig,
-                         SchedulerConfig, SpeculativeConfig,
-                         TokenizerPoolConfig)
+                         PromptAdapterConfig, SchedulerConfig,
+                         SpeculativeConfig, TokenizerPoolConfig)
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import FlexibleArgumentParser
 
@@ -66,6 +66,9 @@ class EngineArgs:
     enable_lora: bool = False
     max_loras: int = 1
     max_lora_rank: int = 16
+    enable_prompt_adapter: bool = False
+    max_prompt_adapters: int = 1
+    max_prompt_adapter_token: int = 0
     fully_sharded_loras: bool = False
     lora_extra_vocab_size: int = 256
     long_lora_scaling_factors: Optional[Tuple[float]] = None
@@ -449,6 +452,17 @@ class EngineArgs:
                   'Enabling this will use the fully sharded layers. '
                   'At high sequence length, max rank or '
                   'tensor parallel size, this is likely faster.'))
+        parser.add_argument('--enable-prompt-adapter',
+                            action='store_true',
+                            help='If True, enable handling of PromptAdapters.')
+        parser.add_argument('--max-prompt-adapters',
+                            type=int,
+                            default=EngineArgs.max_prompt_adapters,
+                            help='Max number of PromptAdapters in a batch.')
+        parser.add_argument('--max-prompt-adapter-token',
+                            type=int,
+                            default=EngineArgs.max_prompt_adapter_token,
+                            help='Max number of PromptAdapters tokens')
         parser.add_argument("--device",
                             type=str,
                             default=EngineArgs.device,
@@ -726,6 +740,11 @@ class EngineArgs:
             model_loader_extra_config=self.model_loader_extra_config,
         )
 
+        prompt_adapter_config = PromptAdapterConfig(
+            max_prompt_adapters=self.max_prompt_adapters,
+            max_prompt_adapter_token=self.max_prompt_adapter_token) \
+                                        if self.enable_prompt_adapter else None
+
         decoding_config = DecodingConfig(
             guided_decoding_backend=self.guided_decoding_backend)
 
@@ -751,6 +770,7 @@ class EngineArgs:
             load_config=load_config,
             decoding_config=decoding_config,
             observability_config=observability_config,
+            prompt_adapter_config=prompt_adapter_config,
         )
 
 
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 33e40c7b3..9b4ef48b0 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -18,6 +18,7 @@ from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.outputs import EmbeddingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
+from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.usage.usage_lib import UsageContext
@@ -264,6 +265,7 @@ class _AsyncLLMEngine(LLMEngine):
         request_id: str,
         inputs: PromptInputs,
         lora_request: Optional[LoRARequest] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> LLMInputs:
         if isinstance(inputs, str):
             inputs = {"prompt": inputs}
@@ -279,6 +281,12 @@ class _AsyncLLMEngine(LLMEngine):
         else:
             prompt_token_ids = inputs["prompt_token_ids"]
 
+        if prompt_adapter_request:
+            prompt_token_ids = [
+                0
+            ] * prompt_adapter_request.prompt_adapter_num_virtual_tokens + \
+                prompt_token_ids
+
         llm_inputs = LLMInputs(prompt_token_ids=prompt_token_ids,
                                prompt=inputs.get("prompt"),
                                multi_modal_data=inputs.get("multi_modal_data"))
@@ -286,13 +294,14 @@ class _AsyncLLMEngine(LLMEngine):
         return self.input_processor(llm_inputs)
 
     async def add_request_async(
-        self,
-        request_id: str,
-        inputs: PromptInputs,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Dict[str, str]] = None,
+            self,
+            request_id: str,
+            inputs: PromptInputs,
+            params: Union[SamplingParams, PoolingParams],
+            arrival_time: Optional[float] = None,
+            lora_request: Optional[LoRARequest] = None,
+            trace_headers: Optional[Dict[str, str]] = None,
+            prompt_adapter_request: Optional[PromptAdapterRequest] = None
     ) -> None:
         if lora_request is not None and not self.lora_config:
             raise ValueError(f"Got lora_request {lora_request} but LoRA is "
@@ -301,7 +310,10 @@ class _AsyncLLMEngine(LLMEngine):
             arrival_time = time.time()
 
         processed_inputs = await self.process_model_inputs_async(
-            request_id=request_id, inputs=inputs, lora_request=lora_request)
+            request_id=request_id,
+            inputs=inputs,
+            lora_request=lora_request,
+            prompt_adapter_request=prompt_adapter_request)
 
         self._add_processed_request(
             request_id=request_id,
@@ -309,6 +321,7 @@ class _AsyncLLMEngine(LLMEngine):
             params=params,
             arrival_time=arrival_time,
             lora_request=lora_request,
+            prompt_adapter_request=prompt_adapter_request,
             trace_headers=trace_headers,
         )
 
@@ -627,6 +640,7 @@ class AsyncLLMEngine:
         arrival_time: Optional[float] = None,
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Dict[str, str]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None
     ) -> AsyncStream:
         if self.log_requests:
             if isinstance(inputs, str):
@@ -669,7 +683,7 @@ class AsyncLLMEngine:
             arrival_time=arrival_time,
             lora_request=lora_request,
             trace_headers=trace_headers,
-        )
+            prompt_adapter_request=prompt_adapter_request)
 
         return stream
 
@@ -680,6 +694,7 @@ class AsyncLLMEngine:
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Dict[str, str]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None
     ) -> AsyncIterator[RequestOutput]:
         """Generate outputs for a request.
 
@@ -695,6 +710,8 @@ class AsyncLLMEngine:
             request_id: The unique id of the request.
             lora_request: LoRA request to use for generation, if any.
             trace_headers: OpenTelemetry trace headers.
+            prompt_adapter_request: Prompt Adapter request to use 
+                                            for generation, if any.
 
         Yields:
             The output `RequestOutput` objects from the LLMEngine
@@ -749,6 +766,7 @@ class AsyncLLMEngine:
                 sampling_params,
                 lora_request=lora_request,
                 trace_headers=trace_headers,
+                prompt_adapter_request=prompt_adapter_request,
         ):
             yield LLMEngine.validate_output(output, RequestOutput)
 
@@ -837,6 +855,7 @@ class AsyncLLMEngine:
         *,
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Dict[str, str]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> AsyncIterator[Union[RequestOutput, EmbeddingRequestOutput]]:
         """Common logic to process requests with SamplingParams or
         PoolingParams."""
@@ -849,6 +868,7 @@ class AsyncLLMEngine:
             arrival_time=arrival_time,
             lora_request=lora_request,
             trace_headers=trace_headers,
+            prompt_adapter_request=prompt_adapter_request,
         )
 
         try:
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index de7604ece..b476594fc 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -8,7 +8,8 @@ from transformers import PreTrainedTokenizer
 
 from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig,
                          LoRAConfig, ModelConfig, MultiModalConfig,
-                         ObservabilityConfig, ParallelConfig, SchedulerConfig,
+                         ObservabilityConfig, ParallelConfig,
+                         PromptAdapterConfig, SchedulerConfig,
                          SpeculativeConfig)
 from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler,
                                  SchedulerOutputs)
@@ -27,6 +28,7 @@ from vllm.lora.request import LoRARequest
 from vllm.outputs import (EmbeddingRequestOutput, RequestOutput,
                           RequestOutputFactory)
 from vllm.pooling_params import PoolingParams
+from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (EmbeddingSequenceGroupOutput, ExecuteModelRequest,
                            PoolerOutput, SamplerOutput, Sequence,
@@ -93,6 +95,8 @@ class LLMEngine:
             decoding.
         executor_class: The model executor class for managing distributed
             execution.
+        prompt_adapter_config (Optional): The configuration related to serving 
+            prompt adapters.
         log_stats: Whether to log statistics.
         usage_context: Specified entry point, used for usage info collection.
     """
@@ -161,6 +165,7 @@ class LLMEngine:
         speculative_config: Optional[SpeculativeConfig],
         decoding_config: Optional[DecodingConfig],
         observability_config: Optional[ObservabilityConfig],
+        prompt_adapter_config: Optional[PromptAdapterConfig],
         executor_class: Type[ExecutorBase],
         log_stats: bool,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
@@ -222,6 +227,7 @@ class LLMEngine:
         self.speculative_config = speculative_config
         self.load_config = load_config
         self.decoding_config = decoding_config or DecodingConfig()
+        self.prompt_adapter_config = prompt_adapter_config
         self.observability_config = observability_config or ObservabilityConfig(
         )
         self.log_stats = log_stats
@@ -250,6 +256,7 @@ class LLMEngine:
             multimodal_config=multimodal_config,
             speculative_config=speculative_config,
             load_config=load_config,
+            prompt_adapter_config=prompt_adapter_config,
         )
 
         if not self.model_config.embedding_mode:
@@ -282,6 +289,8 @@ class LLMEngine:
                     # Feature flags
                     "enable_lora":
                     bool(lora_config),
+                    "enable_prompt_adapter":
+                    bool(prompt_adapter_config),
                     "enable_prefix_caching":
                     cache_config.enable_prefix_caching,
                     "enforce_eager":
@@ -376,7 +385,6 @@ class LLMEngine:
         engine_config = engine_args.create_engine_config()
         distributed_executor_backend = (
             engine_config.parallel_config.distributed_executor_backend)
-
         # Initialize the cluster and specify the executor class.
         if engine_config.device_config.device_type == "neuron":
             from vllm.executor.neuron_executor import NeuronExecutor
@@ -409,7 +417,6 @@ class LLMEngine:
         else:
             from vllm.executor.gpu_executor import GPUExecutor
             executor_class = GPUExecutor
-
         # Create the LLM engine.
         engine = cls(
             **engine_config.to_dict(),
@@ -470,6 +477,9 @@ class LLMEngine:
             self.lora_config.verify_with_model_config(self.model_config)
             self.lora_config.verify_with_scheduler_config(
                 self.scheduler_config)
+        if self.prompt_adapter_config:
+            self.prompt_adapter_config.verify_with_model_config(
+                self.model_config)
 
     def _get_eos_token_id(
             self, lora_request: Optional[LoRARequest]) -> Optional[int]:
@@ -487,6 +497,7 @@ class LLMEngine:
         params: Union[SamplingParams, PoolingParams],
         arrival_time: float,
         lora_request: Optional[LoRARequest],
+        prompt_adapter_request: Optional[PromptAdapterRequest],
         trace_headers: Optional[Dict[str, str]] = None,
     ) -> None:
         # Create the sequences.
@@ -495,7 +506,7 @@ class LLMEngine:
         eos_token_id = self._get_eos_token_id(lora_request)
 
         seq = Sequence(seq_id, processed_inputs, block_size, eos_token_id,
-                       lora_request)
+                       lora_request, prompt_adapter_request)
 
         # Create a SequenceGroup based on SamplingParams or PoolingParams
         if isinstance(params, SamplingParams):
@@ -506,7 +517,7 @@ class LLMEngine:
                 arrival_time=arrival_time,
                 lora_request=lora_request,
                 trace_headers=trace_headers,
-            )
+                prompt_adapter_request=prompt_adapter_request)
         elif isinstance(params, PoolingParams):
             seq_group = self._create_sequence_group_with_pooling(
                 request_id,
@@ -514,7 +525,7 @@ class LLMEngine:
                 params,
                 arrival_time=arrival_time,
                 lora_request=lora_request,
-            )
+                prompt_adapter_request=prompt_adapter_request)
         else:
             raise ValueError(
                 "Either SamplingParams or PoolingParams must be provided.")
@@ -535,6 +546,7 @@ class LLMEngine:
         request_id: str,
         inputs: PromptInputs,
         lora_request: Optional[LoRARequest] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> LLMInputs:
         if isinstance(inputs, str):
             inputs = {"prompt": inputs}
@@ -549,6 +561,11 @@ class LLMEngine:
         else:
             prompt_token_ids = inputs["prompt_token_ids"]
 
+        if prompt_adapter_request:
+            prompt_token_ids = \
+                [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens\
+                         + prompt_token_ids
+
         llm_inputs = LLMInputs(prompt_token_ids=prompt_token_ids,
                                prompt=inputs.get("prompt"),
                                multi_modal_data=inputs.get("multi_modal_data"))
@@ -563,6 +580,7 @@ class LLMEngine:
         arrival_time: Optional[float] = None,
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Dict[str, str]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> None:
         """Add a request to the engine's request pool.
 
@@ -612,9 +630,11 @@ class LLMEngine:
         if arrival_time is None:
             arrival_time = time.time()
 
-        processed_inputs = self.process_model_inputs(request_id=request_id,
-                                                     inputs=inputs,
-                                                     lora_request=lora_request)
+        processed_inputs = self.process_model_inputs(
+            request_id=request_id,
+            inputs=inputs,
+            lora_request=lora_request,
+            prompt_adapter_request=prompt_adapter_request)
 
         self._add_processed_request(
             request_id=request_id,
@@ -622,6 +642,7 @@ class LLMEngine:
             params=params,
             arrival_time=arrival_time,
             lora_request=lora_request,
+            prompt_adapter_request=prompt_adapter_request,
             trace_headers=trace_headers,
         )
 
@@ -633,6 +654,7 @@ class LLMEngine:
         arrival_time: float,
         lora_request: Optional[LoRARequest],
         trace_headers: Optional[Dict[str, str]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> SequenceGroup:
         """Creates a SequenceGroup with SamplingParams."""
         max_logprobs = self.get_model_config().max_logprobs
@@ -658,7 +680,7 @@ class LLMEngine:
             sampling_params=sampling_params,
             lora_request=lora_request,
             trace_headers=trace_headers,
-        )
+            prompt_adapter_request=prompt_adapter_request)
 
         return seq_group
 
@@ -669,16 +691,19 @@ class LLMEngine:
         pooling_params: PoolingParams,
         arrival_time: float,
         lora_request: Optional[LoRARequest],
+        prompt_adapter_request: Optional[PromptAdapterRequest],
     ) -> SequenceGroup:
         """Creates a SequenceGroup with PoolingParams."""
         # Defensive copy of PoolingParams, which are used by the pooler
         pooling_params = pooling_params.clone()
         # Create the sequence group.
-        seq_group = SequenceGroup(request_id=request_id,
-                                  seqs=[seq],
-                                  arrival_time=arrival_time,
-                                  lora_request=lora_request,
-                                  pooling_params=pooling_params)
+        seq_group = SequenceGroup(
+            request_id=request_id,
+            seqs=[seq],
+            arrival_time=arrival_time,
+            lora_request=lora_request,
+            pooling_params=pooling_params,
+            prompt_adapter_request=prompt_adapter_request)
         return seq_group
 
     def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
@@ -1082,6 +1107,16 @@ class LLMEngine:
     def pin_lora(self, lora_id: int) -> bool:
         return self.model_executor.pin_lora(lora_id)
 
+    def add_prompt_adapter(
+            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
+        return self.model_executor.add_prompt_adapter(prompt_adapter_request)
+
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        return self.model_executor.remove_prompt_adapter(prompt_adapter_id)
+
+    def list_prompt_adapters(self) -> List[int]:
+        return self.model_executor.list_prompt_adapters()
+
     def check_health(self) -> None:
         if self.tokenizer:
             self.tokenizer.check_health()
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index e3e506d49..57e81a631 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -13,6 +13,7 @@ from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.outputs import EmbeddingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
+from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer import get_cached_tokenizer
 from vllm.usage.usage_lib import UsageContext
@@ -255,6 +256,7 @@ class LLM:
         prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None,
         use_tqdm: bool = True,
         lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> List[RequestOutput]:
         """Generates the completions for the input prompts.
 
@@ -271,6 +273,8 @@ class LLM:
                 prompts and it is paired one by one with the prompt.
             use_tqdm: Whether to use tqdm to display the progress bar.
             lora_request: LoRA request to use for generation, if any.
+            prompt_adapter_request: Prompt Adapter request to use for 
+                generation, if any.
 
         Returns:
             A list of `RequestOutput` objects containing the
@@ -304,7 +308,7 @@ class LLM:
             inputs=inputs,
             params=sampling_params,
             lora_request=lora_request,
-        )
+            prompt_adapter_request=prompt_adapter_request)
 
         outputs = self._run_engine(use_tqdm=use_tqdm)
         return LLMEngine.validate_outputs(outputs, RequestOutput)
@@ -397,6 +401,7 @@ class LLM:
         prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None,
         use_tqdm: bool = True,
         lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> List[EmbeddingRequestOutput]:
         """Generates the completions for the input prompts.
 
@@ -412,6 +417,8 @@ class LLM:
                 use the default pooling parameters.
             use_tqdm: Whether to use tqdm to display the progress bar.
             lora_request: LoRA request to use for generation, if any.
+            prompt_adapter_request: Prompt Adapter request to use for 
+                generation, if any.
 
         Returns:
             A list of `EmbeddingRequestOutput` objects containing the
@@ -445,6 +452,7 @@ class LLM:
             inputs=inputs,
             params=pooling_params,
             lora_request=lora_request,
+            prompt_adapter_request=prompt_adapter_request,
         )
 
         outputs = self._run_engine(use_tqdm=use_tqdm)
@@ -504,6 +512,7 @@ class LLM:
         params: Union[SamplingParams, Sequence[SamplingParams], PoolingParams,
                       Sequence[PoolingParams]],
         lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]],
+        prompt_adapter_request: Optional[PromptAdapterRequest],
     ) -> None:
         if isinstance(inputs, (str, dict)):
             # Convert a single prompt to a list.
@@ -526,19 +535,23 @@ class LLM:
                 params[i] if isinstance(params, Sequence) else params,
                 lora_request=lora_request[i] if isinstance(
                     lora_request, Sequence) else lora_request,
-            )
+                prompt_adapter_request=prompt_adapter_request)
 
     def _add_request(
-        self,
-        inputs: PromptInputs,
-        params: Union[SamplingParams, PoolingParams],
-        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
+            self,
+            inputs: PromptInputs,
+            params: Union[SamplingParams, PoolingParams],
+            lora_request: Optional[Union[List[LoRARequest],
+                                         LoRARequest]] = None,
+            prompt_adapter_request: Optional[PromptAdapterRequest] = None
     ) -> None:
         request_id = str(next(self.request_counter))
-        self.llm_engine.add_request(request_id,
-                                    inputs,
-                                    params,
-                                    lora_request=lora_request)
+        self.llm_engine.add_request(
+            request_id,
+            inputs,
+            params,
+            lora_request=lora_request,
+            prompt_adapter_request=prompt_adapter_request)
 
     def _run_engine(
             self, *, use_tqdm: bool
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index d3ed1ec7a..6cba356c4 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -116,7 +116,7 @@ async def detokenize(request: DetokenizeRequest):
 
 @app.get("/v1/models")
 async def show_available_models():
-    models = await openai_serving_chat.show_available_models()
+    models = await openai_serving_completion.show_available_models()
     return JSONResponse(content=models.model_dump())
 
 
@@ -236,7 +236,8 @@ if __name__ == "__main__":
                                             args.lora_modules,
                                             args.chat_template)
     openai_serving_completion = OpenAIServingCompletion(
-        engine, model_config, served_model_names, args.lora_modules)
+        engine, model_config, served_model_names, args.lora_modules,
+        args.prompt_adapters)
     openai_serving_embedding = OpenAIServingEmbedding(engine, model_config,
                                                       served_model_names)
     app.root_path = args.root_path
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 59ad73bf0..81c474ecc 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -9,7 +9,8 @@ import json
 import ssl
 
 from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
-from vllm.entrypoints.openai.serving_engine import LoRAModulePath
+from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
+                                                    PromptAdapterPath)
 from vllm.utils import FlexibleArgumentParser
 
 
@@ -23,6 +24,16 @@ class LoRAParserAction(argparse.Action):
         setattr(namespace, self.dest, lora_list)
 
 
+class PromptAdapterParserAction(argparse.Action):
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        adapter_list = []
+        for item in values:
+            name, path = item.split('=')
+            adapter_list.append(PromptAdapterPath(name, path))
+        setattr(namespace, self.dest, adapter_list)
+
+
 def make_arg_parser():
     parser = FlexibleArgumentParser(
         description="vLLM OpenAI-Compatible RESTful API server.")
@@ -65,6 +76,14 @@ def make_arg_parser():
         action=LoRAParserAction,
         help="LoRA module configurations in the format name=path. "
         "Multiple modules can be specified.")
+    parser.add_argument(
+        "--prompt-adapters",
+        type=nullable_str,
+        default=None,
+        nargs='+',
+        action=PromptAdapterParserAction,
+        help="Prompt adapter configurations in the format name=path. "
+        "Multiple adapters can be specified.")
     parser.add_argument("--chat-template",
                         type=nullable_str,
                         default=None,
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 415bdbbd7..010d6f2eb 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -258,7 +258,7 @@ class OpenAIServingChat(OpenAIServing):
                 prompt=prompt,
                 add_special_tokens=request.add_special_tokens)
             sampling_params = request.to_sampling_params()
-            lora_request = self._maybe_get_lora(request)
+            _, lora_request = self._maybe_get_adapter(request)
             decoding_config = await self.engine.get_decoding_config()
             guided_decoding_backend = request.guided_decoding_backend \
                 or decoding_config.guided_decoding_backend
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 9c719d634..b53b058b5 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -22,7 +22,8 @@ from vllm.entrypoints.openai.protocol import (CompletionLogProbs,
                                               TokenizeResponse, UsageInfo)
 # yapf: enable
 from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
-                                                    OpenAIServing)
+                                                    OpenAIServing,
+                                                    PromptAdapterPath)
 from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding import (
     get_guided_decoding_logits_processor)
@@ -67,11 +68,13 @@ class OpenAIServingCompletion(OpenAIServing):
 
     def __init__(self, engine: AsyncLLMEngine, model_config: ModelConfig,
                  served_model_names: List[str],
-                 lora_modules: Optional[List[LoRAModulePath]]):
+                 lora_modules: Optional[List[LoRAModulePath]],
+                 prompt_adapters: Optional[List[PromptAdapterPath]]):
         super().__init__(engine=engine,
                          model_config=model_config,
                          served_model_names=served_model_names,
-                         lora_modules=lora_modules)
+                         lora_modules=lora_modules,
+                         prompt_adapters=prompt_adapters)
 
     async def create_completion(self, request: CompletionRequest,
                                 raw_request: Request):
@@ -101,7 +104,12 @@ class OpenAIServingCompletion(OpenAIServing):
         generators: List[AsyncIterator[RequestOutput]] = []
         try:
             sampling_params = request.to_sampling_params()
-            lora_request = self._maybe_get_lora(request)
+            adapter_type, adapter_request = self._maybe_get_adapter(request)
+            lora_request, prompt_adapter_request = None, None
+            if adapter_type == 'LoRA':
+                lora_request, prompt_adapter_request = adapter_request, None
+            elif adapter_type == 'PromptAdapter':
+                lora_request, prompt_adapter_request = None, adapter_request
             decoding_config = await self.engine.get_decoding_config()
             guided_decoding_backend = request.guided_decoding_backend \
                 or decoding_config.guided_decoding_backend
@@ -147,6 +155,7 @@ class OpenAIServingCompletion(OpenAIServing):
                     sampling_params,
                     f"{request_id}-{i}",
                     lora_request=lora_request,
+                    prompt_adapter_request=prompt_adapter_request,
                     trace_headers=trace_headers,
                 )
 
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 8d281c51f..58e6571d3 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -16,12 +16,19 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               ModelPermission, TokenizeRequest)
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
+from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import Logprob
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 logger = init_logger(__name__)
 
 
+@dataclass
+class PromptAdapterPath:
+    name: str
+    local_path: str
+
+
 @dataclass
 class LoRAModulePath:
     name: str
@@ -30,9 +37,14 @@ class LoRAModulePath:
 
 class OpenAIServing:
 
-    def __init__(self, engine: AsyncLLMEngine, model_config: ModelConfig,
-                 served_model_names: List[str],
-                 lora_modules: Optional[List[LoRAModulePath]]):
+    def __init__(
+        self,
+        engine: AsyncLLMEngine,
+        model_config: ModelConfig,
+        served_model_names: List[str],
+        lora_modules: Optional[List[LoRAModulePath]],
+        prompt_adapters: Optional[List[PromptAdapterPath]] = None,
+    ):
         super().__init__()
 
         self.engine = engine
@@ -49,9 +61,8 @@ class OpenAIServing:
 
         self.served_model_names = served_model_names
 
-        if lora_modules is None:
-            self.lora_requests = []
-        else:
+        self.lora_requests = []
+        if lora_modules is not None:
             self.lora_requests = [
                 LoRARequest(
                     lora_name=lora.name,
@@ -60,6 +71,20 @@ class OpenAIServing:
                 ) for i, lora in enumerate(lora_modules, start=1)
             ]
 
+        self.prompt_adapter_requests = []
+        if prompt_adapters is not None:
+            for i, prompt_adapter in enumerate(prompt_adapters, start=1):
+                with open(f"./{prompt_adapter.local_path}"
+                          f"/adapter_config.json") as f:
+                    adapter_config = json.load(f)
+                    num_virtual_tokens = adapter_config["num_virtual_tokens"]
+                self.prompt_adapter_requests.append(
+                    PromptAdapterRequest(
+                        prompt_adapter_name=prompt_adapter.name,
+                        prompt_adapter_id=i,
+                        prompt_adapter_local_path=prompt_adapter.local_path,
+                        prompt_adapter_num_virtual_tokens=num_virtual_tokens))
+
     async def show_available_models(self) -> ModelList:
         """Show available models. Right now we only have one model."""
         model_cards = [
@@ -75,7 +100,14 @@ class OpenAIServing:
                       permission=[ModelPermission()])
             for lora in self.lora_requests
         ]
+        prompt_adapter_cards = [
+            ModelCard(id=prompt_adapter.prompt_adapter_name,
+                      root=self.served_model_names[0],
+                      permission=[ModelPermission()])
+            for prompt_adapter in self.prompt_adapter_requests
+        ]
         model_cards.extend(lora_cards)
+        model_cards.extend(prompt_adapter_cards)
         return ModelList(data=model_cards)
 
     def create_error_response(
@@ -109,20 +141,29 @@ class OpenAIServing:
             return None
         if request.model in [lora.lora_name for lora in self.lora_requests]:
             return None
+        if request.model in [
+                prompt_adapter.prompt_adapter_name
+                for prompt_adapter in self.prompt_adapter_requests
+        ]:
+            return None
         return self.create_error_response(
             message=f"The model `{request.model}` does not exist.",
             err_type="NotFoundError",
             status_code=HTTPStatus.NOT_FOUND)
 
-    def _maybe_get_lora(
+    def _maybe_get_adapter(
         self, request: Union[CompletionRequest, ChatCompletionRequest,
                              EmbeddingRequest]
-    ) -> Optional[LoRARequest]:
+    ) -> Tuple[Optional[str], Optional[Union[LoRARequest,
+                                             PromptAdapterRequest]]]:
         if request.model in self.served_model_names:
-            return None
+            return None, None
         for lora in self.lora_requests:
             if request.model == lora.lora_name:
-                return lora
+                return 'LoRA', lora
+        for prompt_adapter in self.prompt_adapter_requests:
+            if request.model == prompt_adapter.prompt_adapter_name:
+                return 'PromptAdapter', prompt_adapter
         # if _check_model has been called earlier, this will be unreachable
         raise ValueError(f"The model `{request.model}` does not exist.")
 
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index 3b5621f70..d3b60e3ff 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -7,6 +7,7 @@ from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
+from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                         make_async)
@@ -48,6 +49,7 @@ class CPUExecutor(ExecutorBase):
             lora_config=self.lora_config,
             multimodal_config=self.multimodal_config,
             kv_cache_dtype=self.cache_config.cache_dtype,
+            prompt_adapter_config=self.prompt_adapter_config,
             is_driver_worker=True,
         )
         self.driver_worker.init_device()
@@ -90,6 +92,19 @@ class CPUExecutor(ExecutorBase):
     def list_loras(self) -> Set[int]:
         return self.driver_worker.list_loras()
 
+    def add_prompt_adapter(
+            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
+        return self.driver_worker.add_prompt_adapter(prompt_adapter_request)
+
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        return self.driver_worker.remove_prompt_adapter(prompt_adapter_id)
+
+    def list_prompt_adapters(self) -> Set[int]:
+        return self.driver_worker.list_prompt_adapters()
+
+    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        return self.driver_worker.pin_prompt_adapter(prompt_adapter_id)
+
     def check_health(self) -> None:
         # CPUExecutor will always be healthy as long as
         # it's running.
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index fc18dec0b..6f9e55445 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -4,8 +4,10 @@ from typing import List, Optional, Set, Tuple
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, MultiModalConfig, ParallelConfig,
-                         SchedulerConfig, SpeculativeConfig)
+                         PromptAdapterConfig, SchedulerConfig,
+                         SpeculativeConfig)
 from vllm.lora.request import LoRARequest
+from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 
 
@@ -28,6 +30,7 @@ class ExecutorBase(ABC):
         lora_config: Optional[LoRAConfig],
         multimodal_config: Optional[MultiModalConfig],
         speculative_config: Optional[SpeculativeConfig],
+        prompt_adapter_config: Optional[PromptAdapterConfig],
     ) -> None:
         self.model_config = model_config
         self.cache_config = cache_config
@@ -38,6 +41,7 @@ class ExecutorBase(ABC):
         self.device_config = device_config
         self.multimodal_config = multimodal_config
         self.speculative_config = speculative_config
+        self.prompt_adapter_config = prompt_adapter_config
 
         self._init_executor()
 
@@ -95,6 +99,23 @@ class ExecutorBase(ABC):
     def list_loras(self) -> Set[int]:
         raise NotImplementedError
 
+    @abstractmethod
+    def add_prompt_adapter(
+            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
+        raise NotImplementedError
+
+    @abstractmethod
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError
+
+    @abstractmethod
+    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError  # type: ignore
+
+    @abstractmethod
+    def list_prompt_adapters(self) -> Set[int]:
+        raise NotImplementedError
+
     @abstractmethod
     def check_health(self) -> None:
         """Checks if the executor is healthy. If not, it should raise an
@@ -122,12 +143,14 @@ class ExecutorAsyncBase(ExecutorBase):
         lora_config: Optional[LoRAConfig],
         multimodal_config: Optional[MultiModalConfig],
         speculative_config: Optional[SpeculativeConfig],
+        prompt_adapter_config: Optional[PromptAdapterConfig],
     ) -> None:
         self.pp_locks: Optional[List[asyncio.Lock]] = None
 
         super().__init__(model_config, cache_config, parallel_config,
                          scheduler_config, device_config, load_config,
-                         lora_config, multimodal_config, speculative_config)
+                         lora_config, multimodal_config, speculative_config,
+                         prompt_adapter_config)
 
     @abstractmethod
     async def execute_model_async(
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index 7d3183a42..6ffc28d21 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
+from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest, PoolerOutput, SamplerOutput
 from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                         make_async)
@@ -45,6 +46,7 @@ class GPUExecutor(ExecutorBase):
             lora_config=self.lora_config,
             multimodal_config=self.multimodal_config,
             speculative_config=self.speculative_config,
+            prompt_adapter_config=self.prompt_adapter_config,
             is_driver_worker=(not self.parallel_config)
             or (rank % self.parallel_config.tensor_parallel_size == 0),
         )
@@ -107,6 +109,25 @@ class GPUExecutor(ExecutorBase):
     def list_loras(self) -> Set[int]:
         return self.driver_worker.list_loras()
 
+    def add_prompt_adapter(
+            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
+        assert prompt_adapter_request.prompt_adapter_id > 0, \
+            "prompt_adapter_id must be greater than 0."
+        return self.driver_worker.add_prompt_adapter(prompt_adapter_request)
+
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        assert prompt_adapter_id > 0, \
+            "prompt_adapter_id must be greater than 0."
+        return self.driver_worker.remove_prompt_adapter(prompt_adapter_id)
+
+    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        assert prompt_adapter_id > 0, \
+                "prompt_adapter_id must be greater than 0."
+        return self.driver_worker.pin_prompt_adapter(prompt_adapter_id)
+
+    def list_prompt_adapters(self) -> Set[int]:
+        return self.driver_worker.list_prompt_adapters()
+
     def check_health(self) -> None:
         # GPUExecutor will always be healthy as long as
         # it's running.
diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py
index f02d49783..33f9321b5 100644
--- a/vllm/executor/ray_xpu_executor.py
+++ b/vllm/executor/ray_xpu_executor.py
@@ -8,7 +8,8 @@ from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Set,
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, MultiModalConfig, ParallelConfig,
-                         SchedulerConfig, SpeculativeConfig)
+                         PromptAdapterConfig, SchedulerConfig,
+                         SpeculativeConfig)
 from vllm.executor.distributed_gpu_executor import (  # yapf: disable
     DistributedGPUExecutor, DistributedGPUExecutorAsync)
 from vllm.executor.ray_utils import RayWorkerWrapper, ray
@@ -44,6 +45,7 @@ class RayXPUExecutor(DistributedGPUExecutor):
         load_config: LoadConfig,
         lora_config: Optional[LoRAConfig],
         multimodal_config: Optional[MultiModalConfig],
+        prompt_adapter_config: Optional[PromptAdapterConfig],
         speculative_config: Optional[SpeculativeConfig],
     ) -> None:
         assert device_config.device_type == "xpu"
@@ -58,6 +60,7 @@ class RayXPUExecutor(DistributedGPUExecutor):
         self.scheduler_config = scheduler_config
         self.device_config = device_config
         self.multimodal_config = multimodal_config
+        self.prompt_adapter_config = prompt_adapter_config
 
         placement_group = self.parallel_config.placement_group
 
diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py
index 29b246332..f6550cce9 100644
--- a/vllm/executor/xpu_executor.py
+++ b/vllm/executor/xpu_executor.py
@@ -4,7 +4,8 @@ import torch
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, MultiModalConfig, ParallelConfig,
-                         SchedulerConfig, SpeculativeConfig)
+                         PromptAdapterConfig, SchedulerConfig,
+                         SpeculativeConfig)
 from vllm.executor.executor_base import ExecutorAsyncBase
 from vllm.executor.gpu_executor import GPUExecutor
 from vllm.logger import init_logger
@@ -27,6 +28,7 @@ class XPUExecutor(GPUExecutor):
         load_config: LoadConfig,
         lora_config: Optional[LoRAConfig],
         multimodal_config: Optional[MultiModalConfig],
+        prompt_adapter_config: Optional[PromptAdapterConfig],
         speculative_config: Optional[SpeculativeConfig],
     ) -> None:
         assert device_config.device_type == "xpu"
@@ -43,6 +45,7 @@ class XPUExecutor(GPUExecutor):
         self.scheduler_config = scheduler_config
         self.device_config = device_config
         self.multimodal_config = multimodal_config
+        self.prompt_adapter_config = prompt_adapter_config
         self.speculative_config = None
 
         # Instantiate the worker and load the model to GPU.
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index 0a63f9ef0..40de134c0 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -8,6 +8,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from transformers import PretrainedConfig
 
+from vllm.adapter_commons.layers import AdapterMapping
 from vllm.config import LoRAConfig
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
@@ -134,15 +135,8 @@ def _apply_lora_packed_nslice(
 
 
 @dataclass
-class LoRAMapping:
-    # Per every token in input_ids:
-    index_mapping: Tuple[int, ...]
-    # Per sampled token:
-    prompt_mapping: Tuple[int, ...]
-
-    def __post_init__(self):
-        self.index_mapping = tuple(self.index_mapping)
-        self.prompt_mapping = tuple(self.prompt_mapping)
+class LoRAMapping(AdapterMapping):
+    pass
 
 
 class BaseLayerWithLoRA(nn.Module):
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 689835def..e1ede7d4d 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -4,12 +4,17 @@ import math
 import os
 import re
 from dataclasses import dataclass, field
-from typing import Callable, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
 
 import safetensors.torch
 import torch
 from torch import nn
 
+from vllm.adapter_commons.models import (AdapterLRUCache, AdapterModel,
+                                         AdapterModelManager)
+from vllm.adapter_commons.utils import (add_adapter, deactivate_adapter,
+                                        get_adapter, list_adapters,
+                                        remove_adapter, set_adapter_mapping)
 from vllm.config import LoRAConfig
 from vllm.logger import init_logger
 from vllm.lora.layers import (BaseLayerWithLoRA,
@@ -19,7 +24,7 @@ from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
 from vllm.lora.utils import (from_layer, from_layer_logits_processor,
                              parse_fine_tuned_lora_name, replace_submodule)
 from vllm.model_executor.models.interfaces import SupportsLoRA
-from vllm.utils import LRUCache, is_pin_memory_available
+from vllm.utils import is_pin_memory_available
 
 logger = init_logger(__name__)
 
@@ -153,7 +158,7 @@ def get_lora_id():
     return _GLOBAL_LORA_ID
 
 
-class LoRAModel:
+class LoRAModel(AdapterModel):
     """A LoRA fine-tuned model."""
 
     def __init__(
@@ -388,7 +393,7 @@ class LoRAModel:
         )
 
 
-class LoRAModelManager:
+class LoRAModelManager(AdapterModelManager):
     """A manager that manages multiple LoRA-fine-tuned models."""
 
     def __init__(
@@ -440,8 +445,7 @@ class LoRAModelManager:
         # base_indices, sampler_indices, sampler_indices_padded,
         # embeddings_indices
         self.indices_len: List[Optional[int]] = [None] * 4
-
-        self.model = model
+        super().__init__(model)
         if hasattr(self.model, "supported_lora_modules"):
             self.supported_lora_modules = copy.deepcopy(
                 self.model.supported_lora_modules)
@@ -453,11 +457,11 @@ class LoRAModelManager:
                 self.model.packed_modules_mapping)
         self.packed_modules: Dict[str, List[str]] = {}
         self.modules: Dict[str, "BaseLayerWithLoRA"] = {}
-        self._registered_loras: Dict[int, LoRAModel] = {}
         # Dict instead of a Set for compatibility with LRUCache.
-        self._active_loras: Dict[int, None] = {}
         self._last_mapping: Optional[LoRAMapping] = None
         self._create_lora_modules()
+        self.model.lora_manager = self
+        self.adapter_type = 'LoRa'
 
     @property
     def capacity(self) -> int:
@@ -467,15 +471,16 @@ class LoRAModelManager:
     def lora_slots(self) -> int:
         return self.lora_config.max_loras
 
-    def __len__(self) -> int:
-        return len(self._registered_loras)
+    @property
+    def adapter_slots(self) -> int:
+        return self.lora_slots
 
-    def activate_lora(
+    def activate_adapter(
         self,
         lora_id: int,
     ) -> bool:
         """Move LoRA into a GPU buffer to be used in the forward pass."""
-        if lora_id in self._active_loras:
+        if lora_id in self._active_adapters:
             return False
         first_free_slot = next(
             ((i, lora_id) for i, lora_id in enumerate(self.lora_index_to_id)
@@ -483,8 +488,8 @@ class LoRAModelManager:
         if first_free_slot is None:
             raise ValueError("No free lora slots")
         index, _ = first_free_slot
-        self._active_loras[lora_id] = None
-        lora_model = self._registered_loras[lora_id]
+        self._active_adapters[lora_id] = None
+        lora_model = self._registered_adapters[lora_id]
         logger.debug("Activating LoRA. int id: %d, slot index: %d",
                      lora_model.id, index)
         self.lora_index_to_id[index] = lora_model.id
@@ -498,21 +503,13 @@ class LoRAModelManager:
                 module.reset_lora(index)
         return True
 
-    def _deactivate_lora(self, lora_id: int):
+    def _deactivate_adapter(self, lora_id: int):
         try:
             index = self.lora_index_to_id.index(lora_id)
             self.lora_index_to_id[index] = None
         except ValueError:
             pass
 
-    def deactivate_lora(self, lora_id: int) -> bool:
-        """Remove a LoRA from a GPU buffer."""
-        if lora_id in self._active_loras:
-            self._deactivate_lora(lora_id)
-            self._active_loras.pop(lora_id)
-            return True
-        return False
-
     def _set_long_lora_context(self, lora: LoRAModel):
         if self.long_lora_context is None:
             return
@@ -528,40 +525,19 @@ class LoRAModelManager:
         if offsets:
             self.long_lora_context.offsets_by_lora_id[lora.id] = offsets
 
-    def _add_lora(self, lora: LoRAModel):
+    def _add_adapter(self, lora: LoRAModel):
         self._create_merged_loras_inplace(lora)
-        self._registered_loras[lora.id] = lora
+        self._registered_adapters[lora.id] = lora
         self._set_long_lora_context(lora)
 
-    def add_lora(self, lora: LoRAModel) -> bool:
-        """Add a LoRAModel to the manager CPU cache."""
-        logger.debug(
-            "Adding lora. Model id: %d, "
-            "int id: %d, "
-            "scaling factor: %s", lora.id, lora.id, lora.scaling_factor)
-        if lora.id not in self._registered_loras:
-            if len(self._registered_loras) >= self.capacity:
-                raise RuntimeError("No free LoRA slots.")
-            self._add_lora(lora)
-            return True
-        return False
-
-    def remove_lora(self, lora_id: int) -> bool:
-        """Remove a LoRAModel from the manager CPU cache."""
-        # TODO: should we check active lora?
-        self.deactivate_lora(lora_id)
-        if self.long_lora_context:
-            self.long_lora_context.offsets_by_lora_id.pop(lora_id, None)
-        return bool(self._registered_loras.pop(lora_id, None))
-
-    def pin_lora(self, lora_id: int) -> bool:
+    def pin_adapter(self, lora_id: int) -> bool:
         """Pin a LoRAModel in the manager cache."""
         raise NotImplementedError(
             "Pinning is not supported in LoRAModelManager."
             "Use LRUCacheLoRAModelManager for pinning")  # type: ignore
 
     # TODO see if this can be vectorized
-    def _set_lora_mapping(self, mapping: LoRAMapping) -> None:
+    def _set_adapter_mapping(self, mapping: LoRAMapping) -> None:
         (base_indices, sampler_indices, sampler_indices_padded,
          embeddings_indices, long_lora_offsets_tensor,
          indices_len) = convert_mapping(mapping, self.lora_index_to_id,
@@ -583,23 +559,11 @@ class LoRAModelManager:
         # Maintain the reference
         self.indices_len[:] = indices_len
 
-    def set_lora_mapping(self, lora_mapping: LoRAMapping) -> None:
-        if self._last_mapping != lora_mapping:
-            self._set_lora_mapping(lora_mapping)
-        self._last_mapping = lora_mapping
-
-    def list_loras(self) -> Dict[int, LoRAModel]:
-        """List all registered LoRAModels."""
-        return dict(self._registered_loras)
-
-    def get_lora(self, lora_id: int) -> Optional[LoRAModel]:
-        return self._registered_loras.get(lora_id, None)
-
-    def remove_all_loras(self):
+    def remove_all_adapters(self):
         """Remove all LoRAModels from the manager."""
-        self._registered_loras.clear()
+        self._registered_adapters.clear()
         self.lora_index_to_id = [None] * self.lora_slots
-        self._active_loras.clear()
+        self._active_adapters.clear()
 
     def _create_lora_modules(self):
         for module_name, module in self.model.named_modules(
@@ -743,18 +707,39 @@ class LoRAModelManager:
             lora_model.loras[module_name] = PackedLoRALayerWeights.pack(
                 replacement_loras)
 
+    def deactivate_adapter(self, adapter_id: int) -> bool:
+        return deactivate_adapter(adapter_id, self._active_adapters,
+                                  self._deactivate_adapter)
+
+    def add_adapter(self, adapter: LoRAModel) -> bool:
+        logger.debug(
+            "Adding lora. Model id: %d, "
+            "int id: %d, "
+            "scaling factor: %s", adapter.id, adapter.id,
+            adapter.scaling_factor)
+        return add_adapter(adapter, self._registered_adapters, self.capacity,
+                           self._add_adapter)
 
-class LoRALRUCache(LRUCache[LoRAModel]):
+    def set_adapter_mapping(self, mapping: LoRAMapping) -> None:
+        self._last_mapping = set_adapter_mapping(mapping, self._last_mapping,
+                                                 self._set_adapter_mapping)
+
+    def remove_adapter(self, adapter_id: int) -> bool:
+        return remove_adapter(adapter_id, self._registered_adapters,
+                              self.deactivate_adapter)
+
+    def list_adapters(self) -> Dict[int, Any]:
+        return list_adapters(self._registered_adapters)
+
+    def get_adapter(self, adapter_id: int) -> Optional[Any]:
+        return get_adapter(adapter_id, self._registered_adapters)
+
+
+class LoRALRUCache(AdapterLRUCache[LoRAModel]):
 
     def __init__(self, capacity: int, deactivate_lora_fn: Callable[[int],
                                                                    bool]):
-        super().__init__(capacity)
-        self.deactivate_lora_fn = deactivate_lora_fn
-
-    def _on_remove(self, key: int, value: LoRAModel):
-        logger.debug("Removing LoRA. int id: %d", key)
-        self.deactivate_lora_fn(key)
-        return super()._on_remove(key, value)
+        super().__init__(capacity, deactivate_lora_fn)
 
 
 class LRUCacheLoRAModelManager(LoRAModelManager):
@@ -770,49 +755,49 @@ class LRUCacheLoRAModelManager(LoRAModelManager):
     ):
         super().__init__(model, max_num_seqs, max_num_batched_tokens,
                          vocab_size, lora_config)
-        self._registered_loras: LoRALRUCache = LoRALRUCache(
-            self.capacity, self.deactivate_lora)
-        self._active_loras: LoRALRUCache = LoRALRUCache(
-            self.lora_slots, self._deactivate_lora)
+        self._registered_adapters: LoRALRUCache = LoRALRUCache(
+            self.capacity, self.deactivate_adapter)
+        self._active_adapters: LoRALRUCache = LoRALRUCache(
+            self.lora_slots, self._deactivate_adapter)
 
-    def list_loras(self) -> Dict[int, LoRAModel]:
+    def list_adapters(self) -> Dict[int, LoRAModel]:
         """List all registered LoRAModels."""
-        return dict(self._registered_loras.cache)
+        return dict(self._registered_adapters.cache)
 
-    def add_lora(self, lora: LoRAModel) -> bool:
+    def add_adapter(self, lora: LoRAModel) -> bool:
         """Add a LoRAModel to the manager."""
         logger.debug(
             "Adding lora. Model id: %d, "
             "int id: %d, "
             "scaling factor: %s", lora.id, lora.id, lora.scaling_factor)
-        if lora.id not in self._registered_loras:
-            self._add_lora(lora)
+        if lora.id not in self._registered_adapters:
+            self._add_adapter(lora)
             was_added = True
         else:
             # We always touch to update the LRU cache order
-            self._registered_loras.touch(lora.id)
+            self._registered_adapters.touch(lora.id)
             was_added = False
         return was_added
 
-    def activate_lora(
+    def activate_adapter(
         self,
         lora_id: int,
     ) -> bool:
-        if lora_id not in self._active_loras and len(
-                self._active_loras) >= self.lora_slots:
-            self._active_loras.remove_oldest()
-        result = super().activate_lora(lora_id)
+        if lora_id not in self._active_adapters and len(
+                self._active_adapters) >= self.lora_slots:
+            self._active_adapters.remove_oldest()
+        result = super().activate_adapter(lora_id)
         # We always touch to update the LRU cache order
-        self._active_loras.touch(lora_id)
+        self._active_adapters.touch(lora_id)
         return result
 
-    def remove_oldest_lora(self) -> bool:
-        if len(self._registered_loras) > 0:
-            self._registered_loras.remove_oldest()
+    def remove_oldest_adapter(self) -> bool:
+        if len(self._registered_adapters) > 0:
+            self._registered_adapters.remove_oldest()
             return True
         return False
 
-    def pin_lora(self, lora_id: int) -> bool:
+    def pin_adapter(self, lora_id: int) -> bool:
         """Pin a LoRAModel in the manager cache."""
         self._pin_lora_in_cpu_cache(lora_id)
         self._pin_lora_in_gpu_cache(lora_id)
@@ -820,17 +805,17 @@ class LRUCacheLoRAModelManager(LoRAModelManager):
 
     def _pin_lora_in_cpu_cache(self, lora_id: int):
         try:
-            self._registered_loras.pin(lora_id)
+            self._registered_adapters.pin(lora_id)
         except ValueError as err:
             raise ValueError("Pinning failed. "
                              f"LoRA {lora_id} is not registered.") from err
 
     def _pin_lora_in_gpu_cache(self, lora_id: int):
-        if lora_id not in self._active_loras:
+        if lora_id not in self._active_adapters:
             # move lora to gpu if not already active
-            self.activate_lora(lora_id)
+            self.activate_adapter(lora_id)
 
-        self._active_loras.pin(lora_id)
+        self._active_adapters.pin(lora_id)
 
 
 def create_lora_manager(
diff --git a/vllm/lora/request.py b/vllm/lora/request.py
index 662774ffe..2d10d0377 100644
--- a/vllm/lora/request.py
+++ b/vllm/lora/request.py
@@ -1,13 +1,15 @@
 from dataclasses import dataclass
 from typing import Optional
 
+from vllm.adapter_commons.request import AdapterRequest
+
 
 @dataclass
-class LoRARequest:
+class LoRARequest(AdapterRequest):
     """
     Request for a LoRA adapter.
 
-    Note that this class should be be used internally. For online
+    Note that this class should be used internally. For online
     serving, it is recommended to not allow users to use this class but
     instead provide another layer of abstraction to prevent users from
     accessing unauthorized LoRA adapters.
@@ -20,15 +22,16 @@ class LoRARequest:
     lora_int_id: int
     lora_local_path: str
     long_lora_max_len: Optional[int] = None
+    __hash__ = AdapterRequest.__hash__
 
-    def __post_init__(self):
-        if self.lora_int_id < 1:
-            raise ValueError(
-                f"lora_int_id must be > 0, got {self.lora_int_id}")
+    @property
+    def adapter_id(self):
+        return self.lora_int_id
 
-    def __eq__(self, value: object) -> bool:
-        return isinstance(
-            value, LoRARequest) and self.lora_int_id == value.lora_int_id
+    @property
+    def name(self):
+        return self.lora_name
 
-    def __hash__(self) -> int:
-        return self.lora_int_id
+    @property
+    def local_path(self):
+        return self.lora_local_path
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index ca4903c23..3d0ef4252 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -1,12 +1,15 @@
-from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from typing import Any, Dict, List, Literal, Optional, Set, Type, Union
 
 import torch
 
+from vllm.adapter_commons.utils import (add_adapter_worker,
+                                        apply_adapters_worker,
+                                        list_adapters_worker,
+                                        set_active_adapters_worker)
+from vllm.adapter_commons.worker_manager import AbstractWorkerManager
 from vllm.config import LoRAConfig
 from vllm.logger import init_logger
-from vllm.lora.layers import LoRAMapping
 from vllm.lora.models import (LoRAModel, LoRAModelManager,
                               LRUCacheLoRAModelManager, create_lora_manager)
 from vllm.lora.request import LoRARequest
@@ -14,79 +17,13 @@ from vllm.lora.request import LoRARequest
 logger = init_logger(__name__)
 
 
-class AbstractWorkerLoRAManager(ABC):
-    """Abstract class for managing LoRA models on the worker side."""
-
-    def __init__(self,
-                 max_num_seqs: int,
-                 max_num_batched_tokens: int,
-                 vocab_size: int,
-                 lora_config: LoRAConfig,
-                 device: torch.device,
-                 max_position_embeddings: Optional[int] = None):
-        self.max_num_seqs = max_num_seqs
-        self.max_num_batched_tokens = max_num_batched_tokens
-        self.max_position_embeddings = max_position_embeddings
-        self.vocab_size = vocab_size
-        self.device = device
-        self.lora_config = lora_config
-
-        # If False, do not cache. If None, cache is empty.
-        self._cached_dummy_lora: Union[None, Literal[False], LoRAModel] = False
-
-    @contextmanager
-    def dummy_lora_cache(self):
-        """Use this context manager to reuse the dummy lora model
-        to avoid creating it repeatedly."""
-        self._cached_dummy_lora = None
-        yield
-        self._cached_dummy_lora = False
-
-    @property
-    @abstractmethod
-    def is_enabled(self) -> bool:
-        ...
-
-    @abstractmethod
-    def create_lora_manager(
-        self,
-        model: torch.nn.Module,
-    ) -> Any:
-        ...
-
-    @abstractmethod
-    def set_active_loras(self, lora_requests: Set[LoRARequest],
-                         lora_mapping: LoRAMapping) -> None:
-        ...
-
-    @abstractmethod
-    def add_lora(self, lora_request: LoRARequest) -> bool:
-        ...
-
-    @abstractmethod
-    def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool:
-        ...
-
-    @abstractmethod
-    def remove_lora(self, lora_id: int) -> bool:
-        ...
-
-    @abstractmethod
-    def remove_all_loras(self):
-        ...
-
-    @abstractmethod
-    def list_loras(self) -> Set[int]:
-        ...
-
-
-class WorkerLoRAManager(AbstractWorkerLoRAManager):
+class WorkerLoRAManager(AbstractWorkerManager):
     """WorkerLoRAManager that manages LoRA models on the worker side.
 
     Every request, the requested LoRAs will be loaded (unless they are already
     loaded), and every other LoRA will be unloaded."""
 
-    _lora_manager_cls: Type[LoRAModelManager] = LoRAModelManager
+    _manager_cls: Type[LoRAModelManager] = LoRAModelManager
 
     def __init__(
         self,
@@ -103,16 +40,23 @@ class WorkerLoRAManager(AbstractWorkerLoRAManager):
         self._lora_model_cls = lora_model_cls
         self.embedding_modules = embedding_modules
         self.embedding_padding_modules = embedding_padding_modules
+        self._cached_dummy_lora: Union[None, Literal[False], LoRAModel] = False
+        self.max_num_seqs = max_num_seqs
+        self.max_num_batched_tokens = max_num_batched_tokens
+        self.vocab_size = vocab_size
+        self.lora_config = lora_config
+        self.max_position_embeddings = max_position_embeddings
+        super().__init__(device)
         # Lazily initialized by create_lora_manager.
-        self._lora_manager: LoRAModelManager
-        super().__init__(
-            max_num_seqs,
-            max_num_batched_tokens,
-            vocab_size,
-            lora_config,
-            device,
-            max_position_embeddings=max_position_embeddings,
-        )
+        self._adapter_manager: LoRAModelManager
+
+    @contextmanager
+    def dummy_lora_cache(self):
+        """Use this context manager to reuse the dummy lora model
+        to avoid creating it repeatedly."""
+        self._cached_dummy_lora = None
+        yield
+        self._cached_dummy_lora = False
 
     @property
     def is_enabled(self) -> bool:
@@ -128,41 +72,14 @@ class WorkerLoRAManager(AbstractWorkerLoRAManager):
             max_num_batched_tokens=self.max_num_batched_tokens,
             vocab_size=self.vocab_size,
             lora_config=self.lora_config,
-            lora_manager_cls=self._lora_manager_cls,
+            lora_manager_cls=self._manager_cls,
         )
-        self._lora_manager = lora_manager
+        self._adapter_manager = lora_manager
         return lora_manager.model
 
-    def set_active_loras(self, lora_requests: Set[LoRARequest],
-                         lora_mapping: LoRAMapping) -> None:
-        self._apply_loras(lora_requests)
-        self._lora_manager.set_lora_mapping(lora_mapping)
-
-    def _apply_loras(self, lora_requests: Set[LoRARequest]) -> None:
-        loras_that_exist = self.list_loras()
-        loras_map = {
-            lora_request.lora_int_id: lora_request
-            for lora_request in lora_requests if lora_request
-        }
-        if len(loras_map) > self._lora_manager.lora_slots:
-            raise RuntimeError(
-                f"Number of requested LoRAs ({len(loras_map)}) is greater "
-                "than the number of GPU LoRA slots "
-                f"({self._lora_manager.lora_slots}).")
-
-        new_loras = set(loras_map)
-        loras_to_add = new_loras - loras_that_exist
-        loras_to_remove = loras_that_exist - new_loras
-
-        for lora_id in loras_to_remove:
-            self.remove_lora(lora_id)
-
-        for lora_id in loras_to_add:
-            self.add_lora(loras_map[lora_id])
-
-    def _load_lora(self, lora_request: LoRARequest) -> LoRAModel:
+    def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
         try:
-            model = self._lora_manager.model
+            model = self._adapter_manager.model
             supported_lora_modules = model.supported_lora_modules
             packed_modules_mapping = model.packed_modules_mapping
             expected_lora_modules: List[str] = []
@@ -198,37 +115,45 @@ class WorkerLoRAManager(AbstractWorkerLoRAManager):
         return lora
 
     def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool:
-        if lora_request.lora_int_id in self.list_loras():
+        if lora_request.lora_int_id in self.list_adapters():
             return False
         if isinstance(self._cached_dummy_lora, LoRAModel):
             dummy_lora = self._cached_dummy_lora.clone(
                 lora_request.lora_int_id)
         else:
-            dummy_lora = self._lora_manager.create_dummy_lora(
+            dummy_lora = self._adapter_manager.create_dummy_lora(
                 lora_request.lora_int_id, rank, 1, self.embedding_modules)
             if self._cached_dummy_lora is None:
                 self._cached_dummy_lora = dummy_lora
-        return self._lora_manager.add_lora(dummy_lora)
+        return self._adapter_manager.add_adapter(dummy_lora)
 
-    def add_lora(self, lora_request: LoRARequest) -> bool:
-        if lora_request.lora_int_id in self.list_loras():
-            return False
-        lora = self._load_lora(lora_request)
-        loaded = self._lora_manager.add_lora(lora)
-        self._lora_manager.activate_lora(lora.id)
-        return loaded
+    def pin_adapter(self, adapter_id: int) -> bool:
+        return self._adapter_manager.pin_adapter(adapter_id)
+
+    def set_active_adapters(self, requests: Set[Any],
+                            mapping: Optional[Any]) -> None:
+        set_active_adapters_worker(requests, mapping, self._apply_adapters,
+                                   self._adapter_manager.set_adapter_mapping)
+
+    def _apply_adapters(self, adapter_requests: Set[Any]) -> None:
+        apply_adapters_worker(adapter_requests, self.list_adapters,
+                              self._adapter_manager.adapter_slots,
+                              self.remove_adapter, self.add_adapter)
 
-    def remove_lora(self, lora_id: int) -> bool:
-        return self._lora_manager.remove_lora(lora_id)
+    def add_adapter(self, adapter_request: Any) -> bool:
+        return add_adapter_worker(adapter_request, self.list_adapters,
+                                  self._load_adapter,
+                                  self._adapter_manager.add_adapter,
+                                  self._adapter_manager.activate_adapter)
 
-    def pin_lora(self, lora_id: int) -> bool:
-        return self._lora_manager.pin_lora(lora_id)
+    def remove_adapter(self, adapter_id: int) -> bool:
+        return self._adapter_manager.remove_adapter(adapter_id)
 
-    def remove_all_loras(self):
-        self._lora_manager.remove_all_loras()
+    def remove_all_adapters(self):
+        self._adapter_manager.remove_all_adapters()
 
-    def list_loras(self) -> Set[int]:
-        return set(self._lora_manager.list_loras())
+    def list_adapters(self) -> Set[int]:
+        return list_adapters_worker(self._adapter_manager.list_adapters)
 
 
 class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
@@ -238,8 +163,7 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
     (unless they are already loaded) and least recently used LoRAs will
     be unloaded if the cache is above capacity."""
 
-    _lora_manager_cls: Type[
-        LRUCacheLoRAModelManager] = LRUCacheLoRAModelManager
+    _manager_cls: Type[LRUCacheLoRAModelManager] = LRUCacheLoRAModelManager
 
     def create_lora_manager(
         self,
@@ -247,40 +171,41 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
     ) -> Any:
         lora_manager = create_lora_manager(
             model,
-            lora_manager_cls=self._lora_manager_cls,
+            lora_manager_cls=self._manager_cls,
             max_num_seqs=self.max_num_seqs,
             vocab_size=self.vocab_size,
             lora_config=self.lora_config,
             max_num_batched_tokens=self.max_num_batched_tokens,
         )
-        self._lora_manager = lora_manager
+        self._adapter_manager = lora_manager
         return lora_manager.model
 
-    def _apply_loras(self, lora_requests: Set[LoRARequest]) -> None:
+    def _apply_adapters(self, lora_requests: Set[LoRARequest]) -> None:
         loras_map = {
             lora_request.lora_int_id: lora_request
             for lora_request in lora_requests if lora_request
         }
-        if len(loras_map) > self._lora_manager.lora_slots:
+        if len(loras_map) > self._adapter_manager.lora_slots:
             raise RuntimeError(
                 f"Number of requested LoRAs ({len(loras_map)}) is greater "
                 "than the number of GPU LoRA slots "
-                f"({self._lora_manager.lora_slots}).")
+                f"({self._adapter_manager.lora_slots}).")
         for lora in loras_map.values():
-            self.add_lora(lora)
+            self.add_adapter(lora)
 
-    def add_lora(self, lora_request: LoRARequest) -> bool:
-        if lora_request.lora_int_id not in self.list_loras():
+    def add_adapter(self, lora_request: LoRARequest) -> bool:
+        if lora_request.lora_int_id not in self.list_adapters():
             # Remove before we load the new lora to save memory
-            if len(self._lora_manager) + 1 > self._lora_manager.capacity:
-                assert isinstance(self._lora_manager, LRUCacheLoRAModelManager)
-                self._lora_manager.remove_oldest_lora()
-            lora = self._load_lora(lora_request)
-            loaded = self._lora_manager.add_lora(lora)
+            if len(self._adapter_manager) + 1 > self._adapter_manager.capacity:
+                assert isinstance(self._adapter_manager,
+                                  LRUCacheLoRAModelManager)
+                self._adapter_manager.remove_oldest_adapter()
+            lora = self._load_adapter(lora_request)
+            loaded = self._adapter_manager.add_adapter(lora)
         else:
             # If the lora is already loaded, just touch it to
             # update its position in the caches
-            loaded = self._lora_manager.get_lora(
+            loaded = self._adapter_manager.get_adapter(
                 lora_request.lora_int_id) is not None
-        self._lora_manager.activate_lora(lora_request.lora_int_id)
+        self._adapter_manager.activate_adapter(lora_request.lora_int_id)
         return loaded
diff --git a/vllm/prompt_adapter/__init__.py b/vllm/prompt_adapter/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vllm/prompt_adapter/layers.py b/vllm/prompt_adapter/layers.py
new file mode 100644
index 000000000..27a61e692
--- /dev/null
+++ b/vllm/prompt_adapter/layers.py
@@ -0,0 +1,80 @@
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+from torch import nn
+
+from vllm.adapter_commons.layers import AdapterMapping
+from vllm.config import PromptAdapterConfig
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding)
+
+
+@dataclass
+class PromptAdapterMapping(AdapterMapping):
+    pass
+
+
+class VocabParallelEmbeddingWithPromptAdapter(nn.Module):
+
+    def __init__(self, base_layer: VocabParallelEmbedding) -> None:
+        super().__init__()
+        self.base_layer = base_layer
+        self.emb_layer = self.base_layer
+        if 'LoRA' in base_layer.__class__.__name__:
+            self.emb_layer = self.base_layer.base_layer
+
+    def create_prompt_adapter_weights(
+            self, prompt_adapter_config: PromptAdapterConfig):
+        self.embeddings_tensors = torch.zeros(
+            (
+                prompt_adapter_config.max_prompt_adapters,
+                prompt_adapter_config.max_prompt_adapter_token,
+                self.emb_layer.embedding_dim,
+            ),
+            dtype=self.emb_layer.weight.dtype,
+            device=self.emb_layer.weight.device,
+        )
+        self.adapter_lengths = torch.zeros(
+            prompt_adapter_config.max_prompt_adapters,
+            dtype=torch.long,
+            device=self.emb_layer.weight.device)
+
+        self.indices_gpu: torch.Tensor
+        self.embedding_indices_gpu: torch.Tensor
+
+    def reset_prompt_adapter(self, index: int):
+        self.embeddings_tensors[index] = 0
+
+    def set_prompt_adapter(
+        self,
+        index: int,
+        adapter_model: Optional[torch.Tensor],
+    ):
+        self.reset_prompt_adapter(index)
+        if adapter_model is not None:
+            length = adapter_model.shape[0]
+            self.embeddings_tensors[index, :length] = adapter_model
+            self.adapter_lengths[index] = length
+
+    def set_mapping(
+        self,
+        prompt_indices: torch.Tensor,
+        prompt_embedding_indices: torch.Tensor,
+    ):
+        self.indices_gpu = prompt_indices.to(
+            device=self.emb_layer.weight.device)
+        self.embedding_indices_gpu = prompt_embedding_indices.to(
+            device=self.emb_layer.weight.device)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.base_layer(x)
+        if self.embedding_indices_gpu.ndim > 1:
+            valid_mask = self.indices_gpu != -1
+            gathered_embeddings = self.embeddings_tensors[
+                self.embedding_indices_gpu[:, 0],
+                self.embedding_indices_gpu[:, 1]]
+
+            # Update hidden states
+            hidden_states[valid_mask] = gathered_embeddings
+        return hidden_states
\ No newline at end of file
diff --git a/vllm/prompt_adapter/models.py b/vllm/prompt_adapter/models.py
new file mode 100644
index 000000000..93eb3bde6
--- /dev/null
+++ b/vllm/prompt_adapter/models.py
@@ -0,0 +1,355 @@
+import logging
+import math
+from typing import Any, Callable, Dict, List, Optional, Type
+
+import torch
+from torch import nn
+
+from vllm.adapter_commons.models import (AdapterLRUCache, AdapterModel,
+                                         AdapterModelManager)
+from vllm.adapter_commons.utils import (add_adapter, deactivate_adapter,
+                                        get_adapter, list_adapters,
+                                        remove_adapter, set_adapter_mapping)
+from vllm.config import PromptAdapterConfig
+from vllm.prompt_adapter.layers import (
+    VocabParallelEmbeddingWithPromptAdapter)  # yapf: disable
+from vllm.prompt_adapter.layers import PromptAdapterMapping
+
+logger = logging.getLogger(__name__)
+
+_GLOBAL_PROMPT_ADAPTER_ID = 0
+
+
+def get_prompt_adapter_id():
+    global _GLOBAL_PROMPT_ADAPTER_ID
+    _GLOBAL_PROMPT_ADAPTER_ID += 1
+    return _GLOBAL_PROMPT_ADAPTER_ID
+
+
+def convert_to_embedding_indices(indices):
+    embedding_indices = []
+    count = 0
+
+    for value in indices:
+        if value == -1:
+            count = 0
+        else:
+            embedding_indices.append([value, count])
+            count += 1
+
+    return torch.tensor(embedding_indices)
+
+
+def convert_mapping(
+    mapping: PromptAdapterMapping,
+    prompt_adapter_index_to_id: List[Optional[int]],
+) -> torch.Tensor:
+    """Converts PromptAdapterMapping to index tensors.
+
+    Args:
+        mapping: PromptAdapterMapping mapping rows in a 
+                batch to PromptAdapter ids.
+        prompt_adapter_index_to_id: List mapping PromptAdapter 
+                ids to PromptAdapter indices.
+        
+    Returns:
+        pa_indices: Tensor of shape [batch_size] mapping batch rows to
+            PromptAdapter indices.
+    """
+    id_to_index = {
+        id_: idx
+        for idx, id_ in enumerate(prompt_adapter_index_to_id)
+        if id_ is not None
+    }
+    pa_indices = ([
+        id_to_index.get(id_, -1) if id_ > 0 else -1
+        for id_ in mapping.index_mapping
+    ])
+
+    pa_embedding_mapping = convert_to_embedding_indices(pa_indices)
+    pa_indices = torch.tensor(pa_indices)
+    return pa_indices, pa_embedding_mapping
+
+
+class PromptAdapterModel(AdapterModel):
+
+    def __init__(self,
+                 prompt_adapter_id=None,
+                 num_virtual_tokens=None,
+                 prompt_embedding=None) -> None:
+        self.id = prompt_adapter_id
+        self.prompt_embedding = prompt_embedding
+        self.num_virtual_tokens = num_virtual_tokens
+
+    @classmethod
+    def from_local_checkpoint(
+        cls,
+        adapter_model_path: str,
+        prompt_adapter_id: int,
+        num_virtual_tokens: int,
+        config: PromptAdapterConfig,
+        device: str = "cuda",
+    ) -> "PromptAdapterModel":
+        from peft.utils import load_peft_weights
+
+        if num_virtual_tokens > config.max_prompt_adapter_token:
+            raise ValueError(
+                f'num_virtual_tokens ({num_virtual_tokens}) should be <= '
+                f'max_prompt_adapter_token({config.max_prompt_adapter_token})')
+
+        adapters_weights = load_peft_weights(adapter_model_path, device)
+        prompt_embedding = adapters_weights["prompt_embeddings"].to(
+            config.prompt_adapter_dtype)
+
+        return cls(prompt_adapter_id, num_virtual_tokens, prompt_embedding)
+
+
+class PromptAdapterModelManager(AdapterModelManager):
+    """A manager that manages multiple Prompt Adapter models."""
+
+    def __init__(
+        self,
+        model: nn.Module,
+        max_num_seqs: int,
+        max_num_batched_tokens: int,
+        prompt_adapter_config: PromptAdapterConfig,
+    ):
+        """Create a PromptAdapterModel and adapter for a given model.
+
+        Args:
+            model: the model to be adapted.
+            max_num_seqs: the maximum number of sequences model can run in a
+                single batch.
+            max_num_batched_tokens: the maximum number of tokens model can run
+                in a single batch.
+            prompt_adapter_config: the PromptAdapter config,
+        """
+        self.model: nn.Module = model
+        # Dict instead of a Set for compatibility with LRUCache.
+        self.prompt_adapter_index_to_id: List[
+            Optional[int]] = [None] * self.prompt_adapter_slots
+        self.max_num_seqs = max_num_seqs
+        self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8
+        self.prompt_adapter_config = prompt_adapter_config
+        self.model.prompt_adapter_manager = self
+        self.adapter_type = 'PromptAdapter'
+
+        self.base_indices = torch.tensor([-1])
+        self.base_embedding_indices = torch.tensor([])
+
+        self.modules: Dict[str, nn.Module] = {}
+        self._create_prompt_adapter_modules()
+        self._last_mapping: Optional[PromptAdapterMapping] = None
+
+    @property
+    def prompt_adapter_slots(self) -> int:
+        return self.prompt_adapter_config.max_prompt_adapters
+
+    @property
+    def adapter_slots(self) -> int:
+        return self.prompt_adapter_slots
+
+    @property
+    def capacity(self) -> int:
+        return self.prompt_adapter_config.max_cpu_prompt_adapters
+
+    def activate_adapter(
+        self,
+        prompt_adapter_id: int,
+    ) -> bool:
+        """Move PromptAdapter into a GPU buffer 
+            to be used in the forward pass."""
+        if prompt_adapter_id in self._active_adapters:
+            return False
+        first_free_slot = next(
+            ((i, prompt_adapter_id) for i, prompt_adapter_id in enumerate(
+                self.prompt_adapter_index_to_id) if prompt_adapter_id is None),
+            None)
+        if first_free_slot is None:
+            raise ValueError("No free prompt_adapter slots")
+        index, _ = first_free_slot
+        self._active_adapters[prompt_adapter_id] = None
+        prompt_adapter_model = (self._registered_adapters[prompt_adapter_id])
+        logger.debug("Activating prompt_adapter. int id: %d, slot index: %d",
+                     prompt_adapter_model.id, index)
+        self.prompt_adapter_index_to_id[index] = prompt_adapter_model.id
+        for _, v in self.modules.items():
+            v.set_prompt_adapter(index, prompt_adapter_model.prompt_embedding)
+        return True
+
+    def _deactivate_adapter(self, prompt_adapter_id: int):
+        try:
+            index = self.prompt_adapter_index_to_id.index(prompt_adapter_id)
+            self.prompt_adapter_index_to_id[index] = None
+            for _, v in self.modules.items():
+                v.reset_prompt_adapter(index)
+        except ValueError:
+            pass
+
+    def _add_adapter(self, prompt_adapter: PromptAdapterModel):
+        self._registered_adapters[prompt_adapter.id] = prompt_adapter
+
+    def _set_adapter_mapping(self, mapping: PromptAdapterMapping) -> None:
+        base_indices, base_embedding_indices = convert_mapping(
+            mapping, self.prompt_adapter_index_to_id)
+        for k, v in self.modules.items():
+            v.set_mapping(base_indices, base_embedding_indices)
+
+    def _create_prompt_adapter_modules(self):
+        for module_name, module in self.model.named_modules(
+                remove_duplicate=False):
+            if "VocabParallel" in module.__class__.__name__:
+                new_module = VocabParallelEmbeddingWithPromptAdapter(module)
+                new_module.create_prompt_adapter_weights(
+                    self.prompt_adapter_config)
+                replaced_module = self.replace_submodule(
+                    self.model, module_name, new_module)
+                self.register_module(module.__class__.__name__,
+                                     replaced_module)
+                replaced_module.set_mapping(self.base_indices,
+                                            self.base_embedding_indices)
+                break
+
+    def replace_submodule(self, model: nn.Module, module_name: str,
+                          new_module: nn.Module) -> nn.Module:
+        """Replace a submodule in a model with a new module."""
+        parent = model.get_submodule(".".join(module_name.split(".")[:-1]))
+        target_name = module_name.split(".")[-1]
+        setattr(parent, target_name, new_module)
+        return new_module
+
+    def register_module(self, module_name: str, module: nn.Module):
+        self.modules[module_name] = module
+
+    def pin_adapter(self, prompt_adapter_id: int) -> bool:
+        """Pin a PromptAdapterModel in the manager cache."""
+        raise NotImplementedError(
+            "Pinning is not supported in PromptAdapterModelManager."
+            "Use LRUCachePromptAdapterModelManager for pinning"
+        )  # type: ignore
+
+    def remove_all_adapters(self):
+        """Remove all PromptAdapterModel from the manager."""
+        self._registered_adapters.clear()
+        self.prompt_adapter_index_to_id = [None] * self.prompt_adapter_slots
+        self._active_adapters.clear()
+
+    def deactivate_adapter(self, adapter_id: int) -> bool:
+        return deactivate_adapter(adapter_id, self._active_adapters,
+                                  self._deactivate_adapter)
+
+    def add_adapter(self, adapter: PromptAdapterModel) -> bool:
+        return add_adapter(adapter, self._registered_adapters, self.capacity,
+                           self._add_adapter)
+
+    def set_adapter_mapping(self, mapping: PromptAdapterMapping) -> None:
+        self._last_mapping = set_adapter_mapping(mapping, self._last_mapping,
+                                                 self._set_adapter_mapping)
+
+    def remove_adapter(self, adapter_id: int) -> bool:
+        return remove_adapter(adapter_id, self._registered_adapters,
+                              self.deactivate_adapter)
+
+    def list_adapters(self) -> Dict[int, Any]:
+        return list_adapters(self._registered_adapters)
+
+    def get_adapter(self, adapter_id: int) -> Optional[Any]:
+        return get_adapter(adapter_id, self._registered_adapters)
+
+
+class PromptAdapterLRUCache(AdapterLRUCache[PromptAdapterModel]):
+
+    def __init__(self, capacity: int,
+                 deactivate_prompt_adapter_fn: Callable[[int], bool]):
+        super().__init__(capacity, deactivate_prompt_adapter_fn)
+
+
+class LRUCachePromptAdapterModelManager(PromptAdapterModelManager):
+    """A model manager that manages multiple prompt_adapters with LRU cache."""
+
+    def __init__(
+        self,
+        model: nn.Module,
+        max_num_seqs: int,
+        max_num_batched_tokens: int,
+        prompt_adapter_config: PromptAdapterConfig,
+    ):
+        self.prompt_adapter_config = prompt_adapter_config
+        super().__init__(model, max_num_seqs, max_num_batched_tokens,
+                         prompt_adapter_config)
+        self._registered_adapters = PromptAdapterLRUCache(
+            self.capacity, self.deactivate_adapter)
+        self._active_adapters = PromptAdapterLRUCache(
+            self.prompt_adapter_slots, self._deactivate_adapter)
+
+    def list_adapters(self) -> Dict[int, PromptAdapterModel]:
+        """List all registered PromptAdapterModel."""
+        return dict(self._registered_adapters.cache)
+
+    def add_adapter(self, prompt_adapter: PromptAdapterModel) -> bool:
+        """Add a PromptAdapterModel to the manager."""
+        if prompt_adapter.id not in self._registered_adapters:
+            self._add_adapter(prompt_adapter)
+            was_added = True
+        else:
+            # We always touch to update the LRU cache order
+            self._registered_adapters.touch(prompt_adapter.id)
+            was_added = False
+        return was_added
+
+    def activate_adapter(
+        self,
+        prompt_adapter_id: int,
+    ) -> bool:
+        if prompt_adapter_id not in self._active_adapters and len(
+                self._active_adapters) >= self.prompt_adapter_slots:
+            self._active_adapters.remove_oldest()
+        result = super().activate_adapter(prompt_adapter_id)
+        # We always touch to update the LRU cache order
+        self._active_adapters.touch(prompt_adapter_id)
+        return result
+
+    def remove_oldest_adapter(self) -> bool:
+        if len(self._registered_adapters) > 0:
+            self._registered_adapters.remove_oldest()
+            return True
+        return False
+
+    def pin_adapter(self, prompt_adapter_id: int) -> bool:
+        """Pin a PromptAdapterModel in the manager cache."""
+        self._pin_prompt_adapter_in_cpu_cache(prompt_adapter_id)
+        self._pin_prompt_adapter_in_gpu_cache(prompt_adapter_id)
+        return True
+
+    def _pin_prompt_adapter_in_cpu_cache(self, prompt_adapter_id: int):
+        try:
+            self._registered_adapters.pin(prompt_adapter_id)
+        except ValueError as err:
+            raise ValueError(
+                "Pinning failed. "
+                f"Prompt Adapter {prompt_adapter_id} is not registered."
+            ) from err
+
+    def _pin_prompt_adapter_in_gpu_cache(self, prompt_adapter_id: int):
+        if prompt_adapter_id not in self._active_adapters:
+            # move adapter to gpu if not already active
+            self.activate_adapter(prompt_adapter_id)
+        self._active_adapters.pin(prompt_adapter_id)
+
+
+def create_prompt_adapter_manager(
+        model: nn.Module,
+        max_num_seqs: int,
+        max_num_batched_tokens: int,
+        prompt_adapter_config: PromptAdapterConfig,
+        prompt_adapter_manager_cls: Type[
+            PromptAdapterModelManager] = PromptAdapterModelManager,
+        **kwargs) -> PromptAdapterModelManager:
+    """Create a PromptAdapterModel for a given model."""
+    prompt_adapter_manager = prompt_adapter_manager_cls(
+        model=model,
+        max_num_seqs=max_num_seqs,
+        max_num_batched_tokens=max_num_batched_tokens,
+        prompt_adapter_config=prompt_adapter_config,
+        **kwargs)
+    return prompt_adapter_manager
diff --git a/vllm/prompt_adapter/request.py b/vllm/prompt_adapter/request.py
new file mode 100644
index 000000000..c0c98cf72
--- /dev/null
+++ b/vllm/prompt_adapter/request.py
@@ -0,0 +1,30 @@
+from dataclasses import dataclass
+
+from vllm.adapter_commons.request import AdapterRequest
+
+
+@dataclass
+class PromptAdapterRequest(AdapterRequest):
+    """
+    Request for a Prompt adapter.
+    """
+
+    prompt_adapter_name: str
+    prompt_adapter_id: int
+    prompt_adapter_local_path: str
+    prompt_adapter_num_virtual_tokens: int
+
+    def __hash__(self):
+        return super().__hash__()
+
+    @property
+    def adapter_id(self):
+        return self.prompt_adapter_id
+
+    @property
+    def name(self):
+        return self.prompt_adapter_name
+
+    @property
+    def local_path(self):
+        return self.prompt_adapter_local_path
diff --git a/vllm/prompt_adapter/worker_manager.py b/vllm/prompt_adapter/worker_manager.py
new file mode 100644
index 000000000..ddc1ef893
--- /dev/null
+++ b/vllm/prompt_adapter/worker_manager.py
@@ -0,0 +1,176 @@
+import logging
+from typing import Any, Optional, Set, Type
+
+import torch
+
+from vllm.adapter_commons.utils import (add_adapter_worker,
+                                        apply_adapters_worker,
+                                        list_adapters_worker,
+                                        set_active_adapters_worker)
+from vllm.adapter_commons.worker_manager import AbstractWorkerManager
+from vllm.config import PromptAdapterConfig
+from vllm.prompt_adapter.models import (LRUCachePromptAdapterModelManager,
+                                        PromptAdapterModel,
+                                        PromptAdapterModelManager,
+                                        create_prompt_adapter_manager)
+from vllm.prompt_adapter.request import PromptAdapterRequest
+
+logger = logging.getLogger(__name__)
+
+
+class WorkerPromptAdapterManager(AbstractWorkerManager):
+    """WorkerPromptAdapterManager that manages 
+    prompt_adapter models on the worker side.
+
+    Every request, the requested prompt_adapters will be 
+    loaded (unless they are already loaded), 
+    and every other prompt_adapter will be unloaded."""
+
+    _manager_cls: Type[PromptAdapterModelManager] = PromptAdapterModelManager
+
+    def __init__(
+        self,
+        max_num_seqs: int,
+        max_num_batched_tokens: int,
+        device: torch.device,
+        prompt_adapter_config: PromptAdapterConfig,
+        prompt_adapter_model_cls: Type[PromptAdapterModel] = PromptAdapterModel
+    ):
+        self._adapter_manager: PromptAdapterModelManager
+        self.max_num_seqs = max_num_seqs
+        self.max_num_batched_tokens = max_num_batched_tokens
+        self._prompt_adapter_model_cls = prompt_adapter_model_cls
+        self.prompt_adapter_config = prompt_adapter_config
+        super().__init__(device)
+
+    @property
+    def is_enabled(self) -> bool:
+        return True
+
+    def create_prompt_adapter_manager(
+        self,
+        model: torch.nn.Module,
+    ) -> Any:
+        prompt_adapter_manager = create_prompt_adapter_manager(
+            model,
+            max_num_seqs=self.max_num_seqs,
+            max_num_batched_tokens=self.max_num_batched_tokens,
+            prompt_adapter_config=self.prompt_adapter_config,
+            prompt_adapter_manager_cls=self._manager_cls,
+        )
+        self._adapter_manager = prompt_adapter_manager
+        return prompt_adapter_manager.model
+
+    def _load_adapter(
+            self, prompt_adapter_request: PromptAdapterRequest
+    ) -> PromptAdapterModel:
+        try:
+            prompt_adapter = (
+                self._prompt_adapter_model_cls.from_local_checkpoint(
+                    prompt_adapter_request.prompt_adapter_local_path,
+                    prompt_adapter_id=prompt_adapter_request.prompt_adapter_id,
+                    num_virtual_tokens=prompt_adapter_request.
+                    prompt_adapter_num_virtual_tokens,
+                    config=self.prompt_adapter_config,
+                    device=str(self.device),
+                ))
+        except Exception as e:
+            raise RuntimeError(
+                f"Loading prompt_adapter "
+                f"{prompt_adapter_request.prompt_adapter_local_path}"
+                f" failed") from e
+        return prompt_adapter
+
+    def add_dummy_prompt_adapter(
+            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
+        return True
+
+    def pin_adapter(self, adapter_id: int) -> bool:
+        return self._adapter_manager.pin_adapter(adapter_id)
+
+    def set_active_adapters(self, requests: Set[Any],
+                            mapping: Optional[Any]) -> None:
+        set_active_adapters_worker(requests, mapping, self._apply_adapters,
+                                   self._adapter_manager.set_adapter_mapping)
+
+    def add_adapter(self, adapter_request: Any) -> bool:
+        return add_adapter_worker(adapter_request, self.list_adapters,
+                                  self._load_adapter,
+                                  self._adapter_manager.add_adapter,
+                                  self._adapter_manager.activate_adapter)
+
+    def _apply_adapters(self, adapter_requests: Set[Any]) -> None:
+        apply_adapters_worker(adapter_requests, self.list_adapters,
+                              self._adapter_manager.adapter_slots,
+                              self.remove_adapter, self.add_adapter)
+
+    def remove_adapter(self, adapter_id: int) -> bool:
+        return self._adapter_manager.remove_adapter(adapter_id)
+
+    def remove_all_adapters(self):
+        self._adapter_manager.remove_all_adapters()
+
+    def list_adapters(self) -> Set[int]:
+        return list_adapters_worker(self._adapter_manager.list_adapters)
+
+
+class LRUCacheWorkerPromptAdapterManager(WorkerPromptAdapterManager):
+    """WorkerPromptAdapterManager that manages 
+    prompt_adapter models on the worker side.
+
+    Uses an LRU Cache. Every request, the requested 
+    prompt_adapters will be loaded (unless they are already loaded) 
+    and least recently used prompt_adapters will
+    be unloaded if the cache is above capacity."""
+
+    _prompt_adapter_manager_cls: Type[
+        LRUCachePromptAdapterModelManager] = LRUCachePromptAdapterModelManager
+
+    def create_prompt_adapter_manager(
+        self,
+        model: torch.nn.Module,
+    ) -> Any:
+        prompt_adapter_manager = create_prompt_adapter_manager(
+            model,
+            max_num_seqs=self.max_num_seqs,
+            max_num_batched_tokens=self.max_num_batched_tokens,
+            prompt_adapter_config=self.prompt_adapter_config,
+            prompt_adapter_manager_cls=self._prompt_adapter_manager_cls)
+        self._adapter_manager: LRUCachePromptAdapterModelManager = (
+            prompt_adapter_manager)
+        return prompt_adapter_manager.model
+
+    def _apply_adapters(
+            self, prompt_adapter_requests: Set[PromptAdapterRequest]) -> None:
+        prompt_adapters_map = {
+            prompt_adapter_request.prompt_adapter_id: prompt_adapter_request
+            for prompt_adapter_request in prompt_adapter_requests
+            if prompt_adapter_request
+        }
+        if len(prompt_adapters_map
+               ) > self._adapter_manager.prompt_adapter_slots:
+            raise RuntimeError(
+                f"Number of requested prompt_adapters "
+                f"({len(prompt_adapters_map)}) is greater "
+                "than the number of GPU prompt_adapter slots "
+                f"({self._adapter_manager.prompt_adapter_slots}).")
+        for prompt_adapter in prompt_adapters_map.values():
+            self.add_adapter(prompt_adapter)
+
+    def add_adapter(self,
+                    prompt_adapter_request: PromptAdapterRequest) -> bool:
+        if prompt_adapter_request.prompt_adapter_id not in self.list_adapters(
+        ):
+            # Remove before we load the new prompt_adapter to save memory
+            if len(self._adapter_manager) + 1 > self._adapter_manager.capacity:
+                self._adapter_manager.remove_oldest_adapter()
+            prompt_adapter = self._load_adapter(prompt_adapter_request)
+            loaded = self._adapter_manager.add_adapter(prompt_adapter)
+        else:
+            # If the prompt_adapter is already loaded, just touch it to
+            # update its position in the caches
+            loaded = self._adapter_manager.get_adapter(
+                prompt_adapter_request.prompt_adapter_id) is not None
+        self._adapter_manager.activate_adapter(
+            prompt_adapter_request.prompt_adapter_id)
+        return loaded
diff --git a/vllm/sequence.py b/vllm/sequence.py
index d200115aa..a3f998b94 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -10,6 +10,7 @@ import torch
 
 from vllm.lora.request import LoRARequest
 from vllm.pooling_params import PoolingParams
+from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 
 if TYPE_CHECKING:
@@ -238,21 +239,25 @@ class Sequence:
         block_size: The block size of the sequence. Should be the same as the
             block size used by the block manager and cache engine.
         lora_request: LoRA request.
+        prompt_adapter_request: Prompt Adapter request.
+
     """
 
     def __init__(
-        self,
-        seq_id: int,
-        inputs: "LLMInputs",
-        block_size: int,
-        eos_token_id: Optional[int] = None,
-        lora_request: Optional[LoRARequest] = None,
+            self,
+            seq_id: int,
+            inputs: "LLMInputs",
+            block_size: int,
+            eos_token_id: Optional[int] = None,
+            lora_request: Optional[LoRARequest] = None,
+            prompt_adapter_request: Optional[PromptAdapterRequest] = None
     ) -> None:
         self.seq_id = seq_id
         self.inputs = inputs
         self.block_size = block_size
         self.eos_token_id = eos_token_id
         self.lora_request = lora_request
+        self.prompt_adapter_request = prompt_adapter_request
 
         self.data = SequenceData(self.prompt_token_ids)
         self.output_logprobs: SampleLogprobs = []
@@ -287,6 +292,11 @@ class Sequence:
     def lora_int_id(self) -> int:
         return self.lora_request.lora_int_id if self.lora_request else 0
 
+    @property
+    def prompt_adapter_id(self) -> int:
+        return self.prompt_adapter_request.prompt_adapter_id \
+                        if self.prompt_adapter_request else 0
+
     def get_output_text_to_return(self, buffer_length: int):
         # We return the full output text if the sequence is finished.
         truncate = buffer_length and not self.is_finished()
@@ -414,6 +424,7 @@ class SequenceGroup:
         encoder_seq: Optional, the single encoder sequence. Should be None
                      unless you are working with an encoder/decoder model.
         trace_headers: OpenTelemetry trace headers.
+        prompt_adapter_request: Prompt Adapter request.
     """
 
     def __init__(
@@ -427,6 +438,7 @@ class SequenceGroup:
         pooling_params: Optional[PoolingParams] = None,
         encoder_seq: Optional[Sequence] = None,
         trace_headers: Optional[Dict[str, str]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> None:
         self.request_id = request_id
         self.seqs_dict = {seq.seq_id: seq for seq in seqs}
@@ -441,6 +453,7 @@ class SequenceGroup:
         self.state = SequenceGroupState()
         self.embeddings = embeddings
         self.pooling_params = pooling_params
+        self.prompt_adapter_request = prompt_adapter_request
         self.encoder_seq = encoder_seq
         self.trace_headers = trace_headers
 
@@ -466,6 +479,16 @@ class SequenceGroup:
     def lora_int_id(self) -> int:
         return self.lora_request.lora_int_id if self.lora_request else 0
 
+    @property
+    def prompt_adapter_id(self) -> int:
+        return self.prompt_adapter_request.prompt_adapter_id \
+                        if self.prompt_adapter_request else 0
+
+    @property
+    def prompt_adapter_num_virtual_tokens(self) -> int:
+        return self.prompt_adapter_request.prompt_adapter_num_virtual_tokens\
+                         if self.prompt_adapter_request else 0
+
     def get_last_latency(self, now: float) -> Optional[float]:
         """Sets the last token time for Request level timings."""
         # If still in prefill phase, raise Error.
@@ -624,6 +647,7 @@ class SequenceGroupMetadata:
                            (SequenceGroup.encoder_seq). Should be None
                            unless you are working with an encoder/decoder
                            model.
+        prompt_adapter_request: Prompt Adapter request.
     """
 
     def __init__(
@@ -642,6 +666,7 @@ class SequenceGroupMetadata:
         multi_modal_data: Optional["MultiModalDataDict"] = None,
         encoder_seq_data: Optional[SequenceData] = None,
         cross_block_table: Optional[List[int]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> None:
         self.request_id = request_id
         self.is_prompt = is_prompt
@@ -650,6 +675,7 @@ class SequenceGroupMetadata:
         self.block_tables = block_tables
         self.pooling_params = pooling_params
         self.lora_request = lora_request
+        self.prompt_adapter_request = prompt_adapter_request
         self.computed_block_nums = computed_block_nums
         self.multi_modal_data = multi_modal_data
         self.state = SequenceGroupState() if state is None else state
@@ -674,6 +700,16 @@ class SequenceGroupMetadata:
     def lora_int_id(self) -> int:
         return self.lora_request.lora_int_id if self.lora_request else 0
 
+    @property
+    def prompt_adapter_id(self) -> int:
+        return self.prompt_adapter_request.prompt_adapter_id \
+                        if self.prompt_adapter_request else 0
+
+    @property
+    def prompt_adapter_num_virtual_tokens(self) -> int:
+        return self.prompt_adapter_request.prompt_adapter_num_virtual_tokens \
+                        if self.prompt_adapter_request else 0
+
     @property
     def token_chunk_size(self) -> int:
         """Return the number of tokens to be processed (chunk size)."""
diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py
index 6a2cfc819..90bba96ee 100644
--- a/vllm/spec_decode/draft_model_runner.py
+++ b/vllm/spec_decode/draft_model_runner.py
@@ -4,7 +4,7 @@ import torch
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, MultiModalConfig, ParallelConfig,
-                         SchedulerConfig)
+                         PromptAdapterConfig, SchedulerConfig)
 from vllm.logger import init_logger
 from vllm.sequence import (IntermediateTensors, SamplerOutput,
                            SequenceGroupMetadata)
@@ -48,6 +48,7 @@ class TP1DraftModelRunner(ModelRunner):
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
         multimodal_config: Optional[MultiModalConfig] = None,
+        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
         return_hidden_states: bool = False,
     ):
         if return_hidden_states:
@@ -66,6 +67,7 @@ class TP1DraftModelRunner(ModelRunner):
             kv_cache_dtype=kv_cache_dtype,
             is_driver_worker=is_driver_worker,
             multimodal_config=multimodal_config,
+            prompt_adapter_config=prompt_adapter_config,
             return_hidden_states=return_hidden_states,
         )
 
@@ -136,6 +138,13 @@ class TP1DraftModelRunner(ModelRunner):
             self.set_active_loras(model_input.lora_requests,
                                   model_input.lora_mapping)
 
+        if self.prompt_adapter_config:
+            assert model_input.prompt_adapter_requests is not None
+            assert model_input.prompt_adapter_mapping is not None
+            self.set_active_prompt_adapters(
+                model_input.prompt_adapter_requests,
+                model_input.prompt_adapter_mapping)
+
         virtual_engine = model_input.virtual_engine
         outputs: List[SamplerOutput] = []
         for step in range(num_steps):
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index b4277ae82..db0e178e4 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -8,7 +8,7 @@ from torch import nn
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, MultiModalConfig, ParallelConfig,
-                         SchedulerConfig)
+                         PromptAdapterConfig, SchedulerConfig)
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
@@ -81,6 +81,7 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
         lora_config: Optional[LoRAConfig],
         multimodal_config: Optional[MultiModalConfig],
         kv_cache_dtype: Optional[str] = "auto",
+        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
         is_driver_worker: bool = False,
         *args,
         **kwargs,
@@ -94,6 +95,7 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
         self.cache_config = cache_config
         self.lora_config = lora_config
         self.multimodal_config = multimodal_config
+        self.prompt_adapter_config = prompt_adapter_config
         self.load_config = load_config
         self.is_driver_worker = is_driver_worker
 
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 657505739..3c22c7326 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -7,7 +7,7 @@ import torch.distributed
 from vllm.attention import get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, MultiModalConfig, ParallelConfig,
-                         SchedulerConfig)
+                         PromptAdapterConfig, SchedulerConfig)
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.logger import init_logger
@@ -133,6 +133,7 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         lora_config: Optional[LoRAConfig] = None,
         multimodal_config: Optional[MultiModalConfig] = None,
         kv_cache_dtype: Optional[str] = "auto",
+        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
         is_driver_worker: bool = False,
     ) -> None:
         self.model_config = model_config
@@ -145,6 +146,7 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         self.rank = rank
         self.distributed_init_method = distributed_init_method
         self.lora_config = lora_config
+        self.prompt_adapter_config = prompt_adapter_config
         self.multimodal_config = multimodal_config
         self.is_driver_worker = is_driver_worker
         if self.is_driver_worker:
@@ -167,6 +169,7 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
             lora_config=self.lora_config,
             multimodal_config=self.multimodal_config,
             kv_cache_dtype=kv_cache_dtype,
+            prompt_adapter_config=self.prompt_adapter_config,
             is_driver_worker=is_driver_worker)
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.
diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py
index a3b31a1c0..a333e6634 100644
--- a/vllm/worker/embedding_model_runner.py
+++ b/vllm/worker/embedding_model_runner.py
@@ -5,7 +5,7 @@ import torch
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, MultiModalConfig, ParallelConfig,
-                         SchedulerConfig)
+                         PromptAdapterConfig, SchedulerConfig)
 from vllm.logger import init_logger
 from vllm.model_executor.pooling_metadata import PoolingMetadata
 from vllm.pooling_params import PoolingParams
@@ -40,6 +40,7 @@ class EmbeddingModelRunner(
         lora_config: Optional[LoRAConfig],
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
+        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
         multimodal_config: Optional[MultiModalConfig] = None,
     ):
         super().__init__(model_config,
@@ -51,6 +52,7 @@ class EmbeddingModelRunner(
                          lora_config=lora_config,
                          kv_cache_dtype=kv_cache_dtype,
                          is_driver_worker=is_driver_worker,
+                         prompt_adapter_config=prompt_adapter_config,
                          multimodal_config=multimodal_config)
 
     @torch.inference_mode()
@@ -71,6 +73,13 @@ class EmbeddingModelRunner(
             self.set_active_loras(model_input.lora_requests,
                                   model_input.lora_mapping)
 
+        if self.prompt_adapter_config:
+            assert model_input.prompt_adapter_requests is not None
+            assert model_input.prompt_adapter_mapping is not None
+            self.set_active_prompt_adapters(
+                model_input.prompt_adapter_requests,
+                model_input.prompt_adapter_mapping)
+
         # Currently cuda graph is only supported by the decode phase.
         assert model_input.attn_metadata is not None
         prefill_meta = model_input.attn_metadata.prefill_metadata
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index d0c82d6bb..205b4f58f 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -25,7 +25,7 @@ except ImportError:
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, MultiModalConfig, ParallelConfig,
-                         SchedulerConfig)
+                         PromptAdapterConfig, SchedulerConfig)
 from vllm.distributed import get_pp_group
 from vllm.distributed.parallel_state import graph_capture
 from vllm.inputs import INPUT_REGISTRY
@@ -40,6 +40,10 @@ from vllm.model_executor.models.interfaces import (supports_lora,
                                                    supports_vision)
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
                              MultiModalInputs)
+from vllm.prompt_adapter.layers import PromptAdapterMapping
+from vllm.prompt_adapter.request import PromptAdapterRequest
+from vllm.prompt_adapter.worker_manager import (
+    LRUCacheWorkerPromptAdapterManager)
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (IntermediateTensors, SamplerOutput,
                            SequenceGroupMetadata)
@@ -85,6 +89,8 @@ class ModelInputForGPU(ModelRunnerInputBase):
     lora_mapping: Optional["LoRAMapping"] = None
     lora_requests: Optional[Set[LoRARequest]] = None
     attn_metadata: Optional["AttentionMetadata"] = None
+    prompt_adapter_mapping: Optional[PromptAdapterMapping] = None
+    prompt_adapter_requests: Optional[Set[PromptAdapterRequest]] = None
     multi_modal_kwargs: Optional[Mapping[str, BatchedTensors]] = None
     request_ids_to_seq_ids: Optional[Dict[str, List[int]]] = None
     finished_requests_ids: Optional[List[str]] = None
@@ -97,6 +103,8 @@ class ModelInputForGPU(ModelRunnerInputBase):
             "lora_requests": self.lora_requests,
             "lora_mapping": self.lora_mapping,
             "multi_modal_kwargs": self.multi_modal_kwargs,
+            "prompt_adapter_mapping": self.prompt_adapter_mapping,
+            "prompt_adapter_requests": self.prompt_adapter_requests,
             "virtual_engine": self.virtual_engine,
             "request_ids_to_seq_ids": self.request_ids_to_seq_ids,
             "finished_requests_ids": self.finished_requests_ids,
@@ -133,6 +141,8 @@ class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU):
             "lora_requests": self.lora_requests,
             "lora_mapping": self.lora_mapping,
             "multi_modal_kwargs": self.multi_modal_kwargs,
+            "prompt_adapter_mapping": self.prompt_adapter_mapping,
+            "prompt_adapter_requests": self.prompt_adapter_requests,
             "virtual_engine": self.virtual_engine,
             "request_ids_to_seq_ids": self.request_ids_to_seq_ids,
             "finished_requests_ids": self.finished_requests_ids,
@@ -172,6 +182,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         lora_config: Optional[LoRAConfig],
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
+        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
         multimodal_config: Optional[MultiModalConfig] = None,
         return_hidden_states: bool = False,
     ):
@@ -183,6 +194,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         self.lora_config = lora_config
         self.load_config = load_config
         self.is_driver_worker = is_driver_worker
+        self.prompt_adapter_config = prompt_adapter_config
         self.multimodal_config = multimodal_config
         self.return_hidden_states = return_hidden_states
 
@@ -232,6 +244,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         self.model: nn.Module  # Set after load_model
         # Set after load_model.
         self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None
+        self.prompt_adapter_manager: LRUCacheWorkerPromptAdapterManager = None
 
         self.flashinfer_decode_workspace_buffer = None
         self.flashinfer_decode_wrapper = None
@@ -240,16 +253,14 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
 
     def load_model(self) -> None:
         with CudaMemoryProfiler() as m:
-            self.model = get_model(
-                model_config=self.model_config,
-                device_config=self.device_config,
-                load_config=self.load_config,
-                lora_config=self.lora_config,
-                multimodal_config=self.multimodal_config,
-                parallel_config=self.parallel_config,
-                scheduler_config=self.scheduler_config,
-                cache_config=self.cache_config,
-            )
+            self.model = get_model(model_config=self.model_config,
+                                   device_config=self.device_config,
+                                   load_config=self.load_config,
+                                   lora_config=self.lora_config,
+                                   multimodal_config=self.multimodal_config,
+                                   parallel_config=self.parallel_config,
+                                   scheduler_config=self.scheduler_config,
+                                   cache_config=self.cache_config)
 
         self.model_memory_usage = m.consumed_memory
         logger.info("Loading model weights took %.4f GB",
@@ -274,6 +285,15 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
             )
             self.model = self.lora_manager.create_lora_manager(self.model)
 
+        if self.prompt_adapter_config:
+            self.prompt_adapter_manager = LRUCacheWorkerPromptAdapterManager(
+                self.scheduler_config.max_num_seqs,
+                self.scheduler_config.max_num_batched_tokens, self.device,
+                self.prompt_adapter_config)
+            self.model = (
+                self.prompt_adapter_manager.create_prompt_adapter_manager(
+                    self.model))
+
         if self.kv_cache_dtype == "fp8" and is_hip():
             # Currently only ROCm accepts kv-cache scaling factors
             # via quantization_param_path and this will be deprecated
@@ -354,6 +374,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         lora_index_mapping: List[int] = []
         lora_prompt_mapping: List[int] = []
         lora_requests: Set[LoRARequest] = set()
+        prompt_adapter_index_mapping: List[int] = []
+        prompt_adapter_prompt_mapping: List[int] = []
+        prompt_adapter_requests: Set[PromptAdapterRequest] = set()
 
         seq_lens: List[int] = []
         prefill_seq_lens: List[int] = []
@@ -504,6 +527,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                 input_tokens.extend(tokens)
                 input_positions.extend(list(range(context_len, seq_len)))
                 lora_id = seq_group_metadata.lora_int_id
+                prompt_adapter_id = seq_group_metadata.prompt_adapter_id
 
                 if is_prompt:
                     assert len(seq_ids) == 1
@@ -534,6 +558,21 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                     mm_kwargs = self.multi_modal_input_mapper(mm_data)
                     multi_modal_inputs_list.append(mm_kwargs)
 
+                if prompt_adapter_id > 0 and is_prompt:
+                    prompt_adapter_requests.add(
+                        seq_group_metadata.prompt_adapter_request)
+
+                    num_tokens = seq_group_metadata.\
+                                            prompt_adapter_num_virtual_tokens
+                    pm = [prompt_adapter_id
+                          ] * num_tokens + [0] * (query_len - num_tokens)
+                    prompt_adapter_index_mapping += pm
+                    prompt_adapter_prompt_mapping.extend(
+                        [prompt_adapter_id] *
+                        (query_len if seq_group_metadata.sampling_params
+                         and seq_group_metadata.sampling_params.prompt_logprobs
+                         else 1))
+
                 is_profile_run = _is_block_tables_empty(
                     seq_group_metadata.block_tables)
                 if is_profile_run:
@@ -618,12 +657,11 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                 seq_lens.append(1)
                 block_tables.append([])
                 lora_index_mapping.append(0)
-
+                prompt_adapter_index_mapping.append(0)
                 if self.attn_backend.get_name() == "flashinfer":
                     last_paged_kv_indptr = paged_kv_indptr[-1]
                     paged_kv_indptr.append(last_paged_kv_indptr)
                     paged_kv_last_page_len.append(0)
-
             batch_size = graph_batch_size
             num_decode_tokens = batch_size
 
@@ -759,6 +797,14 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         else:
             lora_mapping = None
 
+        if self.prompt_adapter_config:
+            prompt_adapter_mapping = PromptAdapterMapping(
+                prompt_adapter_index_mapping,
+                prompt_adapter_prompt_mapping,
+            )
+        else:
+            prompt_adapter_mapping = None
+
         multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
                                                     device=self.device)
         request_ids_to_seq_ids = {
@@ -776,7 +822,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
             lora_requests=lora_requests,
             multi_modal_kwargs=multi_modal_kwargs,
             request_ids_to_seq_ids=request_ids_to_seq_ids,
-            finished_requests_ids=finished_requests_ids)
+            finished_requests_ids=finished_requests_ids,
+            prompt_adapter_mapping=prompt_adapter_mapping,
+            prompt_adapter_requests=prompt_adapter_requests,
+        )
 
     @torch.inference_mode()
     def profile_run(self) -> None:
@@ -878,33 +927,67 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
     def remove_all_loras(self):
         if not self.lora_manager:
             raise RuntimeError("LoRA is not enabled.")
-        self.lora_manager.remove_all_loras()
+        self.lora_manager.remove_all_adapters()
 
     def set_active_loras(self, lora_requests: Set[LoRARequest],
                          lora_mapping: LoRAMapping) -> None:
         if not self.lora_manager:
             raise RuntimeError("LoRA is not enabled.")
-        self.lora_manager.set_active_loras(lora_requests, lora_mapping)
+        self.lora_manager.set_active_adapters(lora_requests, lora_mapping)
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
         if not self.lora_manager:
             raise RuntimeError("LoRA is not enabled.")
-        return self.lora_manager.add_lora(lora_request)
+        return self.lora_manager.add_adapter(lora_request)
 
     def remove_lora(self, lora_id: int) -> bool:
         if not self.lora_manager:
             raise RuntimeError("LoRA is not enabled.")
-        return self.lora_manager.remove_lora(lora_id)
+        return self.lora_manager.remove_adapter(lora_id)
 
     def pin_lora(self, lora_id: int) -> bool:
         if not self.lora_manager:
             raise RuntimeError("LoRA is not enabled.")
-        return self.lora_manager.pin_lora(lora_id)
+        return self.lora_manager.pin_adapter(lora_id)
 
     def list_loras(self) -> Set[int]:
         if not self.lora_manager:
             raise RuntimeError("LoRA is not enabled.")
-        return self.lora_manager.list_loras()
+        return self.lora_manager.list_adapters()
+
+    def remove_all_prompt_adapters(self):
+        if not self.prompt_adapter_manager:
+            raise RuntimeError("PromptAdapter is not enabled.")
+        self.prompt_adapter_manager.remove_all_adapters()
+
+    def set_active_prompt_adapters(
+            self, prompt_adapter_requests: Set[PromptAdapterRequest],
+            prompt_adapter_mapping: PromptAdapterMapping) -> None:
+        if not self.prompt_adapter_manager:
+            raise RuntimeError("PromptAdapter is not enabled.")
+        self.prompt_adapter_manager.set_active_adapters(
+            prompt_adapter_requests, prompt_adapter_mapping)
+
+    def add_prompt_adapter(
+            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
+        if not self.prompt_adapter_manager:
+            raise RuntimeError("PromptAdapter is not enabled.")
+        return self.prompt_adapter_manager.add_adapter(prompt_adapter_request)
+
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        if not self.prompt_adapter_manager:
+            raise RuntimeError("PromptAdapter is not enabled.")
+        return self.prompt_adapter_manager.remove_adapter(prompt_adapter_id)
+
+    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        if not self.prompt_adapter_manager:
+            raise RuntimeError("PromptAdapter is not enabled.")
+        return self.prompt_adapter_manager.pin_adapter(prompt_adapter_id)
+
+    def list_prompt_adapters(self) -> Set[int]:
+        if not self.prompt_adapter_manager:
+            raise RuntimeError("PromptAdapter is not enabled.")
+        return self.prompt_adapter_manager.list_adapters()
 
     @torch.inference_mode()
     def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
@@ -1063,6 +1146,14 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                         )
                         self.set_active_loras(set(), lora_mapping)
 
+                    if self.prompt_adapter_config:
+                        prompt_adapter_mapping = PromptAdapterMapping(
+                            [-1] * batch_size,
+                            [-1] * batch_size,
+                        )
+                        self.set_active_prompt_adapters(
+                            set(), prompt_adapter_mapping)
+
                     graph_runner = CUDAGraphRunner(
                         self.model, self.attn_backend.get_name())
 
@@ -1189,6 +1280,13 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
             self.set_active_loras(model_input.lora_requests,
                                   model_input.lora_mapping)
 
+        if self.prompt_adapter_config:
+            assert model_input.prompt_adapter_requests is not None
+            assert model_input.prompt_adapter_mapping is not None
+            self.set_active_prompt_adapters(
+                model_input.prompt_adapter_requests,
+                model_input.prompt_adapter_mapping)
+
         if self.attn_backend.get_name() == "flashinfer":
             assert model_input.attn_metadata is not None
             assert model_input.input_tokens is not None
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 58707269b..857cd86be 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -8,7 +8,8 @@ import torch.distributed
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, MultiModalConfig, ParallelConfig,
-                         SchedulerConfig, SpeculativeConfig)
+                         PromptAdapterConfig, SchedulerConfig,
+                         SpeculativeConfig)
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment,
                               set_custom_all_reduce)
@@ -16,6 +17,7 @@ from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 from vllm.platforms import current_platform
+from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.embedding_model_runner import EmbeddingModelRunner
@@ -45,6 +47,7 @@ class Worker(LocalOrDistributedWorkerBase):
         lora_config: Optional[LoRAConfig] = None,
         multimodal_config: Optional[MultiModalConfig] = None,
         speculative_config: Optional[SpeculativeConfig] = None,
+        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
         is_driver_worker: bool = False,
         model_runner_cls: Optional[Type[GPUModelRunnerBase]] = None,
     ) -> None:
@@ -59,6 +62,7 @@ class Worker(LocalOrDistributedWorkerBase):
         self.distributed_init_method = distributed_init_method
         self.lora_config = lora_config
         self.load_config = load_config
+        self.prompt_adapter_config = prompt_adapter_config
         self.is_driver_worker = is_driver_worker
         if parallel_config and is_driver_worker:
             assert rank % parallel_config.tensor_parallel_size == 0, \
@@ -92,6 +96,7 @@ class Worker(LocalOrDistributedWorkerBase):
             lora_config=self.lora_config,
             kv_cache_dtype=self.cache_config.cache_dtype,
             is_driver_worker=is_driver_worker,
+            prompt_adapter_config=prompt_adapter_config,
             multimodal_config=multimodal_config,
             **speculative_args,
         )
@@ -296,6 +301,19 @@ class Worker(LocalOrDistributedWorkerBase):
     def list_loras(self) -> Set[int]:
         return self.model_runner.list_loras()
 
+    def add_prompt_adapter(
+            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
+        return self.model_runner.add_prompt_adapter(prompt_adapter_request)
+
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        return self.model_runner.remove_lora(prompt_adapter_id)
+
+    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        return self.model_runner.pin_prompt_adapter(prompt_adapter_id)
+
+    def list_prompt_adapters(self) -> Set[int]:
+        return self.model_runner.list_prompt_adapters()
+
     @property
     def max_model_len(self) -> int:
         return self.model_config.max_model_len
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 03b9cce5a..e03f24fdf 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -8,7 +8,7 @@ import torch.nn as nn
 from vllm.attention import get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, MultiModalConfig, ParallelConfig,
-                         SchedulerConfig)
+                         PromptAdapterConfig, SchedulerConfig)
 from vllm.distributed import broadcast_tensor_dict
 from vllm.inputs import INPUT_REGISTRY
 from vllm.logger import init_logger
@@ -88,6 +88,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
         lora_config: Optional[LoRAConfig],
         multimodal_config: Optional[MultiModalConfig],
         kv_cache_dtype: Optional[str] = "auto",
+        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
         is_driver_worker: bool = False,
         *args,
         **kwargs,
@@ -98,6 +99,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
         self.lora_config = lora_config
         self.load_config = load_config
         self.cache_config = cache_config
+        self.prompt_adapter_config = prompt_adapter_config
         self.multimodal_config = multimodal_config
         self.is_driver_worker = is_driver_worker
 
diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
index 94dfcfec3..6a822c2ba 100644
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -10,7 +10,8 @@ import torch.distributed
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, MultiModalConfig, ParallelConfig,
-                         SchedulerConfig, SpeculativeConfig)
+                         PromptAdapterConfig, SchedulerConfig,
+                         SpeculativeConfig)
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.logger import init_logger
@@ -47,6 +48,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
         lora_config: Optional[LoRAConfig] = None,
         multimodal_config: Optional[MultiModalConfig] = None,
         speculative_config: Optional[SpeculativeConfig] = None,
+        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
         is_driver_worker: bool = False,
     ) -> None:
         assert device_config.device_type == "xpu"
@@ -63,6 +65,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
         self.rank = rank
         self.distributed_init_method = distributed_init_method
         self.lora_config = lora_config
+        self.prompt_adapter_config = prompt_adapter_config
         self.is_driver_worker = is_driver_worker
         if self.is_driver_worker:
             assert self.rank == 0, "The driver worker must have rank 0."
-- 
GitLab


From 673dd4cae9340e78dd5c05843e41c38133aa29a6 Mon Sep 17 00:00:00 2001
From: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com>
Date: Tue, 9 Jul 2024 16:24:58 -0700
Subject: [PATCH 295/376] [Docs] Docs update for Pipeline Parallel (#6222)

Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Co-authored-by: Simon Mo <simon.mo@hey.com>
---
 docs/source/serving/distributed_serving.rst | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst
index 91f64ad2e..3c58ed295 100644
--- a/docs/source/serving/distributed_serving.rst
+++ b/docs/source/serving/distributed_serving.rst
@@ -3,7 +3,7 @@
 Distributed Inference and Serving
 =================================
 
-vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm <https://arxiv.org/pdf/1909.08053.pdf>`_. We manage the distributed runtime with either `Ray <https://github.com/ray-project/ray>`_ or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray.
+vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm <https://arxiv.org/pdf/1909.08053.pdf>`_.  We also support pipeline parallel as a beta feature for online serving. We manage the distributed runtime with either `Ray <https://github.com/ray-project/ray>`_ or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray.
 
 Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured :code:`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the :code:`LLM` class :code:`distributed-executor-backend` argument or :code:`--distributed-executor-backend` API server argument. Set it to :code:`mp` for multiprocessing or :code:`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.
 
@@ -23,6 +23,19 @@ To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument wh
     $     --model facebook/opt-13b \
     $     --tensor-parallel-size 4
 
+You can also additionally specify :code:`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism:
+
+.. code-block:: console
+
+    $ python -m vllm.entrypoints.openai.api_server \
+    $     --model gpt2 \
+    $     --tensor-parallel-size 4 \
+    $     --pipeline-parallel-size 2 \
+    $     --distributed-executor-backend ray
+
+.. note::
+    Pipeline parallel is a beta feature. It is only supported for online serving and the ray backend for now, as well as LLaMa and GPT2 style models.
+
 To scale vLLM beyond a single machine, install and start a `Ray runtime <https://docs.ray.io/en/latest/ray-core/starting-ray.html>`_ via CLI before running vLLM:
 
 .. code-block:: console
@@ -35,7 +48,7 @@ To scale vLLM beyond a single machine, install and start a `Ray runtime <https:/
     $ # On worker nodes
     $ ray start --address=<ray-head-address>
 
-After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines.
+After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` multiplied by :code:`pipeline_parallel_size` to the number of GPUs to be the total number of GPUs across all machines.
 
 .. warning::
     Please make sure you downloaded the model to all the nodes, or the model is downloaded to some distributed file system that is accessible by all nodes.
-- 
GitLab


From d3a245138acb358c7e1e5c5dcf4dcb3c2b48c8ff Mon Sep 17 00:00:00 2001
From: Baoyuan Qi <qibaoyuan@126.com>
Date: Wed, 10 Jul 2024 07:43:24 +0800
Subject: [PATCH 296/376] [Bugfix]fix and needs_scalar_to_array logic check
 (#6238)

Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
---
 vllm/model_executor/layers/linear.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 3cc257834..1dda5d374 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -387,7 +387,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         if loaded_shard_id is None:
             # Loaded weight is already fused on disk (qkv/mlp).
             if output_dim is None:
-                if needs_scalar_to_array is not None:
+                if needs_scalar_to_array:
                     param_data, loaded_weight = adjust_scalar_to_fused_array(
                         param_data, loaded_weight, 0)
 
@@ -549,7 +549,7 @@ class QKVParallelLinear(ColumnParallelLinear):
         if loaded_shard_id is None:
             # Loaded weight is already fused on disk (qkv/mlp).
             if output_dim is None:
-                if needs_scalar_to_array is not None:
+                if needs_scalar_to_array:
                     param_data, loaded_weight = adjust_scalar_to_fused_array(
                         param_data, loaded_weight, 0)
 
-- 
GitLab


From 2416b26e119b9d1932ba30790ecaddfac1ae4143 Mon Sep 17 00:00:00 2001
From: Abhinav Goyal <abhinav.goyal@flipkart.com>
Date: Wed, 10 Jul 2024 07:04:02 +0530
Subject: [PATCH 297/376] [Speculative Decoding] Medusa Implementation with
 Top-1 proposer (#4978)

---
 .../e2e/test_medusa_correctness.py            | 226 ++++++++++++++++++
 vllm/model_executor/models/__init__.py        |   1 +
 vllm/model_executor/models/medusa.py          | 159 ++++++++++++
 vllm/spec_decode/medusa_worker.py             | 127 ++++++++++
 vllm/spec_decode/spec_decode_worker.py        |   5 +
 vllm/transformers_utils/config.py             |   6 +-
 vllm/transformers_utils/configs/__init__.py   |   2 +
 vllm/transformers_utils/configs/medusa.py     |  60 +++++
 vllm/worker/worker.py                         |   5 +-
 9 files changed, 587 insertions(+), 4 deletions(-)
 create mode 100644 tests/spec_decode/e2e/test_medusa_correctness.py
 create mode 100644 vllm/model_executor/models/medusa.py
 create mode 100644 vllm/spec_decode/medusa_worker.py
 create mode 100644 vllm/transformers_utils/configs/medusa.py

diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py
new file mode 100644
index 000000000..7e4a6cc62
--- /dev/null
+++ b/tests/spec_decode/e2e/test_medusa_correctness.py
@@ -0,0 +1,226 @@
+"""This docstring details important information on the testing methodology.
+
+Most of the tests rely on "greedy equality", where we expect the output of
+speculative decoding on a sequence to exactly match the output of normal non-
+speculative decoding.
+
+Since speculative decoding with rejection sampling guarantees that the output
+distribution matches the target model's output distribution (up to hardware
+numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
+equality.
+
+However, we still need to verify below scenario could be passed:
+    * Batch size 1 greedy equality
+    * Batch size >1 greedy equality
+    * Test greedy equality under preemption
+    * Test greedy equality under various number of speculative tokens.
+
+With those tests, we can say at least, Medusa would not break the
+correctess for the target model outputs.
+"""
+
+import pytest
+
+from .conftest import run_greedy_equality_correctness_test
+
+# main model
+# lmsys/vicuna-7b-v1.3 was to be used but it's causing
+# OOM in CI pipeline, so using a smaller model.
+MAIN_MODEL = "JackFram/llama-68m"
+
+# speculative model
+SPEC_MODEL = "abhigoyal/vllm-medusa-llama-68m-random"
+
+# max. number of speculative tokens: this corresponds to
+# num_heads in the config.json of the speculator model.
+MAX_SPEC_TOKENS = 5
+
+# precision
+PRECISION = "float32"
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": SPEC_MODEL,
+        "num_speculative_tokens": MAX_SPEC_TOKENS,
+    },
+])
+@pytest.mark.parametrize("output_len", [
+    128,
+])
+@pytest.mark.parametrize("batch_size", [1, 32])
+@pytest.mark.parametrize("seed", [1])
+def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
+                                    batch_size: int, output_len: int):
+    """Verify greedy equality with different batch size."""
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "block_size": 8,
+        # 2 for small prompt, 256//8 for generated.
+        "num_gpu_blocks_override": 2 + 256 // 8,
+        "max_model_len": (2 + 256 // 8) * 8,
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": SPEC_MODEL,
+        "num_speculative_tokens": MAX_SPEC_TOKENS,
+    },
+])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use small output len for fast test.
+        128,
+    ])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seed", [1])
+def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
+                                                    test_llm_generator,
+                                                    batch_size: int,
+                                                    output_len: int):
+    """Verify greedy equality, even when some sequences are preempted mid-
+    generation.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize(
+    "test_llm_kwargs",
+    [
+        {
+            "speculative_model": SPEC_MODEL,
+            "num_speculative_tokens": k,
+        }
+        # Try a range of num. speculative tokens
+        for k in range(1, 1 + MAX_SPEC_TOKENS)
+    ])
+@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_mlp_different_k(baseline_llm_generator, test_llm_generator,
+                         batch_size: int, output_len: int):
+    """Verify that mlp speculative decoding produces exact equality
+    to without spec decode with different values of num_speculative_tokens.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs",
+                         [{
+                             "speculative_model": SPEC_MODEL,
+                             "num_speculative_tokens": MAX_SPEC_TOKENS,
+                             "speculative_disable_by_batch_size": 4
+                         }])
+@pytest.mark.parametrize("batch_size", [1, 5])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_mlp_disable_queue(baseline_llm_generator, test_llm_generator,
+                           batch_size: int, output_len: int):
+    """Verify that mlp speculative decoding produces exact equality
+    to without spec decode when speculation is disabled for large
+    batch sizes.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+if __name__ == "__main__":
+    import pytest
+    pytest.main([__file__])
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 644b95aae..096e3f472 100644
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -64,6 +64,7 @@ _GENERATION_MODELS = {
     "ArcticForCausalLM": ("arctic", "ArcticForCausalLM"),
     "XverseForCausalLM": ("xverse", "XverseForCausalLM"),
     "Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"),
+    "MedusaModel": ("medusa", "Medusa"),
     "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
     "JambaForCausalLM": ("jamba", "JambaForCausalLM")
 }
diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py
new file mode 100644
index 000000000..6453d0cb2
--- /dev/null
+++ b/vllm/model_executor/models/medusa.py
@@ -0,0 +1,159 @@
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import SamplerOutput
+from vllm.transformers_utils.configs.medusa import MedusaConfig
+
+
+class ResidualBlock(nn.Module):
+
+    def __init__(self, hidden_size: int, num_layers: int) -> None:
+        super().__init__()
+
+        self.layers = nn.ModuleList([
+            nn.Linear(hidden_size, hidden_size, bias=False)
+            for _ in range(num_layers)
+        ])
+        self.act = nn.SiLU()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        for layer in self.layers:
+            x = x + self.act(layer(x))
+        return x
+
+
+class Medusa(nn.Module):
+
+    def __init__(self, config: MedusaConfig, **_) -> None:
+        super().__init__()
+        self.config = config
+        self.blocks = nn.ModuleList([
+            ResidualBlock(hidden_size=self.config.hidden_size,
+                          num_layers=self.config.num_hidden_layers)
+            for _ in range(self.config.num_heads)
+        ])
+        self.orig_vocab_size = config.vocab_size
+        self.truncated_vocab_size = config.truncated_vocab_size
+        self.unpadded_vocab_size = self.truncated_vocab_size
+
+        self.lm_heads = nn.ModuleList([
+            ParallelLMHead(
+                self.unpadded_vocab_size,
+                config.hidden_size,
+                org_num_embeddings=self.truncated_vocab_size,
+                padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+            ) for _ in range(self.config.num_heads)
+        ])
+
+        logit_scale = getattr(config, "logit_scale", 1.0)
+        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
+                                                self.truncated_vocab_size,
+                                                logit_scale)
+
+        self.token_map = None
+
+    def forward(self, hidden_states: torch.Tensor) -> List[torch.Tensor]:
+        return [block(hidden_states) for block in self.blocks]
+
+    def compute_logits(
+            self, hidden_states: List[torch.Tensor],
+            sampling_metadata: SamplingMetadata) -> List[torch.Tensor]:
+        logits = []
+
+        for hs, lm_head in zip(hidden_states, self.lm_heads):
+            _logits = self.logits_processor(lm_head, hs, sampling_metadata)
+
+            if self.token_map is None:
+                logits.append(_logits)
+            else:
+                logits.append(-torch.inf * torch.ones(
+                    size=(*_logits.shape[:-1], self.orig_vocab_size),
+                    device=_logits.device,
+                    dtype=_logits.dtype))
+
+                logits[-1][..., self.token_map] = _logits
+
+        return logits
+
+    def sample(
+        self,
+        logits: List[torch.Tensor],
+        sampling_metadata: SamplingMetadata,
+    ) -> List[SamplerOutput]:
+        logits = torch.stack(logits, dim=0).float()
+        logprobs = torch.log_softmax(logits, dim=-1)
+        token_ids = logits.argmax(-1)  # support only top-1 for now
+        probs = torch.softmax(logits, dim=-1)
+
+        token_id_list = []
+        token_prob_list = []
+        token_logprob_list = []
+
+        for idx, seq_group in enumerate(sampling_metadata.seq_groups):
+            token_id_list.append(token_ids[:, seq_group.sample_indices])
+            token_prob_list.append(probs[:, seq_group.sample_indices])
+            token_logprob_list.append(logprobs[:, seq_group.sample_indices])
+
+        outputs: List[Optional[SamplerOutput]] = []
+        for idx in range(len(sampling_metadata.seq_groups)):
+            outputs.append(
+                SamplerOutput(
+                    outputs=None,
+                    sampled_token_probs=token_prob_list[idx].squeeze(1),
+                    logprobs=token_logprob_list[idx].squeeze(1),
+                    sampled_token_ids=token_id_list[idx].squeeze(1),
+                ))
+
+        return outputs
+
+    def generate_proposals(
+        self,
+        previous_hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> List[SamplerOutput]:
+        return self.sample(
+            logits=self.compute_logits(
+                hidden_states=self.forward(previous_hidden_states),
+                sampling_metadata=sampling_metadata,
+            ),
+            sampling_metadata=sampling_metadata,
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters())
+
+        weights_map = {}
+
+        for name, loaded_weight in weights:
+            name = name.replace("medusa_heads.", "")
+
+            if name == "token_map":
+                if self.truncated_vocab_size < self.orig_vocab_size:
+                    self.token_map = nn.Parameter(loaded_weight,
+                                                  requires_grad=False)
+            elif name in params_dict:
+                weights_map[name] = loaded_weight
+
+        for name, loaded_weight in weights_map.items():
+            if "lm_head" in name and self.token_map is not None and\
+                loaded_weight.shape[0] > self.token_map.shape[0]:
+
+                loaded_weight = loaded_weight[self.token_map]
+
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)
+
+        if self.token_map is not None:
+            self.token_map.to(device=self.lm_heads[0].weight.device)
+
+        assert (self.truncated_vocab_size
+                == self.orig_vocab_size) or (self.token_map is not None)
diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py
new file mode 100644
index 000000000..b72740fc3
--- /dev/null
+++ b/vllm/spec_decode/medusa_worker.py
@@ -0,0 +1,127 @@
+import weakref
+from typing import List, Optional, Tuple
+
+import torch
+
+from vllm.model_executor import SamplingMetadata
+from vllm.sequence import (ExecuteModelRequest, SamplerOutput,
+                           SequenceGroupMetadata)
+from vllm.spec_decode.interfaces import SpeculativeProposals
+from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase
+from vllm.spec_decode.top1_proposer import Top1Proposer
+from vllm.worker.worker import Worker
+
+
+class MedusaWorker(NonLLMProposerWorkerBase, Worker):
+    """Worker for Medusa.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # Lazy initialization list.
+        self._proposer: Top1Proposer
+
+    def init_device(self):
+        super().init_device()
+
+        self._proposer = Top1Proposer(
+            weakref.proxy(self),  # type: ignore[arg-type]
+            self.device,
+            self.vocab_size,
+            max_proposal_len=self.max_model_len,
+        )
+
+    def set_include_gpu_probs_tensor(self):
+        pass
+
+    @torch.inference_mode()
+    def sampler_output(
+        self,
+        execute_model_req: ExecuteModelRequest,
+        sample_len: int,
+    ) -> Tuple[List[SamplerOutput], bool]:
+        """Run the model forward pass to generate sample_len future tokens.
+        Returns the list of sampler output, one per layer, along with indicator
+        of whether torch tensor in sampler output need to be transposed in
+        latter sampler_output_to_torch logic.
+
+        For medusa worker, this indicator shall be False.
+        """
+        self._raise_if_unsupported(execute_model_req)
+
+        seq_group_metadata_list = execute_model_req.seq_group_metadata_list
+
+        seq_lens, query_lens = self._prepare_input_tensors(
+            seq_group_metadata_list)
+
+        sampling_metadata = SamplingMetadata.prepare(
+            seq_group_metadata_list, seq_lens, query_lens, self.device,
+            self.model_runner.pin_memory)
+
+        model_outputs = self.model_runner.model.generate_proposals(
+            previous_hidden_states=execute_model_req.previous_hidden_states.
+            hidden_states,
+            sampling_metadata=sampling_metadata)
+
+        return model_outputs, False
+
+    def _prepare_input_tensors(
+        self,
+        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+    ) -> Tuple[List[int], List[int]]:
+        if not seq_group_metadata_list:
+            return [], []
+
+        seq_lens: List[int] = []
+        query_lens: List[int] = []
+
+        for seq_group_metadata in seq_group_metadata_list:
+            is_prompt = seq_group_metadata.is_prompt
+
+            for seq_data in seq_group_metadata.seq_data.values():
+                seq_data_len = seq_data.get_len()
+                if is_prompt:
+                    context_len = seq_data.get_num_computed_tokens()
+                    seq_len = min(
+                        seq_data_len,
+                        context_len + seq_group_metadata.token_chunk_size)
+                    seq_lens.append(seq_len)
+                    query_lens.append(seq_len - context_len)
+                else:
+                    seq_lens.append(seq_data_len)
+                    query_lens.append(1)
+
+        return seq_lens, query_lens
+
+    def get_spec_proposals(
+        self,
+        execute_model_req: ExecuteModelRequest,
+    ) -> SpeculativeProposals:
+        """Produce speculations given an input batch of sequences. The number of
+        speculative tokens per sequence is determined by max_proposal_len.
+        """
+
+        return self._proposer.get_spec_proposals(execute_model_req)
+
+    def _raise_if_unsupported(
+        self,
+        execute_model_req: ExecuteModelRequest,
+    ) -> None:
+        """MedusaWorker does not yet implement support for cache swap
+        operations or beam search.
+        """
+        if any([
+                execute_model_req.blocks_to_swap_in,
+                execute_model_req.blocks_to_swap_out,
+                execute_model_req.blocks_to_copy
+        ]):
+            raise NotImplementedError(
+                "MedusaWorker does not support cache operations")
+
+        if any(
+                len(seq_group_metadata.seq_data.keys()) != 1
+                for seq_group_metadata in
+                execute_model_req.seq_group_metadata_list):
+            raise NotImplementedError(
+                "MedusaWorker does not support beam search.")
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 43ce987de..60a7dab68 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -18,6 +18,7 @@ from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
 from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                          SpeculativeScorer, SpeculativeScores)
+from vllm.spec_decode.medusa_worker import MedusaWorker
 from vllm.spec_decode.metrics import AsyncMetricsCollector
 from vllm.spec_decode.mlp_speculator_worker import MLPSpeculatorWorker
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
@@ -129,6 +130,10 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
                     "model_config"].hf_config.model_type == "mlp_speculator":
                 disable_bonus_tokens = False
                 proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs)
+            elif draft_worker_kwargs[
+                    "model_config"].hf_config.model_type == "medusa":
+                disable_bonus_tokens = False
+                proposer_worker = MedusaWorker(**draft_worker_kwargs)
             else:
                 if draft_tp == 1:
                     draft_worker_kwargs[
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 5e2fe116d..652505a89 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -6,8 +6,9 @@ from transformers import GenerationConfig, PretrainedConfig
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
 from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
-                                             JAISConfig, MLPSpeculatorConfig,
-                                             MPTConfig, RWConfig)
+                                             JAISConfig, MedusaConfig,
+                                             MLPSpeculatorConfig, MPTConfig,
+                                             RWConfig)
 
 if VLLM_USE_MODELSCOPE:
     from modelscope import AutoConfig
@@ -24,6 +25,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
     "jais": JAISConfig,
     "mlp_speculator": MLPSpeculatorConfig,
+    "medusa": MedusaConfig,
 }
 
 for name, cls in _CONFIG_REGISTRY.items():
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index d8170858c..51de11ca3 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -5,6 +5,7 @@ from vllm.transformers_utils.configs.dbrx import DbrxConfig
 # `FalconConfig` class from the official HuggingFace transformers library.
 from vllm.transformers_utils.configs.falcon import RWConfig
 from vllm.transformers_utils.configs.jais import JAISConfig
+from vllm.transformers_utils.configs.medusa import MedusaConfig
 from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
 from vllm.transformers_utils.configs.mpt import MPTConfig
 
@@ -14,5 +15,6 @@ __all__ = [
     "MPTConfig",
     "RWConfig",
     "JAISConfig",
+    "MedusaConfig",
     "MLPSpeculatorConfig",
 ]
diff --git a/vllm/transformers_utils/configs/medusa.py b/vllm/transformers_utils/configs/medusa.py
new file mode 100644
index 000000000..d71a08343
--- /dev/null
+++ b/vllm/transformers_utils/configs/medusa.py
@@ -0,0 +1,60 @@
+import os
+from typing import Optional, Union
+
+from transformers import PretrainedConfig
+
+
+class MedusaConfig(PretrainedConfig):
+    model_type = "medusa"
+
+    def __init__(self,
+                 hidden_size: int = 4096,
+                 vocab_size: int = 32001,
+                 num_heads: int = 5,
+                 num_hidden_layers: int = 1,
+                 max_paths: int = 64,
+                 topk: int = 10,
+                 truncated_vocab_size: Optional[int] = None,
+                 **kwargs):
+
+        self.hidden_size = hidden_size
+        self.vocab_size = vocab_size
+        self.num_heads = num_heads
+        self.num_hidden_layers = num_hidden_layers
+        self.max_paths = max_paths
+        self.topk = topk
+        self.max_seq_len = int(2**20)
+        self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
+            else truncated_vocab_size
+        if "architectures" not in kwargs:
+            kwargs["architectures"] = ["MedusaModel"]
+
+        super().__init__(**kwargs)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Union[str, os.PathLike],
+        **kwargs,
+    ) -> "MedusaConfig":
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs)
+        for k in list(config_dict.keys()):
+            if 'num' in k:
+                if 'heads' in k:
+                    config_dict["num_heads"] = config_dict.pop(k)
+                elif 'layers' in k:
+                    config_dict["num_hidden_layers"] = config_dict.pop(k)
+        return cls.from_dict(config_dict, **kwargs)
+
+    @property
+    def num_attention_heads(self):
+        return 0
+
+    @property
+    def num_lookahead_tokens(self):
+        return self.num_heads
+
+    @num_lookahead_tokens.setter
+    def num_lookahead_tokens(self, num_lookahead_tokens: int):
+        self.num_heads = num_lookahead_tokens
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 857cd86be..56d8587f8 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -78,8 +78,9 @@ class Worker(LocalOrDistributedWorkerBase):
         speculative_args = {} if speculative_config is None \
             or (speculative_config.draft_model_config.model ==
                 model_config.model) \
-              or (speculative_config.draft_model_config.hf_config.model_type !=
-                  "mlp_speculator") else {"return_hidden_states": True}
+            or (speculative_config.draft_model_config.hf_config.model_type
+                not in ["medusa", "mlp_speculator"]) \
+                    else {"return_hidden_states": True}
 
         ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner
         if model_runner_cls is not None:
-- 
GitLab


From da78caecfa7f6137efc3e08388f4db102650ac45 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 9 Jul 2024 18:49:11 -0700
Subject: [PATCH 298/376] [core][distributed] zmq fallback for broadcasting
 large objects (#6183)

[core][distributed] add zmq fallback for broadcasting large objects (#6183)
---
 requirements-common.txt                       |   1 +
 tests/distributed/test_same_node.py           |   5 +-
 tests/distributed/test_shm_broadcast.py       |  17 +-
 .../device_communicators/custom_all_reduce.py |   4 +-
 .../device_communicators/shm_broadcast.py     | 269 +++++++++++++++---
 vllm/distributed/parallel_state.py            |  58 ++--
 6 files changed, 274 insertions(+), 80 deletions(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index 765568b03..e874c4af4 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -21,3 +21,4 @@ lm-format-enforcer == 0.10.1
 outlines >= 0.0.43 # Requires torch >= 2.1.0
 typing_extensions
 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
+pyzmq
diff --git a/tests/distributed/test_same_node.py b/tests/distributed/test_same_node.py
index 4880bab79..2d886eb56 100644
--- a/tests/distributed/test_same_node.py
+++ b/tests/distributed/test_same_node.py
@@ -2,10 +2,11 @@ import os
 
 import torch
 
-from vllm.distributed.parallel_state import is_in_the_same_node
+from vllm.distributed.parallel_state import in_the_same_node_as
 
 torch.distributed.init_process_group(backend="gloo")
-test_result = is_in_the_same_node(torch.distributed.group.WORLD)
+test_result = all(
+    in_the_same_node_as(torch.distributed.group.WORLD, source_rank=0))
 
 expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
 assert test_result == expected, f"Expected {expected}, got {test_result}"
diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py
index 2c2466f81..2761b7f6c 100644
--- a/tests/distributed/test_shm_broadcast.py
+++ b/tests/distributed/test_shm_broadcast.py
@@ -6,8 +6,7 @@ from typing import List
 import numpy as np
 import torch.distributed as dist
 
-from vllm.distributed.device_communicators.shm_broadcast import (
-    ShmRingBuffer, ShmRingBufferIO)
+from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
 from vllm.utils import update_environment_variables
 
 
@@ -56,8 +55,8 @@ def worker_fn_wrapper(fn):
 @worker_fn_wrapper
 def worker_fn():
     writer_rank = 2
-    broadcaster = ShmRingBufferIO.create_from_process_group(
-        dist.group.WORLD, 1024 * 1024, 2, writer_rank)
+    broadcaster = MessageQueue.create_from_process_group(
+        dist.group.WORLD, 40 * 1024, 2, writer_rank)
     if dist.get_rank() == writer_rank:
         seed = random.randint(0, 1000)
         dist.broadcast_object_list([seed], writer_rank)
@@ -87,13 +86,3 @@ def worker_fn():
 
 def test_shm_broadcast():
     distributed_run(worker_fn, 4)
-
-
-def test_singe_process():
-    buffer = ShmRingBuffer(1, 1024, 4)
-    reader = ShmRingBufferIO(buffer, reader_rank=0)
-    writer = ShmRingBufferIO(buffer, reader_rank=-1)
-    writer.enqueue([0])
-    writer.enqueue([1])
-    assert reader.dequeue() == [0]
-    assert reader.dequeue() == [1]
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index a303d0bd2..a4f30808d 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -9,7 +9,7 @@ import vllm.envs as envs
 from vllm import _custom_ops as ops
 from vllm.distributed.device_communicators.custom_all_reduce_utils import (
     gpu_p2p_access_check)
-from vllm.distributed.parallel_state import is_in_the_same_node
+from vllm.distributed.parallel_state import in_the_same_node_as
 from vllm.logger import init_logger
 from vllm.utils import cuda_device_count_stateless, is_full_nvlink
 
@@ -64,7 +64,7 @@ class CustomAllreduce:
         assert dist.get_backend(group) != dist.Backend.NCCL, (
             "CustomAllreduce should be attached to a non-NCCL group.")
 
-        if not is_in_the_same_node(group):
+        if not all(in_the_same_node_as(group, source_rank=0)):
             # No need to initialize custom allreduce for multi-node case.
             logger.warning(
                 "Custom allreduce is disabled because this process group"
diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index bea205882..db0064951 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -1,16 +1,19 @@
 import pickle
 import time
 from contextlib import contextmanager
+from dataclasses import dataclass, field
 from multiprocessing import shared_memory
-from typing import Optional
+from typing import List, Optional
 from unittest.mock import patch
 
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
+from zmq import PUB, REP, REQ, SUB, SUBSCRIBE, Context  # type: ignore
 
 import vllm.envs as envs
 from vllm.logger import init_logger
+from vllm.utils import get_ip, get_open_port
 
 VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL
 
@@ -135,18 +138,183 @@ class ShmRingBuffer:
             yield buf
 
 
-class ShmRingBufferIO:
+@dataclass
+class Handle:
+    connect_ip: str
+    local_reader_ranks: List[int] = field(default_factory=list)
 
-    def __init__(self, buffer: ShmRingBuffer, reader_rank: int):
-        self.buffer = buffer
-        self.reader_rank = reader_rank
-        self._is_writer = self.reader_rank == -1
-        self._is_reader = not self._is_writer
-        if self._is_reader:
-            assert 0 <= self.reader_rank < buffer.n_reader, \
-                (f"Invalid reader rank {self.reader_rank} for buffer"
-                f" created with {buffer.n_reader} readers")
-        self.current_idx = 0
+    buffer: Optional[ShmRingBuffer] = None
+    local_subscribe_port: Optional[int] = None
+    local_sync_port: Optional[int] = None
+    remote_subscribe_port: Optional[int] = None
+    remote_sync_port: Optional[int] = None
+
+
+class MessageQueue:
+
+    def __init__(
+        self,
+        n_reader,  # number of all readers
+        n_local_reader,  # number of local readers through shared memory
+        local_reader_ranks: Optional[List[int]] = None,
+        max_chunk_bytes: int = 1024 * 1024 * 10,
+        max_chunks: int = 10,
+        connect_ip: Optional[str] = None,
+    ):
+        if local_reader_ranks is None:
+            local_reader_ranks = list(range(n_local_reader))
+        else:
+            assert len(local_reader_ranks) == n_local_reader
+        self.n_local_reader = n_local_reader
+        n_remote_reader = n_reader - n_local_reader
+        self.n_remote_reader = n_remote_reader
+
+        if connect_ip is None:
+            connect_ip = get_ip()
+
+        context = Context()
+
+        if n_local_reader > 0:
+            # for local readers, we will:
+            # 1. create a shared memory ring buffer to communicate small data
+            # 2. create a publish-subscribe socket to communicate large data
+            self.buffer = ShmRingBuffer(n_local_reader, max_chunk_bytes,
+                                        max_chunks)
+
+            self.local_socket = context.socket(PUB)
+            local_subscribe_port = get_open_port()
+            self.local_socket.bind(f"tcp://*:{local_subscribe_port}")
+
+            self.local_sync_socket = context.socket(REP)
+            local_sync_port = get_open_port()
+            self.local_sync_socket.bind(f"tcp://*:{local_sync_port}")
+            self.current_idx = 0
+
+        else:
+            self.buffer = None  # type: ignore
+            local_subscribe_port = None
+            local_sync_port = None
+            self.local_socket = None
+            self.local_sync_socket = None
+            self.current_idx = -1
+
+        if n_remote_reader > 0:
+            # for remote readers, we will:
+            # create a publish-subscribe socket to communicate large data
+            self.remote_socket = context.socket(PUB)
+            remote_subscribe_port = get_open_port()
+            self.remote_socket.bind(f"tcp://*:{remote_subscribe_port}")
+
+            self.remote_sync_socket = context.socket(REP)
+            remote_sync_port = get_open_port()
+            self.remote_sync_socket.bind(f"tcp://*:{remote_sync_port}")
+        else:
+            remote_subscribe_port = None
+            remote_sync_port = None
+            self.remote_socket = None
+            self.remote_sync_socket = None
+
+        self._is_writer = True
+        self._is_local_reader = False
+        self.local_reader_rank = -1
+        # rank does not matter for remote readers
+        self._is_remote_reader = False
+
+        self.handle = Handle(
+            connect_ip=connect_ip,
+            local_reader_ranks=local_reader_ranks,
+            buffer=self.buffer,
+            local_subscribe_port=local_subscribe_port,
+            local_sync_port=local_sync_port,
+            remote_subscribe_port=remote_subscribe_port,
+            remote_sync_port=remote_sync_port,
+        )
+
+    def export_handle(self) -> Handle:
+        return self.handle
+
+    @staticmethod
+    def create_from_handle(handle: Handle, rank) -> "MessageQueue":
+        self = MessageQueue.__new__(MessageQueue)
+        self.handle = handle
+        self._is_writer = False
+
+        context = Context()
+
+        if rank in handle.local_reader_ranks:
+            assert handle.buffer is not None
+            self.buffer = handle.buffer
+            self.current_idx = 0
+            self.local_reader_rank = handle.local_reader_ranks.index(rank)
+            self._is_local_reader = True
+            self._is_remote_reader = False
+
+            self.local_socket = context.socket(SUB)
+            self.local_socket.setsockopt_string(SUBSCRIBE, "")
+            self.local_socket.connect(
+                f"tcp://{handle.connect_ip}:{handle.local_subscribe_port}")
+
+            self.local_sync_socket = context.socket(REQ)
+            self.local_sync_socket.connect(
+                f"tcp://{handle.connect_ip}:{handle.local_sync_port}")
+
+            self.remote_socket = None
+            self.remote_sync_socket = None
+        else:
+            self.buffer = None  # type: ignore
+            self.current_idx = -1
+            self.local_reader_rank = -1
+            self._is_local_reader = False
+            self._is_remote_reader = True
+
+            self.local_socket = None
+            self.local_sync_socket = None
+
+            self.remote_socket = context.socket(SUB)
+            self.remote_socket.setsockopt_string(SUBSCRIBE, "")
+            self.remote_socket.connect(
+                f"tcp://{handle.connect_ip}:{handle.remote_subscribe_port}")
+
+            self.remote_sync_socket = context.socket(REQ)
+            self.remote_sync_socket.connect(
+                f"tcp://{handle.connect_ip}:{handle.remote_sync_port}")
+
+        return self
+
+    def wait_until_ready(self):
+        """This is a collective operation. All processes (including the
+        readers and the writer) should call this function.
+        """
+        if self._is_writer:
+            # wait for all readers to connect
+
+            # local readers
+            for i in range(self.n_local_reader):
+                recv = self.local_sync_socket.recv()
+                assert recv == b"READY"
+                self.local_sync_socket.send(b"READY")
+            if self.n_local_reader > 0:
+                self.local_socket.send(b"READY")
+
+            # remote readers
+            for i in range(self.n_remote_reader):
+                recv = self.remote_sync_socket.recv()
+                assert recv == b"READY"
+                self.remote_sync_socket.send(b"READY")
+            if self.n_remote_reader > 0:
+                self.remote_socket.send(b"READY")
+        elif self._is_local_reader:
+            self.local_sync_socket.send(b"READY")
+            recv = self.local_sync_socket.recv()
+            assert recv == b"READY"
+            recv = self.local_socket.recv()
+            assert recv == b"READY"
+        elif self._is_remote_reader:
+            self.remote_sync_socket.send(b"READY")
+            recv = self.remote_sync_socket.recv()
+            assert recv == b"READY"
+            recv = self.remote_socket.recv()
+            assert recv == b"READY"
 
     @contextmanager
     def acquire_write(self):
@@ -201,12 +369,12 @@ class ShmRingBufferIO:
 
     @contextmanager
     def acquire_read(self):
-        assert self._is_reader, "Only readers can acquire read"
+        assert self._is_local_reader, "Only readers can acquire read"
         start_time = time.monotonic()
         n_warning = 1
         while True:
             with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
-                read_flag = metadata_buffer[self.reader_rank + 1]
+                read_flag = metadata_buffer[self.local_reader_rank + 1]
                 written_flag = metadata_buffer[0]
                 if not written_flag or read_flag:
                     # this block is either
@@ -236,7 +404,7 @@ class ShmRingBufferIO:
 
                 # caller has read from the buffer
                 # set the read flag
-                metadata_buffer[self.reader_rank + 1] = 1
+                metadata_buffer[self.local_reader_rank + 1] = 1
                 self.current_idx = (self.current_idx +
                                     1) % self.buffer.max_chunks
                 break
@@ -244,21 +412,36 @@ class ShmRingBufferIO:
     def enqueue(self, obj):
         assert self._is_writer, "Only writers can enqueue"
         serialized_obj = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
-        if len(serialized_obj) > self.buffer.max_chunk_bytes:
-            raise RuntimeError(
-                f"{len(serialized_obj)=} larger than the allowed value "
-                f"{self.buffer.max_chunk_bytes},"
-                "Please increase the max_chunk_bytes parameter.")
-        with self.acquire_write() as buf:
-            buf[:len(serialized_obj)] = serialized_obj
+        if self.n_local_reader > 0:
+            if len(serialized_obj) >= self.buffer.max_chunk_bytes:
+                with self.acquire_write() as buf:
+                    buf[0] = 1  # overflow
+                self.local_socket.send(serialized_obj)
+            else:
+                with self.acquire_write() as buf:
+                    buf[0] = 0  # not overflow
+                    buf[1:len(serialized_obj) + 1] = serialized_obj
+        if self.n_remote_reader > 0:
+            self.remote_socket.send(serialized_obj)
 
     def dequeue(self):
-        assert self._is_reader, "Only readers can dequeue"
-        with self.acquire_read() as buf:
-            # no need to know the size of serialized object
-            # pickle format itself contains the size information internally
-            # see https://docs.python.org/3/library/pickle.html
-            obj = pickle.loads(buf)
+        if self._is_local_reader:
+            overflow = False
+            with self.acquire_read() as buf:
+                overflow = buf[0] == 1
+                if not overflow:
+                    # no need to know the size of serialized object
+                    # pickle format contains the size information internally
+                    # see https://docs.python.org/3/library/pickle.html
+                    obj = pickle.loads(buf[1:])
+            if overflow:
+                recv = self.local_socket.recv()
+                obj = pickle.loads(recv)
+        elif self._is_remote_reader:
+            recv = self.remote_socket.recv()
+            obj = pickle.loads(recv)
+        else:
+            raise RuntimeError("Only readers can dequeue")
         return obj
 
     def broadcast_object(self, obj=None):
@@ -272,24 +455,36 @@ class ShmRingBufferIO:
     def create_from_process_group(pg: ProcessGroup,
                                   max_chunk_bytes,
                                   max_chunks,
-                                  writer_rank=0) -> "ShmRingBufferIO":
+                                  writer_rank=0) -> "MessageQueue":
         group_rank = dist.get_rank(pg)
         group_world_size = dist.get_world_size(pg)
-        ranks_inside_group = list(range(group_world_size))
         global_ranks = dist.get_process_group_ranks(pg)
+
+        from vllm.distributed.parallel_state import in_the_same_node_as
+        status = in_the_same_node_as(pg, source_rank=writer_rank)
+        same_node_ranks = [i for i, s in enumerate(status) if s]
         n_reader = group_world_size - 1
-        buffer: ShmRingBuffer
+        n_local_reader = len(same_node_ranks) - 1
+        local_reader_ranks = [i for i in same_node_ranks if i != writer_rank]
+        buffer_io: MessageQueue
         if group_rank == writer_rank:
-            buffer = ShmRingBuffer(n_reader, max_chunk_bytes, max_chunks)
-            dist.broadcast_object_list([buffer],
+            buffer_io = MessageQueue(
+                n_reader=n_reader,
+                n_local_reader=n_local_reader,
+                local_reader_ranks=local_reader_ranks,
+                max_chunk_bytes=max_chunk_bytes,
+                max_chunks=max_chunks,
+            )
+            handle = buffer_io.export_handle()
+            dist.broadcast_object_list([handle],
                                        src=global_ranks[writer_rank],
                                        group=pg)
-            return ShmRingBufferIO(buffer, -1)
         else:
             recv = [None]
             dist.broadcast_object_list(recv,
                                        src=global_ranks[writer_rank],
                                        group=pg)
-            buffer = recv[0]  # type: ignore
-            rest_ranks = [r for r in ranks_inside_group if r != writer_rank]
-            return ShmRingBufferIO(buffer, rest_ranks.index(group_rank))
+            handle = recv[0]  # type: ignore
+            buffer_io = MessageQueue.create_from_handle(handle, group_rank)
+        buffer_io.wait_until_ready()
+        return buffer_io
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 66ffe6e8a..128096c88 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -124,7 +124,7 @@ class GroupCoordinator:
     # communicators are only created for world size > 1
     pynccl_comm: Optional[Any]  # PyNccl communicator
     ca_comm: Optional[Any]  # Custom allreduce communicator
-    shm_broadcaster: Optional[Any]  # shared memory broadcaster
+    mq_broadcaster: Optional[Any]  # shared memory broadcaster
 
     def __init__(
         self,
@@ -133,6 +133,7 @@ class GroupCoordinator:
         torch_distributed_backend: Union[str, Backend],
         use_pynccl: bool,
         use_custom_allreduce: bool,
+        use_message_queue_broadcaster: bool = False,
     ):
 
         self.rank = torch.distributed.get_rank()
@@ -190,10 +191,10 @@ class GroupCoordinator:
             self.ca_comm = None
 
         from vllm.distributed.device_communicators.shm_broadcast import (
-            ShmRingBufferIO)
-        self.shm_broadcaster: Optional[ShmRingBufferIO] = None
-        if self.world_size > 1 and is_in_the_same_node(self.cpu_group):
-            self.shm_broadcaster = ShmRingBufferIO.create_from_process_group(
+            MessageQueue)
+        self.mq_broadcaster: Optional[MessageQueue] = None
+        if use_message_queue_broadcaster and self.world_size > 1:
+            self.mq_broadcaster = MessageQueue.create_from_process_group(
                 self.cpu_group, 1 << 22, 6)
 
     @property
@@ -377,9 +378,9 @@ class GroupCoordinator:
         # Bypass the function if we are using only 1 GPU.
         if self.world_size == 1:
             return obj
-        if self.shm_broadcaster is not None:
-            assert src == 0, "Shared memory broadcaster only supports src=0"
-            return self.shm_broadcaster.broadcast_object(obj)
+        if self.mq_broadcaster is not None:
+            assert src == 0, "Message queue broadcaster only supports src=0"
+            return self.mq_broadcaster.broadcast_object(obj)
         if self.rank_in_group == src:
             torch.distributed.broadcast_object_list([obj],
                                                     src=self.ranks[src],
@@ -696,8 +697,8 @@ class GroupCoordinator:
             self.pynccl_comm = None
         if self.ca_comm is not None:
             self.ca_comm = None
-        if self.shm_broadcaster is not None:
-            self.shm_broadcaster = None
+        if self.mq_broadcaster is not None:
+            self.mq_broadcaster = None
 
 
 _WORLD: Optional[GroupCoordinator] = None
@@ -720,10 +721,12 @@ def init_world_group(ranks: List[int], local_rank: int,
 
 
 def init_model_parallel_group(
-        group_ranks: List[List[int]],
-        local_rank: int,
-        backend: str,
-        use_custom_allreduce: Optional[bool] = None) -> GroupCoordinator:
+    group_ranks: List[List[int]],
+    local_rank: int,
+    backend: str,
+    use_custom_allreduce: Optional[bool] = None,
+    use_message_queue_broadcaster: bool = False,
+) -> GroupCoordinator:
     if use_custom_allreduce is None:
         use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
     return GroupCoordinator(
@@ -732,6 +735,7 @@ def init_model_parallel_group(
         torch_distributed_backend=backend,
         use_pynccl=True,
         use_custom_allreduce=use_custom_allreduce,
+        use_message_queue_broadcaster=use_message_queue_broadcaster,
     )
 
 
@@ -880,8 +884,12 @@ def initialize_model_parallel(
             range(i * tensor_model_parallel_size,
                   (i + 1) * tensor_model_parallel_size))
         group_ranks.append(ranks)
+
+    # message queue broadcaster is only used in tensor model parallel group
     _TP = init_model_parallel_group(group_ranks,
-                                    get_world_group().local_rank, backend)
+                                    get_world_group().local_rank,
+                                    backend,
+                                    use_message_queue_broadcaster=True)
 
     # Build the pipeline model-parallel groups.
     num_pipeline_model_parallel_groups: int = (world_size //
@@ -993,15 +1001,15 @@ def destroy_distributed_environment():
         torch.distributed.destroy_process_group()
 
 
-def is_in_the_same_node(pg: ProcessGroup):
+def in_the_same_node_as(pg: ProcessGroup, source_rank: int = 0) -> List[bool]:
     """
-    This is a collective operation that checks if all processes in the group
-    are in the same node. It tests if all processes are attached to the same
+    This is a collective operation that returns if each rank is in the same node
+    as the source rank. It tests if processes are attached to the same
     memory system (shared access to shared memory).
     """
     assert torch.distributed.get_backend(
         pg) != torch.distributed.Backend.NCCL, (
-            "is_in_the_same_node should be tested with a non-NCCL group.")
+            "in_the_same_node_as should be tested with a non-NCCL group.")
     # local rank inside the group
     rank = torch.distributed.get_rank(group=pg)
     world_size = torch.distributed.get_world_size(group=pg)
@@ -1017,19 +1025,19 @@ def is_in_the_same_node(pg: ProcessGroup):
 
     try:
         with contextlib.suppress(OSError):
-            if rank == 0:
+            if rank == source_rank:
                 # create a shared memory segment
                 shm = shared_memory.SharedMemory(create=True, size=128)
                 shm.buf[:len(magic_message)] = magic_message
                 torch.distributed.broadcast_object_list([shm.name],
-                                                        src=ranks[0],
+                                                        src=ranks[source_rank],
                                                         group=pg)
-                is_in_the_same_node[0] = 1
+                is_in_the_same_node[rank] = 1
             else:
                 # try to open the shared memory segment
                 recv = [None]
                 torch.distributed.broadcast_object_list(recv,
-                                                        src=ranks[0],
+                                                        src=ranks[source_rank],
                                                         group=pg)
                 name = recv[0]
                 # fix to https://stackoverflow.com/q/62748654/9191338
@@ -1050,8 +1058,8 @@ def is_in_the_same_node(pg: ProcessGroup):
 
     # clean up the shared memory segment
     with contextlib.suppress(OSError):
-        if rank == 0 and shm:
+        if rank == source_rank and shm:
             shm.unlink()
     torch.distributed.all_reduce(is_in_the_same_node, group=pg)
 
-    return is_in_the_same_node.sum().item() == world_size
+    return [x == 1 for x in is_in_the_same_node.tolist()]
-- 
GitLab


From 5ed3505d827658fe4f71f30fecf93a66baabfe26 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Tue, 9 Jul 2024 19:30:56 -0700
Subject: [PATCH 299/376] [Bugfix][TPU] Add prompt adapter methods to
 TPUExecutor (#6279)

---
 vllm/executor/tpu_executor.py | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py
index 6627ee698..d906a6cc3 100644
--- a/vllm/executor/tpu_executor.py
+++ b/vllm/executor/tpu_executor.py
@@ -81,8 +81,7 @@ class TPUExecutor(ExecutorBase):
 
     def determine_num_available_blocks(self) -> Tuple[int, int]:
         """Determine the number of available KV blocks by invoking the
-        underlying worker.
-        """
+        underlying worker."""
         return self.driver_worker.determine_num_available_blocks()
 
     def execute_model(
@@ -93,16 +92,36 @@ class TPUExecutor(ExecutorBase):
         return output
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
-        raise NotImplementedError("LoRA is not implemented for TPU backend.")
+        raise NotImplementedError(
+            "LoRA is currently not supported by the TPU backend.")
 
     def remove_lora(self, lora_id: int) -> bool:
-        raise NotImplementedError("LoRA is not implemented for TPU backend.")
+        raise NotImplementedError(
+            "LoRA is currently not supported by the TPU backend.")
 
     def pin_lora(self, lora_id: int) -> bool:
-        raise NotImplementedError("LoRA is not implemented for TPU backend.")
+        raise NotImplementedError(
+            "LoRA is currently not supported by the TPU backend.")
 
     def list_loras(self) -> Set[int]:
-        raise NotImplementedError("LoRA is not implemented for TPU backend.")
+        raise NotImplementedError(
+            "LoRA is currently not supported by the TPU backend.")
+
+    def add_prompt_adapter(self, prompt_adapter_request) -> bool:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the TPU backend.")
+
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the TPU backend.")
+
+    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the TPU backend.")
+
+    def list_prompt_adapters(self) -> Set[int]:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the TPU backend.")
 
     def check_health(self) -> None:
         # TPUExecutor will always be healthy as long as it's running.
-- 
GitLab


From 8a924d2248dedb620eb9a32ca5c9f97ab525aaf5 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 10 Jul 2024 14:55:34 +0800
Subject: [PATCH 300/376] [Doc] Guide for adding multi-modal plugins (#6205)

---
 docs/source/_templates/sections/header.html   |  1 +
 .../multimodal/adding_multimodal_plugin.rst   | 17 +++++++++++++
 .../dev/multimodal/multimodal_index.rst       | 24 ++++++++++++-------
 vllm/multimodal/__init__.py                   |  5 ++--
 vllm/multimodal/base.py                       | 21 +++++++++-------
 vllm/multimodal/image.py                      |  1 +
 vllm/multimodal/registry.py                   | 18 ++++++++++----
 7 files changed, 64 insertions(+), 23 deletions(-)
 create mode 100644 docs/source/dev/multimodal/adding_multimodal_plugin.rst

diff --git a/docs/source/_templates/sections/header.html b/docs/source/_templates/sections/header.html
index cd5c4053e..7174431b1 100644
--- a/docs/source/_templates/sections/header.html
+++ b/docs/source/_templates/sections/header.html
@@ -5,6 +5,7 @@
     justify-content: center;
     align-items: center;
     font-size: 16px;
+    padding: 0 6px 0 6px;
   }
   .notification-bar p {
     margin: 0;
diff --git a/docs/source/dev/multimodal/adding_multimodal_plugin.rst b/docs/source/dev/multimodal/adding_multimodal_plugin.rst
new file mode 100644
index 000000000..b726138f8
--- /dev/null
+++ b/docs/source/dev/multimodal/adding_multimodal_plugin.rst
@@ -0,0 +1,17 @@
+.. _adding_multimodal_plugin:
+
+Adding a Multimodal Plugin
+==========================
+
+This document teaches you how to add a new modality to vLLM.
+
+Each modality in vLLM is represented by a :class:`~vllm.multimodal.MultiModalPlugin` and registered to :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
+For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to :meth:`~vllm.multimodal.MultiModalRegistry.register_plugin`.
+
+The remainder of this document details how to define custom :class:`~vllm.multimodal.MultiModalPlugin` s.
+
+.. note::
+  This article is a work in progress.
+
+..
+  TODO: Add more instructions on how to add new plugins once embeddings is in.
diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst
index 39daf30a3..6713dcf08 100644
--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
@@ -7,17 +7,21 @@ Multi-Modality
     
 vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.
 
-Multi-modal input can be passed alongside text and token prompts to :ref:`supported models <supported_vlms>`
+Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models <supported_vlms>`
 via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptStrictInputs`.
 
-.. note::
-   ``multi_modal_data`` can accept keys and values beyond the builtin ones, as long as a customized plugin is registered through 
-   the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
+Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities
+by following :ref:`this guide <adding_multimodal_plugin>`.
 
-To implement a new multi-modal model in vLLM, please follow :ref:`this guide <enabling_multimodal_inputs>`.
+Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here <enabling_multimodal_inputs>`.
 
-..
-  TODO: Add more instructions on how to add new plugins once embeddings is in.
+Guides
+++++++
+
+.. toctree::
+   :maxdepth: 1
+
+   adding_multimodal_plugin
 
 Module Contents
 +++++++++++++++
@@ -36,10 +40,14 @@ Registry
 Base Classes
 ------------
 
-.. autoclass:: vllm.multimodal.MultiModalDataDict
+.. autodata:: vllm.multimodal.BatchedTensors
+
+.. autoclass:: vllm.multimodal.MultiModalDataBuiltins
     :members:
     :show-inheritance:
 
+.. autodata:: vllm.multimodal.MultiModalDataDict
+
 .. autoclass:: vllm.multimodal.MultiModalInputs
     :members:
     :show-inheritance:
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index b6d930659..503dceab5 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,5 +1,5 @@
-from .base import (BatchedTensors, MultiModalDataDict, MultiModalInputs,
-                   MultiModalPlugin)
+from .base import (BatchedTensors, MultiModalDataBuiltins, MultiModalDataDict,
+                   MultiModalInputs, MultiModalPlugin)
 from .registry import MultiModalRegistry
 
 MULTIMODAL_REGISTRY = MultiModalRegistry()
@@ -13,6 +13,7 @@ See also:
 
 __all__ = [
     "BatchedTensors",
+    "MultiModalDataBuiltins",
     "MultiModalDataDict",
     "MultiModalInputs",
     "MultiModalPlugin",
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 0e31816a8..3ebc25c59 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -43,9 +43,6 @@ class MultiModalInputs(_MultiModalInputsBase):
         *,
         device: torch.types.Device,
     ) -> BatchedTensors:
-        # Avoid initializing CUDA too early
-        import torch
-
         unbatched_shape = tensors[0].shape[1:]
 
         for tensor in tensors:
@@ -84,16 +81,21 @@ class MultiModalInputs(_MultiModalInputsBase):
 
 
 class MultiModalDataBuiltins(TypedDict, total=False):
+    """Modality types that are predefined by vLLM."""
+
     image: Image.Image
+    """The input image."""
 
 
 MultiModalDataDict = Union[MultiModalDataBuiltins, Dict[str, Any]]
 """
 A dictionary containing an item for each modality type to input.
 
-The data belonging to each modality is converted into keyword arguments 
-to the model by the corresponding mapper. By default, the mapper of 
-the corresponding plugin with the same modality key is applied.
+Note:
+    This dictionary also accepts modality keys defined outside
+    :class:`MultiModalDataBuiltins` as long as a customized plugin is registered
+    through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
+    Read more on that :ref:`here <adding_multimodal_plugin>`.
 """
 
 MultiModalInputMapper = Callable[[InputContext, object], MultiModalInputs]
@@ -123,6 +125,9 @@ class MultiModalPlugin(ABC):
     process the same data differently). This registry is in turn used by
     :class:`~MultiModalRegistry` which acts at a higher level
     (i.e., the modality of the data).
+
+    See also:
+        :ref:`adding_multimodal_plugin`
     """
 
     def __init__(self) -> None:
@@ -183,8 +188,8 @@ class MultiModalPlugin(ABC):
     def map_input(self, model_config: ModelConfig,
                   data: object) -> MultiModalInputs:
         """
-        Apply an input mapper to a data passed
-        to the model, transforming the data into a dictionary of model inputs.
+        Transform the data into a dictionary of model inputs using the
+        input mapper registered for that model.
 
         The model is identified by ``model_config``.
 
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index b6c735123..3b37ce914 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -100,6 +100,7 @@ def repeat_and_pad_image_tokens(
 
 
 class ImagePlugin(MultiModalPlugin):
+    """Plugin for image data."""
 
     def get_data_key(self) -> str:
         return "image"
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index e0716bbf1..d8e1b6817 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -15,10 +15,8 @@ logger = init_logger(__name__)
 
 class MultiModalRegistry:
     """
-    A registry to dispatch data processing
-    according to its modality and the target model.
-
-    The registry handles both external and internal data input.
+    A registry that dispatches data processing to the
+    :class:`~vllm.multimodal.MultiModalPlugin` for each modality.
     """
 
     DEFAULT_PLUGINS = (ImagePlugin(), )
@@ -30,6 +28,12 @@ class MultiModalRegistry:
         self._plugins = {p.get_data_key(): p for p in plugins}
 
     def register_plugin(self, plugin: MultiModalPlugin) -> None:
+        """
+        Register a multi-modal plugin so it can be recognized by vLLM.
+
+        See also:
+            :ref:`adding_multimodal_plugin`
+        """
         data_type_key = plugin.get_data_key()
 
         if data_type_key in self._plugins:
@@ -75,7 +79,11 @@ class MultiModalRegistry:
                   data: MultiModalDataDict) -> MultiModalInputs:
         """
         Apply an input mapper to the data passed to the model.
-        
+
+        The data belonging to each modality is passed to the corresponding
+        plugin which in turn converts the data into into keyword arguments
+        via the input mapper registered for that model.
+
         See :meth:`MultiModalPlugin.map_input` for more details.
         """
         merged_dict: Dict[str, torch.Tensor] = {}
-- 
GitLab


From e72ae80b06405ea92b703c8979f046d68e970c94 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 10 Jul 2024 06:03:16 -0700
Subject: [PATCH 301/376] [Bugfix] Support 2D input shape in MoE layer (#6287)

---
 vllm/model_executor/models/mixtral.py   | 5 +++--
 vllm/model_executor/models/qwen2_moe.py | 6 ++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 7f5e3b969..e5bd58a9e 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -88,12 +88,13 @@ class MixtralMoE(nn.Module):
                                 tp_size=tp_size)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        num_tokens, hidden_size = hidden_states.shape
+        # NOTE: hidden_states can have either 1D or 2D shape.
+        orig_shape = hidden_states.shape
         hidden_states = hidden_states.view(-1, self.hidden_size)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
         final_hidden_states = self.experts(hidden_states, router_logits)
-        return final_hidden_states.view(num_tokens, hidden_size)
+        return final_hidden_states.view(orig_shape)
 
 
 class MixtralAttention(nn.Module):
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index ccaa6f208..7b18b5e04 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -126,7 +126,9 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
                                                   bias=False)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        num_tokens, hidden_dim = hidden_states.shape
+        # NOTE: hidden_states can have either 1D or 2D shape.
+        orig_shape = hidden_states.shape
+        hidden_dim = hidden_states.shape[-1]
         hidden_states = hidden_states.view(-1, hidden_dim)
         shared_output = None
         if self.shared_expert is not None:
@@ -145,7 +147,7 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
             final_hidden_states = tensor_model_parallel_all_reduce(
                 final_hidden_states)
 
-        return final_hidden_states.view(num_tokens, hidden_dim)
+        return final_hidden_states.view(orig_shape)
 
 
 class Qwen2MoeAttention(nn.Module):
-- 
GitLab


From c38eba304674fdf9da4d881e46f103440e22a153 Mon Sep 17 00:00:00 2001
From: Thomas Parnell <tpa@zurich.ibm.com>
Date: Wed, 10 Jul 2024 15:04:07 +0200
Subject: [PATCH 302/376] [Bugfix] MLPSpeculator: Use ParallelLMHead in
 tie_weights=False case. (#6303)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
---
 vllm/model_executor/models/mlp_speculator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py
index 97f7ec742..d3aec06a9 100644
--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -110,7 +110,7 @@ class MLPSpeculator(nn.Module):
             ])
 
             self.head = nn.ModuleList([
-                nn.Linear(self.inner_dim, self.vocab_size, bias=False)
+                ParallelLMHead(self.vocab_size, self.inner_dim, bias=False)
                 for _ in range(self.max_speculative_tokens)
             ])
             self.ln = nn.ModuleList([
-- 
GitLab


From b422d4961a3052c5b4bcfc3747a1ad55acfe7eb8 Mon Sep 17 00:00:00 2001
From: Benjamin Muskalla <bmuskalla@github.com>
Date: Wed, 10 Jul 2024 16:15:55 +0200
Subject: [PATCH 303/376] [CI/Build] Enable mypy typing for remaining folders
 (#6268)

---
 .github/workflows/mypy.yaml | 18 ++++++++++--------
 format.sh                   | 18 +++++++++---------
 vllm/platforms/cuda.py      |  5 ++---
 3 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index 62f0dbcd9..5780f09a6 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -32,20 +32,22 @@ jobs:
         pip install types-setuptools
     - name: Mypy
       run: |
+        mypy tests --config-file pyproject.toml
+        mypy vllm/*.py --config-file pyproject.toml
         mypy vllm/attention --config-file pyproject.toml
         mypy vllm/core --config-file pyproject.toml
         mypy vllm/distributed --config-file pyproject.toml
+        mypy vllm/engine  --config-file pyproject.toml
         mypy vllm/entrypoints --config-file pyproject.toml
         mypy vllm/executor --config-file pyproject.toml
+        mypy vllm/inputs --config-file pyproject.toml
+        mypy vllm/logging --config-file pyproject.toml
+        mypy vllm/lora --config-file pyproject.toml
+        mypy vllm/model_executor  --config-file pyproject.toml
         mypy vllm/multimodal --config-file pyproject.toml
-        mypy vllm/usage --config-file pyproject.toml
-        mypy vllm/*.py --config-file pyproject.toml
+        mypy vllm/platforms --config-file pyproject.toml
+        mypy vllm/spec_decode --config-file pyproject.toml
         mypy vllm/transformers_utils --config-file pyproject.toml
-        mypy vllm/engine  --config-file pyproject.toml
+        mypy vllm/usage --config-file pyproject.toml
         mypy vllm/worker --config-file pyproject.toml
-        mypy vllm/spec_decode --config-file pyproject.toml
-        mypy vllm/model_executor  --config-file pyproject.toml
-        mypy vllm/lora --config-file pyproject.toml
-        mypy vllm/logging --config-file pyproject.toml
-        mypy tests --config-file pyproject.toml
 
diff --git a/format.sh b/format.sh
index 5edc868f9..5ad6d6f29 100755
--- a/format.sh
+++ b/format.sh
@@ -96,23 +96,23 @@ echo 'vLLM yapf: Done'
 
 # Run mypy
 echo 'vLLM mypy:'
+mypy tests --config-file pyproject.toml
+mypy vllm/*.py --config-file pyproject.toml
 mypy vllm/attention --config-file pyproject.toml
 mypy vllm/core --config-file pyproject.toml
 mypy vllm/distributed --config-file pyproject.toml
+mypy vllm/engine  --config-file pyproject.toml
 mypy vllm/entrypoints --config-file pyproject.toml
 mypy vllm/executor --config-file pyproject.toml
+mypy vllm/logging --config-file pyproject.toml
+mypy vllm/lora --config-file pyproject.toml
+mypy vllm/model_executor  --config-file pyproject.toml
 mypy vllm/multimodal --config-file pyproject.toml
-mypy vllm/usage --config-file pyproject.toml
-mypy vllm/*.py --config-file pyproject.toml
+mypy vllm/prompt_adapter --config-file pyproject.toml
+mypy vllm/spec_decode --config-file pyproject.toml
 mypy vllm/transformers_utils --config-file pyproject.toml
-mypy vllm/engine  --config-file pyproject.toml
+mypy vllm/usage --config-file pyproject.toml
 mypy vllm/worker --config-file pyproject.toml
-mypy vllm/spec_decode --config-file pyproject.toml
-mypy vllm/model_executor  --config-file pyproject.toml
-mypy vllm/lora --config-file pyproject.toml
-mypy vllm/logging --config-file pyproject.toml
-mypy vllm/prompt_adapter --config-file pyproject.toml
-mypy tests --config-file pyproject.toml
 
 
 # If git diff returns a file that is in the skip list, the file may be checked anyway:
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 2d482010c..02ba22746 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -34,11 +34,10 @@ def get_physical_device_capability(device_id: int = 0) -> Tuple[int, int]:
 def device_id_to_physical_device_id(device_id: int) -> int:
     if "CUDA_VISIBLE_DEVICES" in os.environ:
         device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
-        device_ids = [int(device_id) for device_id in device_ids]
         physical_device_id = device_ids[device_id]
+        return int(physical_device_id)
     else:
-        physical_device_id = device_id
-    return physical_device_id
+        return device_id
 
 
 class CudaPlatform(Platform):
-- 
GitLab


From 44cc76610d0b23ce5d609867f6dae7e033dee818 Mon Sep 17 00:00:00 2001
From: "sangjune.park" <park12sj@gmail.com>
Date: Thu, 11 Jul 2024 02:03:32 +0900
Subject: [PATCH 304/376] [Bugfix] Fix OpenVINOExecutor abstractmethod error
 (#6296)

Signed-off-by: sangjune.park <sangjune.park@navercorp.com>
---
 vllm/executor/openvino_executor.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py
index 697d698b4..1ef37785b 100644
--- a/vllm/executor/openvino_executor.py
+++ b/vllm/executor/openvino_executor.py
@@ -90,6 +90,22 @@ class OpenVINOExecutor(ExecutorBase):
     def list_loras(self) -> Set[int]:
         return self.driver_worker.list_loras()
 
+    def add_prompt_adapter(self, prompt_adapter_request) -> bool:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the OPENVINO backend.")
+
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the OPENVINO backend.")
+
+    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the OPENVINO backend.")
+
+    def list_prompt_adapters(self) -> Set[int]:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the OPENVINO backend.")
+
     def check_health(self) -> None:
         # OpenVINOExecutor will always be healthy as long as
         # it's running.
-- 
GitLab


From ae151d73be479e9c0caa2fdfc30b17f073018ef3 Mon Sep 17 00:00:00 2001
From: sroy745 <142070531+sroy745@users.noreply.github.com>
Date: Wed, 10 Jul 2024 16:02:47 -0700
Subject: [PATCH 305/376] [Speculative Decoding] Enabling bonus token in
 speculative decoding for KV cache based models (#5765)

---
 tests/spec_decode/test_dynamic_spec_decode.py |  11 +-
 tests/spec_decode/test_multi_step_worker.py   | 212 +++++++++++++++++-
 tests/spec_decode/test_ngram_worker.py        |   9 +-
 tests/spec_decode/test_spec_decode_worker.py  | 147 +++++++++++-
 vllm/sequence.py                              |  18 +-
 vllm/spec_decode/interfaces.py                |   5 +-
 vllm/spec_decode/medusa_worker.py             |   8 +-
 vllm/spec_decode/mlp_speculator_worker.py     |   5 +-
 vllm/spec_decode/multi_step_worker.py         | 206 ++++++++++++++---
 vllm/spec_decode/ngram_worker.py              |  12 +-
 vllm/spec_decode/proposer_worker_base.py      |   9 +-
 .../spec_decode/smaller_tp_proposer_worker.py |  11 +-
 vllm/spec_decode/spec_decode_worker.py        |  67 ++++--
 vllm/spec_decode/top1_proposer.py             |   5 +-
 14 files changed, 645 insertions(+), 80 deletions(-)

diff --git a/tests/spec_decode/test_dynamic_spec_decode.py b/tests/spec_decode/test_dynamic_spec_decode.py
index 29ed96999..1f3219593 100644
--- a/tests/spec_decode/test_dynamic_spec_decode.py
+++ b/tests/spec_decode/test_dynamic_spec_decode.py
@@ -70,14 +70,17 @@ def test_disable_spec_tokens(queue_size: int, batch_size: int, k: int,
     if queue_size < disable_by_batch_size:
         # Should raise exception when executing the mocked draft model.
         with pytest.raises(ValueError, match=exception_secret):
-            proposer.get_spec_proposals(execute_model_req=ExecuteModelRequest(
-                seq_group_metadata_list=seq_group_metadata_list,
-                num_lookahead_slots=k), )
+            proposer.get_spec_proposals(
+                execute_model_req=ExecuteModelRequest(
+                    seq_group_metadata_list=seq_group_metadata_list,
+                    num_lookahead_slots=k),
+                seq_ids_with_bonus_token_in_last_step=set())
     else:
         # Should not execute the draft model because spec decode is disabled
         # for all requests. Accordingly, the proposal length should be 0.
         proposals = proposer.get_spec_proposals(
             execute_model_req=ExecuteModelRequest(
                 seq_group_metadata_list=seq_group_metadata_list,
-                num_lookahead_slots=k), )
+                num_lookahead_slots=k),
+            seq_ids_with_bonus_token_in_last_step=set())
         assert proposals.proposal_lens.tolist() == [0] * batch_size
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py
index 7744b2640..9832d4f26 100644
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -118,7 +118,8 @@ def test_same_output_for_single_step():
     actual_output, _ = multi_step_worker.sampler_output(
         execute_model_req=ExecuteModelRequest(
             seq_group_metadata_list=multi_step_seq_group),
-        sample_len=num_steps)
+        sample_len=num_steps,
+        seq_ids_with_bonus_token_in_last_step=set())
     assert len(actual_output) == num_steps
     actual_output = actual_output[0]
 
@@ -210,7 +211,8 @@ def test_same_output_for_multi_step():
     multi_step_output, _ = multi_step_worker.sampler_output(
         execute_model_req=ExecuteModelRequest(
             seq_group_metadata_list=seq_group_metadata_list),
-        sample_len=num_steps)
+        sample_len=num_steps,
+        seq_ids_with_bonus_token_in_last_step=set())
 
     # Run single-step repeatedly.
     zero_kv_cache(worker.cache_engine)
@@ -277,6 +279,203 @@ def test_same_output_for_multi_step():
                                       single_step_logprobs)
 
 
+@torch.inference_mode()
+def test_multi_step_with_batch_expansion_correct_output():
+    """
+    In this test we verify that the MultiStepWorker is able to handle bonus
+    tokens correctly. The test verifies that if a sequence has a
+    bonus token then the MultiStepWorker is able to expand the batch by adding
+    new sequences corresponding to the sequences with bonus tokens. The
+    expanded batch is then used for predicting the next tokens.
+    """
+    seed = 100
+    model_name = 'JackFram/llama-68m'
+
+    block_size = 16
+    num_gpu_blocks = 2048 // block_size
+    batch_size = 128
+    multi_step_worker = create_worker(
+        MultiStepWorker,
+        model_name,
+        block_size,
+        num_gpu_blocks,
+        seed,
+        model_runner_cls=TP1DraftModelRunner,
+    )
+    worker = create_worker(
+        Worker,
+        model_name,
+        block_size,
+        num_gpu_blocks,
+        seed,
+    )
+    random.seed(seed)
+    prompts = [[0] for _ in range(batch_size)]
+    num_steps = 2
+    final_prompt_lens = [(num_steps + 1) for prompt in prompts]
+    rand_seeds = list(random.randint(0, 100) for _ in range(num_steps))
+    multi_step_worker.execute_model = patch_execute_model_with_seeds(
+        multi_step_worker, rand_seeds)
+    worker.execute_model = patch_execute_model_with_seeds(worker, rand_seeds)
+    # Create the test continuations
+    continuations = [[random.randint(0, 1000)] for _ in prompts]
+    seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+        prompts,
+        num_gpu_blocks,
+        block_size,
+        continuations=continuations,
+        final_prompt_lens=final_prompt_lens)
+
+    # Run single-step twice to generate 2 tokens. This
+    # will simulate the bonus token case with the second token
+    # being the bonus token.
+    zero_kv_cache(worker.cache_engine)
+    single_step_output: List[SamplerOutput] = []
+    set_random_seed(seed)
+    for _ in range(num_steps):
+        seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+            prompts,
+            num_gpu_blocks,
+            block_size,
+            continuations=continuations,
+            final_prompt_lens=final_prompt_lens)
+        single_step_output.extend(
+            worker.execute_model(execute_model_req=ExecuteModelRequest(
+                seq_group_metadata_list=seq_group_metadata_list)))
+        # Append output tokens to new sequence data.
+        for i, seq_group_output in enumerate(single_step_output[-1]):
+            continuations[i].append(seq_group_output.samples[0].output_token)
+
+    # Create continuations for the MultiStepWorker. The continuations have
+    # 2 tokens in order to simulate the bonus token case.
+    multi_step_continuations = []
+    for continuation in continuations:
+        multi_step_continuations.append(continuation[:2])
+    seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+        prompts,
+        num_gpu_blocks,
+        block_size,
+        continuations=multi_step_continuations,
+        final_prompt_lens=final_prompt_lens)
+
+    # Run multi-step and verify that the third token prediction is accurate
+    # for all sequences.
+    zero_kv_cache(multi_step_worker.cache_engine)
+    all_seq_ids = {i for i in range(batch_size)}
+    multi_step_output, _ = multi_step_worker.sampler_output(
+        execute_model_req=ExecuteModelRequest(
+            seq_group_metadata_list=seq_group_metadata_list),
+        sample_len=1,
+        seq_ids_with_bonus_token_in_last_step=all_seq_ids)
+    for index, output in enumerate(multi_step_output[-1].outputs):
+        assert (continuations[index][-1] == output.samples[0].output_token)
+
+
+@torch.inference_mode()
+def test_multi_step_with_batch_expansion_incorrect_output():
+    """
+    Tests the MultiStepWorker's ability to handle batch expansion with bonus
+    tokens in a negative case scenario. This test provides the MultiStepWorker
+    with a batch containing sequences with bonus tokens but specifies the
+    sequence IDs with bonus tokens incorrectly. The test verifies that the
+    MultiStepWorker generates correct tokens for the sequences where the
+    sequence ID is specified correctly and incorrect tokens for those where
+    the sequence ID is specified incorrectly.
+    """
+    seed = 100
+    model_name = 'JackFram/llama-68m'
+
+    block_size = 16
+    num_gpu_blocks = 2048 // block_size
+    batch_size = 128
+    multi_step_worker = create_worker(
+        MultiStepWorker,
+        model_name,
+        block_size,
+        num_gpu_blocks,
+        seed,
+        model_runner_cls=TP1DraftModelRunner,
+    )
+    worker = create_worker(
+        Worker,
+        model_name,
+        block_size,
+        num_gpu_blocks,
+        seed,
+    )
+    random.seed(seed)
+    prompts = [[0] for _ in range(batch_size)]
+    num_steps = 2
+    final_prompt_lens = [(num_steps + 1) for prompt in prompts]
+    rand_seeds = list(random.randint(0, 100) for _ in range(num_steps))
+    multi_step_worker.execute_model = patch_execute_model_with_seeds(
+        multi_step_worker, rand_seeds)
+    worker.execute_model = patch_execute_model_with_seeds(worker, rand_seeds)
+    # Create the test continuations
+    continuations = [[random.randint(0, 1000)] for _ in prompts]
+    seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+        prompts,
+        num_gpu_blocks,
+        block_size,
+        continuations=continuations,
+        final_prompt_lens=final_prompt_lens)
+    # Run single-step twice to generate 2 tokens. This
+    # will simulate the bonus token case with the second token
+    # being the bonus token.
+    zero_kv_cache(worker.cache_engine)
+    single_step_output: List[SamplerOutput] = []
+    set_random_seed(seed)
+    for _ in range(num_steps):
+        seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+            prompts,
+            num_gpu_blocks,
+            block_size,
+            continuations=continuations,
+            final_prompt_lens=final_prompt_lens)
+        single_step_output.extend(
+            worker.execute_model(execute_model_req=ExecuteModelRequest(
+                seq_group_metadata_list=seq_group_metadata_list)))
+        # Append output tokens to new sequence data.
+        for i, seq_group_output in enumerate(single_step_output[-1]):
+            continuations[i].append(seq_group_output.samples[0].output_token)
+
+    # Create continuations for the MultiStepWorker. The continuations have
+    # 2 tokens in order to simulate the bonus token case.
+    multi_step_continuations = []
+    for continuation in continuations:
+        multi_step_continuations.append(continuation[:2])
+    seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+        prompts,
+        num_gpu_blocks,
+        block_size,
+        continuations=multi_step_continuations,
+        final_prompt_lens=final_prompt_lens)
+
+    # Run multi-step. In this run INCORRECTLY specify that only the odd number
+    # sequences have bonus tokens. Verify that with this setting the third token
+    # prediction is accurate only for the odd numbered sequences. Also verify
+    # that the prediction might be wrong for some of the even numbered
+    # sequences.
+    zero_kv_cache(multi_step_worker.cache_engine)
+    set_random_seed(seed)
+    odd_seq_ids = {i for i in range(batch_size) if i % 2 != 0}
+    multi_step_output, _ = multi_step_worker.sampler_output(
+        execute_model_req=ExecuteModelRequest(
+            seq_group_metadata_list=seq_group_metadata_list),
+        sample_len=1,
+        seq_ids_with_bonus_token_in_last_step=odd_seq_ids)
+    num_mismatch = 0
+    for index, output in enumerate(multi_step_output[-1].outputs):
+        if (index % 2) != 0:
+            assert (continuations[index][-1] == output.samples[0].output_token)
+        elif (continuations[index][-1] != output.samples[0].output_token):
+            num_mismatch += 1
+    # The prediction is accurate for some of the sequences even without proper
+    # handling of the bonus tokens. Hence verify that the number of sequences
+    # for which there is a mismatch is > 0.
+    assert (num_mismatch > 0)
+
+
 @torch.inference_mode()
 def test_draft_proposals_full_speculation_len():
     """Verify Top1Proposer correctly handles case where all sequences
@@ -318,7 +517,8 @@ def test_draft_proposals_full_speculation_len():
     proposals = proposer.get_spec_proposals(
         execute_model_req=ExecuteModelRequest(
             seq_group_metadata_list=seq_group_metadata_list,
-            num_lookahead_slots=k), )
+            num_lookahead_slots=k),
+        seq_ids_with_bonus_token_in_last_step=set())
 
     assert torch.is_tensor(proposals.proposal_token_ids)
     assert torch.is_tensor(proposals.proposal_probs)
@@ -356,7 +556,8 @@ def test_draft_proposals_no_speculations():
     proposals = proposer.get_spec_proposals(
         execute_model_req=ExecuteModelRequest(
             seq_group_metadata_list=seq_group_metadata_list,
-            num_lookahead_slots=k), )
+            num_lookahead_slots=k),
+        seq_ids_with_bonus_token_in_last_step=set())
 
     assert torch.is_tensor(proposals.proposal_token_ids)
     assert torch.is_tensor(proposals.proposal_probs)
@@ -428,7 +629,8 @@ def test_draft_proposals_mixed_k():
     proposals = proposer.get_spec_proposals(
         execute_model_req=ExecuteModelRequest(
             seq_group_metadata_list=seq_group_metadata_list,
-            num_lookahead_slots=k), )
+            num_lookahead_slots=k),
+        seq_ids_with_bonus_token_in_last_step=set())
 
     assert torch.is_tensor(proposals.proposal_token_ids)
     assert torch.is_tensor(proposals.proposal_probs)
diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py
index b1537884f..3995f8789 100644
--- a/tests/spec_decode/test_ngram_worker.py
+++ b/tests/spec_decode/test_ngram_worker.py
@@ -53,7 +53,8 @@ def test_ngram_algo_correctness_for_single_no_match():
     proposals = proposer.get_spec_proposals(
         execute_model_req=ExecuteModelRequest(
             seq_group_metadata_list=seq_group_metadata_list,
-            num_lookahead_slots=proposal_len), )
+            num_lookahead_slots=proposal_len),
+        seq_ids_with_bonus_token_in_last_step=None)
 
     assert torch.is_tensor(proposals.proposal_token_ids)
     assert torch.is_tensor(proposals.proposal_probs)
@@ -121,7 +122,8 @@ def test_ngram_algo_correctness_for_batches_not_match_all():
     proposals = proposer.get_spec_proposals(
         execute_model_req=ExecuteModelRequest(
             seq_group_metadata_list=seq_group_metadata_list,
-            num_lookahead_slots=proposal_len), )
+            num_lookahead_slots=proposal_len),
+        seq_ids_with_bonus_token_in_last_step=None)
 
     assert torch.is_tensor(proposals.proposal_token_ids)
     assert torch.is_tensor(proposals.proposal_probs)
@@ -193,7 +195,8 @@ def test_ngram_algo_correctness_for_batches_match_all():
     proposals = proposer.get_spec_proposals(
         execute_model_req=ExecuteModelRequest(
             seq_group_metadata_list=seq_group_metadata_list,
-            num_lookahead_slots=proposal_len), )
+            num_lookahead_slots=proposal_len),
+        seq_ids_with_bonus_token_in_last_step=None)
 
     assert torch.is_tensor(proposals.proposal_token_ids)
     assert torch.is_tensor(proposals.proposal_probs)
diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py
index 527e7eddd..0baac3204 100644
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -1,6 +1,7 @@
 import random
+from collections import defaultdict
 from types import SimpleNamespace
-from typing import Dict, List
+from typing import Dict, List, Set
 from unittest.mock import MagicMock
 
 import pytest
@@ -377,8 +378,10 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool,
 
     set_random_seed(1)
 
-    worker = SpecDecodeWorker(draft_worker, target_worker, spec_decode_sampler,
-                              metrics_collector)
+    worker = SpecDecodeWorker(draft_worker,
+                              target_worker,
+                              spec_decode_sampler,
+                              metrics_collector=metrics_collector)
     worker.init_device()
 
     proposal_token_ids = torch.randint(low=0,
@@ -554,7 +557,6 @@ def test_init_device(acceptance_sampler_method: str):
 
     worker = SpecDecodeWorker(draft_worker, target_worker, spec_decode_sampler,
                               metrics_collector)
-
     worker.init_device()
 
     draft_worker.init_device.assert_called_once()
@@ -645,3 +647,140 @@ def test_split_num_cache_blocks_evenly(available_gpu_blocks: int,
     assert (num_blocks * target_cache_block_size_bytes) + (
         num_blocks * draft_kv_size_bytes) <= (available_gpu_blocks *
                                               target_cache_block_size_bytes)
+
+
+@torch.inference_mode()
+def test_populate_seq_ids_with_bonus_tokens():
+    """
+    Verify that a call to _create_output_sampler_list correctly updates
+    seq_with_bonus_token_in_last_step.
+
+    seq_with_bonus_token_in_last_step is an internal data structure in
+    SpecDecodeWorker that tracks the sequence IDs which are assigned bonus
+    tokens by the target model in their last forward pass. This state is
+    maintained only for models relying on the KV cache, such as those using
+    the MultiStepWorker.
+    """
+    batch_size = 10
+    k = 5
+    vocab_size = 10000
+    num_sequences_with_bonus_tokens = 5
+    target_worker = mock_worker(vocab_size=vocab_size, use_spec=False)
+    metrics_collector = MagicMock(spec=AsyncMetricsCollector)
+    target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)]
+    target_worker.device = 'cuda'
+
+    set_random_seed(1)
+    draft_worker = mock_worker(cls=MultiStepWorker)
+    draft_worker.device = 'cuda'
+    # The sequence_ids attached to each sequence in the batch.
+    # The sequence at index i has seq_id assigned_seq_ids[i]
+    assigned_seq_ids = list(range(batch_size))
+    seq_group_metadata_list, _, _ = create_batch(batch_size,
+                                                 k,
+                                                 seq_ids=assigned_seq_ids,
+                                                 prev_output_token_len=10)
+    target_token_logprobs = torch.rand(batch_size, (k + 1),
+                                       vocab_size,
+                                       dtype=torch.float32,
+                                       device='cuda')
+    accepted_token_ids = torch.randint(low=0,
+                                       high=vocab_size,
+                                       size=(batch_size, (k + 1)),
+                                       dtype=torch.int64,
+                                       device='cuda')
+    expected_request_id_seq_ids_mapping: Dict[str, Set[int]] = defaultdict(set)
+    for seq_group_metadata in seq_group_metadata_list:
+        for seq_id in seq_group_metadata.seq_data:
+            expected_request_id_seq_ids_mapping[
+                seq_group_metadata.request_id].add(seq_id)
+    # Generate a random sample of sequence indexes with bonus tokens
+    seq_indexes_with_bonus_tokens = random.sample(
+        range(batch_size), num_sequences_with_bonus_tokens)
+    # Create a mask that is True for indices in seq_indexes_with_bonus_tokens
+    mask = torch.ones(batch_size, dtype=torch.bool, device='cuda')
+    mask[seq_indexes_with_bonus_tokens] = False
+    # Set the last token ID to -1 for all indices not in
+    # seq_indexes_with_bonus_tokens to indicate the lack of bonus token in
+    # those indices.
+    accepted_token_ids[mask, -1:] = -1
+    worker = SpecDecodeWorker(draft_worker,
+                              target_worker,
+                              mock_spec_decode_sampler("rejection_sampler"),
+                              metrics_collector=metrics_collector)
+    # Initialize _seq_with_bonus_token_in_last_step with a set of sequence IDs.
+    # This set includes all sequence IDs in the batch as well as an additional
+    # `num_extra_sequence_ids` sequence IDs. Note that the sequence IDs are in
+    # the range [0, batch_size + num_extra_sequence_ids).
+    num_extra_sequence_ids = 10
+    worker._seq_with_bonus_token_in_last_step = set(
+        range(batch_size + num_extra_sequence_ids))
+    worker._create_output_sampler_list(
+        seq_group_metadata_list=seq_group_metadata_list,
+        accepted_token_ids=accepted_token_ids,
+        target_logprobs=target_token_logprobs,
+        k=k)
+    # Verify that _seq_with_bonus_token_in_last_step contains the following:
+    # 1. Sequence IDs that were already present in
+    #    _seq_with_bonus_token_in_last_step but were not part of the current
+    #    batch are retained.
+    # 2. Of the sequence IDs present in the current batch, only those with a
+    #    bonus token are retained in _seq_with_bonus_token_in_last_step.
+    #    Sequence IDs that are present in the current batch but do not have
+    #    bonus tokens are removed from _seq_with_bonus_token_in_last_step.
+    expected_seq_ids_with_bonus_tokens = \
+        set([assigned_seq_ids[i] for i in seq_indexes_with_bonus_tokens])
+    additional_sequence_ids = \
+        set(range(batch_size, batch_size + num_extra_sequence_ids))
+    assert worker._seq_with_bonus_token_in_last_step == \
+        expected_seq_ids_with_bonus_tokens.union(additional_sequence_ids)
+    assert worker._request_id_seq_id_mapping == \
+        expected_request_id_seq_ids_mapping
+
+
+@torch.inference_mode()
+def test_handle_finished_requests():
+    """
+    Test to verify that finished request IDs are appropriately processed to 
+    update the internal state of the SpecDecodeWorker.
+
+    This test initializes the SpecDecodeWorker with mock data, marks certain 
+    requests as finished, and ensures that the corresponding sequence IDs are 
+    correctly removed from the internal mappings.
+    """
+    batch_size = 32
+    k = 3
+    draft_worker = mock_worker(cls=MultiStepWorker)
+    target_worker = mock_worker()
+    metrics_collector = MagicMock(spec=AsyncMetricsCollector)
+    worker = SpecDecodeWorker(draft_worker, target_worker,
+                              mock_spec_decode_sampler("rejection_sampler"),
+                              metrics_collector)
+    # Initialize the request_id_seq_id_mapping mapping dict with a few fake
+    # request ids and corresponding sequence ids.
+    worker._request_id_seq_id_mapping = \
+        {'request-1': {1,2,3}, 'request-2': {4,5,6,7},
+        'request-3': {8,9}, 'request-4': {10,11}}
+    # Initialize seq_with_bonus_token_in_last_step with a few fake
+    # sequence ids.
+    worker._seq_with_bonus_token_in_last_step = {1, 4, 5, 8, 9, 10}
+    exception_secret = 'artificial stop'
+    draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret)
+
+    seq_group_metadata_list, _, _ = create_batch(batch_size, k)
+    # Mark requests with ids request-1 and request-3 as finished.
+    execute_model_req = ExecuteModelRequest(
+        seq_group_metadata_list=seq_group_metadata_list,
+        num_lookahead_slots=k,
+        finished_requests_ids=['request-1', 'request-3'])
+
+    with pytest.raises(ValueError, match=exception_secret):
+        worker.execute_model(execute_model_req=execute_model_req)
+    # Verify that request-1 and request-3 are removed from
+    # request_id_seq_id_mapping
+    assert worker._request_id_seq_id_mapping == \
+        {'request-2': {4,5,6,7}, 'request-4': {10,11}}
+    # Verify that all sequence ids corresponding to 'request-1'
+    # and 'request-3' are removed from seq_with_bonus_token_in_last_step.
+    assert worker._seq_with_bonus_token_in_last_step == \
+        {4,5,10}
diff --git a/vllm/sequence.py b/vllm/sequence.py
index a3f998b94..1cebf68d4 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -3,8 +3,9 @@ import copy
 import enum
 import math
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
 
 import torch
 
@@ -916,6 +917,21 @@ def get_all_seq_ids(
     return [seq_id for sg in seq_group_metadata_list for seq_id in sg.seq_data]
 
 
+def get_all_seq_ids_and_request_ids(
+    seq_group_metadata_list: List[SequenceGroupMetadata]
+) -> Tuple[List[int], Dict[str, Set[int]]]:
+    """Given a list of SequenceGroupMetadata, create a list of all
+    sequence ids.
+    """
+    seq_ids: List[int] = []
+    request_id_seq_ids_mapping: Dict[str, Set[int]] = defaultdict(set)
+    for sg in seq_group_metadata_list:
+        for seq_id in sg.seq_data:
+            seq_ids.append(seq_id)
+            request_id_seq_ids_mapping[sg.request_id].add(seq_id)
+    return seq_ids, request_id_seq_ids_mapping
+
+
 class HiddenStates:
     """Hidden states corresponding to in-progress sequences.
     Used in speculative decoding to pass hidden states from
diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py
index d236fc0f2..d109d8edc 100644
--- a/vllm/spec_decode/interfaces.py
+++ b/vllm/spec_decode/interfaces.py
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Optional
+from typing import Optional, Set
 
 import torch
 
@@ -62,6 +62,9 @@ class SpeculativeProposer(ABC):
     def get_spec_proposals(
         self,
         execute_model_req: ExecuteModelRequest,
+        # If set, this contains all sequence IDs that were assigned
+        # bonus tokens in their last forward pass.
+        seq_ids_with_bonus_token_in_last_step: Set[int],
     ) -> SpeculativeProposals:
         raise NotImplementedError
 
diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py
index b72740fc3..041ce41e9 100644
--- a/vllm/spec_decode/medusa_worker.py
+++ b/vllm/spec_decode/medusa_worker.py
@@ -1,5 +1,5 @@
 import weakref
-from typing import List, Optional, Tuple
+from typing import List, Optional, Set, Tuple
 
 import torch
 
@@ -40,6 +40,8 @@ class MedusaWorker(NonLLMProposerWorkerBase, Worker):
         self,
         execute_model_req: ExecuteModelRequest,
         sample_len: int,
+        # Unused parameter.
+        seq_ids_with_bonus_token_in_last_step: Set[int],
     ) -> Tuple[List[SamplerOutput], bool]:
         """Run the model forward pass to generate sample_len future tokens.
         Returns the list of sampler output, one per layer, along with indicator
@@ -97,12 +99,14 @@ class MedusaWorker(NonLLMProposerWorkerBase, Worker):
     def get_spec_proposals(
         self,
         execute_model_req: ExecuteModelRequest,
+        seq_ids_with_bonus_token_in_last_step: Set[int],
     ) -> SpeculativeProposals:
         """Produce speculations given an input batch of sequences. The number of
         speculative tokens per sequence is determined by max_proposal_len.
         """
 
-        return self._proposer.get_spec_proposals(execute_model_req)
+        return self._proposer.get_spec_proposals(
+            execute_model_req, seq_ids_with_bonus_token_in_last_step)
 
     def _raise_if_unsupported(
         self,
diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py
index 6c1c8da57..308573348 100644
--- a/vllm/spec_decode/mlp_speculator_worker.py
+++ b/vllm/spec_decode/mlp_speculator_worker.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Tuple
+from typing import List, Optional, Set, Tuple
 
 import torch
 
@@ -20,6 +20,9 @@ class MLPSpeculatorWorker(NonLLMProposerWorkerBase, MultiStepWorker):
         self,
         execute_model_req: ExecuteModelRequest,
         sample_len: int,
+        # Unused parameter. MLPSpeculatorWorker does not use the KV Cache and
+        # therefore does not need this parameter.
+        seq_ids_with_bonus_token_in_last_step: Set[int],
     ) -> Tuple[List[SamplerOutput], bool]:
         """Run the model forward pass to generate sample_len future tokens.
         Returns the list of sampler output, one per layer, along with indicator
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index c1a02e1d3..09a77f9e8 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -1,6 +1,6 @@
 import copy
 import weakref
-from typing import Dict, List, Tuple
+from typing import Dict, List, Set, Tuple
 
 import torch
 
@@ -51,6 +51,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase):
         self,
         execute_model_req: ExecuteModelRequest,
         sample_len: int,
+        seq_ids_with_bonus_token_in_last_step: Set[int],
     ) -> Tuple[List[SamplerOutput], bool]:
         """Run the model forward pass sample_len times. Returns the list of
         sampler output, one per model forward pass, along with indicator of
@@ -60,44 +61,142 @@ class MultiStepWorker(Worker, ProposerWorkerBase):
         For multi step worker, this indicator shall be True.
         """
         self._raise_if_unsupported(execute_model_req)
-
-        # Shallow copy input data so modifications (such as appending tokens)
-        # do not cause side-effects.
-        copied_seq_group_metadata_list = self._shallow_copy_inputs(
-            execute_model_req.seq_group_metadata_list)
-        copied_execute_model_req = execute_model_req.clone(
-            copied_seq_group_metadata_list)
-
+        # Expand the batch for sequences with a bonus token.
+        # Perform a forward pass on the expanded batch and filter the
+        # response to retain only the original sequences' responses.
+        expanded_request, indices_of_seq_with_bonus_tokens =\
+            self._expand_execute_model_request(
+                execute_model_req, seq_ids_with_bonus_token_in_last_step)
         # Run model sample_len times.
         model_outputs: List[SamplerOutput] = []
         if isinstance(self.model_runner, TP1DraftModelRunner):
-            copied_execute_model_req.num_steps = sample_len
+            expanded_request.num_steps = sample_len
             model_outputs = self.execute_model(
-                execute_model_req=copied_execute_model_req)
+                execute_model_req=expanded_request)
         else:
             # TODO: Remove this branch once DraftModelRunner supports TP>1.
             for _ in range(sample_len):
                 model_output: List[SamplerOutput] = super().execute_model(
-                    execute_model_req=copied_execute_model_req)
+                    execute_model_req=expanded_request)
                 assert (len(model_output) == 1
                         ), "composing multistep workers not supported"
                 model_output = model_output[0]
 
-                self._append_new_tokens(model_output,
-                                        copied_seq_group_metadata_list)
+                self._append_new_tokens(
+                    model_output, expanded_request.seq_group_metadata_list)
                 model_outputs.append(model_output)
 
-        return model_outputs, True
+        filtered_model_outputs = self._filter_model_output(
+            model_outputs, indices_of_seq_with_bonus_tokens)
+        return filtered_model_outputs, True
+
+    @staticmethod
+    def _expand_execute_model_request(
+        execute_model_req: ExecuteModelRequest,
+        seq_with_bonus_token_in_last_step: set,
+    ) -> Tuple[ExecuteModelRequest, List[int]]:
+        """
+        Expands the execute model request based on sequences with bonus
+        tokens.
+
+        For each sequence with a bonus token, this method creates a new
+        sequence without the bonus token and adds it to the execute model
+        request. The original sequence groups are also retained. The indices
+        of the original sequence groups are returned for further processing.
+
+        Args:
+            execute_model_req (ExecuteModelRequest): The original execute
+            model request.
+            seq_with_bonus_token_in_last_step (set): Set of sequence IDs that 
+            contain bonus tokens.
+
+        Returns:
+            Tuple[ExecuteModelRequest, List[int]]: The updated execute model
+            request with expanded sequences and a list of indices corresponding
+            to the original sequence groups.
+        """
+        updated_seq_group_metadata_list: List[SequenceGroupMetadata] = []
+        updated_execute_model_req = execute_model_req.clone(
+            updated_seq_group_metadata_list)
+        indices_of_original_sequence_groups = []
+        for seq_group in execute_model_req.seq_group_metadata_list:
+            seq_group_has_bonus_tokens = False
+            for seq_id, _ in seq_group.seq_data.items():
+                # Identify sequences with bonus tokens in the sequence group.
+                if seq_id in seq_with_bonus_token_in_last_step:
+                    seq_group_has_bonus_tokens = True
+                    break
+            if seq_group_has_bonus_tokens:
+                #Create new sequences without the last bonus token. These new
+                # sequence have the same sequence id as the original sequence.
+                # We create a new sequence group and add them there.
+                updated_seq_group_without_bonus_token  = \
+                    MultiStepWorker._copy_seq_metadata_excluding_last_token(
+                        seq_group, seq_with_bonus_token_in_last_step)
+                updated_seq_group_metadata_list.append(
+                    updated_seq_group_without_bonus_token)
+            # Add the original sequence group.
+            updated_seq_group_metadata_list.append(
+                MultiStepWorker._shallow_copy_seq_group_metadata(seq_group))
+            # Record the index of the original sequence group.
+            indices_of_original_sequence_groups.append(
+                len(updated_seq_group_metadata_list) - 1)
+
+        updated_execute_model_req.seq_group_metadata_list =\
+            updated_seq_group_metadata_list
+        return updated_execute_model_req, indices_of_original_sequence_groups
+
+    @staticmethod
+    def _filter_model_output(
+            expanded_batch_outputs: List[SamplerOutput],
+            output_indices_to_retain: List[int]) -> List[SamplerOutput]:
+        """
+        Filters the model output to include only the specified sequence
+        outputs. This method contracts the expanded batch output from the
+        model to retain the outputs of only those sequences indicated by the
+        provided indices.
+
+        Args:
+            expanded_batch_output (List[SamplerOutput]): The expanded output
+                batch from the model.
+            output_indices_to_retain (List[int]): Indices of the model outputs
+                to retain.
+
+        Returns:
+            List[SamplerOutput]: A list containing the filtered model 
+            outputs for the specified indices.
+        """
+        return [
+            SamplerOutput(
+                outputs=[
+                    expanded_batch_output.outputs[i]
+                    for i in output_indices_to_retain
+                ],
+                sampled_token_probs=(
+                    expanded_batch_output.
+                    sampled_token_probs[output_indices_to_retain]
+                    if expanded_batch_output.sampled_token_probs is not None
+                    else None),
+                logprobs=(
+                    expanded_batch_output.logprobs[output_indices_to_retain]
+                    if expanded_batch_output.logprobs is not None else None),
+                sampled_token_ids=(expanded_batch_output.
+                                   sampled_token_ids[output_indices_to_retain]
+                                   if expanded_batch_output.sampled_token_ids
+                                   is not None else None))
+            for expanded_batch_output in expanded_batch_outputs
+        ]
 
     def get_spec_proposals(
         self,
         execute_model_req: ExecuteModelRequest,
+        seq_ids_with_bonus_token_in_last_step: set,
     ) -> SpeculativeProposals:
         """Produce speculations given an input batch of sequences. The number of
         speculative tokens per sequence is determined by max_proposal_len.
         """
-
-        return self._proposer.get_spec_proposals(execute_model_req)
+        return self._proposer.get_spec_proposals(
+            execute_model_req, seq_ids_with_bonus_token_in_last_step)
 
     @staticmethod
     def _append_new_tokens(
@@ -123,9 +222,8 @@ class MultiStepWorker(Worker, ProposerWorkerBase):
                 seq.update_num_computed_tokens(1)
 
     @staticmethod
-    def _shallow_copy_inputs(
-        seq_group_metadata_list: List[SequenceGroupMetadata]
-    ) -> List[SequenceGroupMetadata]:
+    def _shallow_copy_seq_group_metadata(
+        seq_group_metadata: SequenceGroupMetadata, ) -> SequenceGroupMetadata:
         """Copy input data structures to remove side-effects when input data
         structures are shared with other modules.
 
@@ -133,26 +231,62 @@ class MultiStepWorker(Worker, ProposerWorkerBase):
         The alternative is deep-copying (or other form of deep copy); this has
         performance downsides.
         """
-
-        # Shallow-copy the list of SequenceGroupMetadata. This allows us to
+        # Shallow-copy the SequenceGroupMetadata. This allows us to
         # append tokens and change is_prompt without external side-effects.
-        new_seq_group_metadata_list: List[SequenceGroupMetadata] = []
+        # We must shallow-copy seq_group_metadata as is_prompt could change.
+        new_seq_group_metadata = copy.copy(seq_group_metadata)
 
-        for old_seq_group_metadata in seq_group_metadata_list:
-            # We must shallow-copy seq_group_metadata as is_prompt could change.
-            seq_group_metadata = copy.copy(old_seq_group_metadata)
-            new_seq_group_metadata_list.append(seq_group_metadata)
-
-            # We must shallow-copy seq_data as we will append token ids
-            new_seq_data: Dict[int, SequenceData] = {}
-            for seq_id, old_seq_data in seq_group_metadata.seq_data.items():
-                new_seq_data[seq_id] = copy.copy(old_seq_data)
-                new_seq_data[
-                    seq_id].output_token_ids = old_seq_data.output_token_ids[:]
+        # We must shallow-copy seq_data as we will append token ids
+        new_seq_data: Dict[int, SequenceData] = {}
+        for seq_id, old_seq_data in seq_group_metadata.seq_data.items():
+            new_seq_data[seq_id] = copy.copy(old_seq_data)
+            new_seq_data[seq_id].output_token_ids =\
+                old_seq_data.output_token_ids[:]
 
-            seq_group_metadata.seq_data = new_seq_data
+        new_seq_group_metadata.seq_data = new_seq_data
+        return new_seq_group_metadata
 
-        return new_seq_group_metadata_list
+    @staticmethod
+    def _copy_seq_metadata_excluding_last_token(
+        seq_group_metadata: SequenceGroupMetadata,
+        seq_ids_to_copy: Set[int],
+    ) -> SequenceGroupMetadata:
+        """
+        Creates a shallow copy of the given SequenceGroupMetadata, retaining
+        only the sequence IDs specified in seq_ids_to_copy. For each of these
+        sequence IDs, all output_token_ids except the last one are copied.
+        Sequence IDs not in seq_ids_to_copy are excluded from the copy.
+        
+        Parameters:
+        seq_group_metadata (SequenceGroupMetadata): The original sequence
+            group metadata.
+        seq_ids_to_copy (Set[int]): The set of sequence IDs to include in the
+            copy.
+        
+        Returns:
+        SequenceGroupMetadata: A shallow copy of the sequence group metadata
+            with the specified modifications.
+        """
+        # Shallow-copy the SequenceGroupMetadata.
+        new_seq_group_metadata = copy.copy(seq_group_metadata)
+        # Shallow-copy seq_data and modify the output_token_ids.
+        new_seq_data: Dict[int, SequenceData] = {}
+        for seq_id, old_seq_data in seq_group_metadata.seq_data.items():
+            if (seq_id in seq_ids_to_copy):
+                new_seq_data[seq_id] = copy.copy(old_seq_data)
+                # Copy all the output token ids except the last.
+                # Also reduce num_computed_tokens by 1 since we are not
+                # including the last output token.
+                # NOTE: num_computed_tokens is not directly used by the
+                # speculative decoding workers, as it is only relevant for
+                # chunked prefill, which is disabled for speculative decoding.
+                # However, to maintain consistency in num_computed_tokens,
+                # we update it here.
+                new_seq_data[seq_id].output_token_ids =\
+                    old_seq_data.output_token_ids[:-1]
+                new_seq_data[seq_id].update_num_computed_tokens(-1)
+        new_seq_group_metadata.seq_data = new_seq_data
+        return new_seq_group_metadata
 
     def _assert_enough_kv_space(
             self, seq_group_metadata_list: List[SequenceGroupMetadata],
diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py
index 23a3e1649..07991df52 100644
--- a/vllm/spec_decode/ngram_worker.py
+++ b/vllm/spec_decode/ngram_worker.py
@@ -1,5 +1,5 @@
 import weakref
-from typing import List, Optional, Tuple
+from typing import List, Optional, Set, Tuple
 
 import torch
 
@@ -48,6 +48,9 @@ class NGramWorker(NonLLMProposerWorkerBase, LoraNotSupportedWorkerBase):
         self,
         execute_model_req: ExecuteModelRequest,
         sample_len: int,
+        # Unused parameter. NGramWorker does not use the KV Cache and
+        # therefore does not need this parameter.
+        seq_ids_with_bonus_token_in_last_step: Set[int],
     ) -> Tuple[Optional[List[Optional[SamplerOutput]]], bool]:
         """NGram match algo to pick proposal candidate. Returns the list of
         sampler output, one per SequenceGroupMetadata.
@@ -133,12 +136,15 @@ class NGramWorker(NonLLMProposerWorkerBase, LoraNotSupportedWorkerBase):
     def get_spec_proposals(
         self,
         execute_model_req: ExecuteModelRequest,
+        # Unused parameter. NGramWorker does not use the KV Cache and
+        # therefore does not need this parameter.
+        seq_ids_with_bonus_token_in_last_step: Set[int],
     ) -> SpeculativeProposals:
         """Produce speculations given an input batch of sequences. The number of
         speculative tokens per sequence is determined by max_proposal_len.
         """
-
-        return self._proposer.get_spec_proposals(execute_model_req)
+        return self._proposer.get_spec_proposals(
+            execute_model_req, seq_ids_with_bonus_token_in_last_step)
 
     def _raise_if_unsupported(
         self,
diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py
index b691659fb..fffa55712 100644
--- a/vllm/spec_decode/proposer_worker_base.py
+++ b/vllm/spec_decode/proposer_worker_base.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import List, Optional, Tuple
+from typing import List, Optional, Set, Tuple
 
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.spec_decode.interfaces import SpeculativeProposer
@@ -14,6 +14,13 @@ class ProposerWorkerBase(LoraNotSupportedWorkerBase, SpeculativeProposer):
         self,
         execute_model_req: ExecuteModelRequest,
         sample_len: int,
+        # A set containing all sequence IDs that were assigned bonus tokens
+        # in their last forward pass. This set is used to backfill the KV cache
+        # with the key-value pairs of the penultimate token in the sequences.
+        # This parameter is only used by the MultiStepWorker, which relies on
+        # the KV cache for token generation. It is not used by workers that
+        # do not utilize the KV cache.
+        seq_ids_with_bonus_token_in_last_step: Set[int]
     ) -> Tuple[Optional[List[SamplerOutput]], bool]:
         raise NotImplementedError
 
diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py
index b78e44895..0dbb924d2 100644
--- a/vllm/spec_decode/smaller_tp_proposer_worker.py
+++ b/vllm/spec_decode/smaller_tp_proposer_worker.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Tuple
+from typing import List, Optional, Set, Tuple
 
 import torch
 
@@ -110,13 +110,17 @@ class SmallerTpProposerWorker(ProposerWorkerBase):
         self,
         execute_model_req: ExecuteModelRequest,
         sample_len: int,
+        seq_ids_with_bonus_token_in_last_step: Set[int],
     ) -> Tuple[List[SamplerOutput], bool]:
         # Do not check _is_dummy, as it's always called by get_spec_proposals
-        return self._worker.sampler_output(execute_model_req, sample_len)
+        return self._worker.sampler_output(
+            execute_model_req, sample_len,
+            seq_ids_with_bonus_token_in_last_step)
 
     def get_spec_proposals(
         self,
         execute_model_req: ExecuteModelRequest,
+        seq_ids_with_bonus_token_in_last_step: Set[int],
     ) -> SpeculativeProposals:
         """Produce speculations given an input batch of sequences. The number of
         speculative tokens per sequence is determined by max_proposal_len.
@@ -125,7 +129,8 @@ class SmallerTpProposerWorker(ProposerWorkerBase):
             return SpeculativeProposals(None, None, None)
 
         with self._patch_tensor_parallel_group():
-            return self._worker.get_spec_proposals(execute_model_req)
+            return self._worker.get_spec_proposals(
+                execute_model_req, seq_ids_with_bonus_token_in_last_step)
 
     def execute_model(
         self,
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 60a7dab68..3c8e3dee4 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -1,5 +1,6 @@
+from collections import defaultdict
 from functools import cached_property
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple
 
 import torch
 
@@ -13,7 +14,7 @@ from vllm.model_executor.layers.typical_acceptance_sampler import (
     TypicalAcceptanceSampler)
 from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest,
                            HiddenStates, SamplerOutput, SequenceGroupMetadata,
-                           get_all_seq_ids)
+                           get_all_seq_ids_and_request_ids)
 from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
 from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
@@ -112,11 +113,7 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
             draft_worker_kwargs.pop("ngram_prompt_lookup_max"))
         ngram_prompt_lookup_min = (
             draft_worker_kwargs.pop("ngram_prompt_lookup_min"))
-
-        disable_bonus_tokens = True
-
         if ngram_prompt_lookup_max > 0:
-            disable_bonus_tokens = False
             proposer_worker = NGramWorker(**draft_worker_kwargs)
             proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min,
                                                   ngram_prompt_lookup_max)
@@ -128,11 +125,9 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
 
             if draft_worker_kwargs[
                     "model_config"].hf_config.model_type == "mlp_speculator":
-                disable_bonus_tokens = False
                 proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs)
             elif draft_worker_kwargs[
                     "model_config"].hf_config.model_type == "medusa":
-                disable_bonus_tokens = False
                 proposer_worker = MedusaWorker(**draft_worker_kwargs)
             else:
                 if draft_tp == 1:
@@ -149,10 +144,10 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
         spec_decode_sampler: SpecDecodeBaseSampler = None
         if draft_token_acceptance_method == "rejection_sampler":
             spec_decode_sampler = RejectionSampler(
-                disable_bonus_tokens=disable_bonus_tokens, )
+                disable_bonus_tokens=False, )
         elif draft_token_acceptance_method == "typical_acceptance_sampler":
             spec_decode_sampler = TypicalAcceptanceSampler(
-                disable_bonus_tokens=disable_bonus_tokens,
+                disable_bonus_tokens=False,
                 posterior_threshold=\
                     typical_acceptance_sampler_posterior_threshold,
                 posterior_alpha=typical_acceptance_sampler_posterior_alpha,
@@ -200,6 +195,15 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
         self._metrics = AsyncMetricsCollector(
             self.spec_decode_sampler
         ) if metrics_collector is None else metrics_collector
+        # Tracks the sequence IDs that received a bonus token ID in
+        # their last forward pass. Needed only if KV cache is being
+        # used for token generation such as in the case of MultiStepWorker.
+        self._seq_with_bonus_token_in_last_step: Set[int] = set()
+        # Tracks the currently active request ids and the sequence IDs
+        # corresponding to them
+        self._request_id_seq_id_mapping: Dict[str, Set[int]] = defaultdict(set)
+        # Tracks if the proposer worker uses the KV cache or not.
+
         self.probs_dtype = self.spec_decode_sampler.probs_dtype
         self.token_id_dtype = self.spec_decode_sampler.token_id_dtype
         # Lazy initiazliation.
@@ -307,6 +311,7 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
             broadcast_tensor_dict({}, src=0)
             return []
 
+        self._track_finished_requests(execute_model_req)
         disable_all_speculation = self._should_disable_all_speculation(
             execute_model_req)
         num_lookahead_slots = execute_model_req.num_lookahead_slots
@@ -453,7 +458,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
         self.previous_hidden_states = None
 
         # Generate proposals using draft worker.
-        proposals = self.proposer_worker.get_spec_proposals(execute_model_req)
+        proposals = self.proposer_worker.get_spec_proposals(
+            execute_model_req, self._seq_with_bonus_token_in_last_step)
 
         proposal_scores = self.scorer.score_proposals(
             execute_model_req,
@@ -585,7 +591,9 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
 
         # Get the sequence ids and num_logprobs (sampling parameter) in the
         # batch.
-        seq_ids = get_all_seq_ids(seq_group_metadata_list)
+        seq_ids, request_ids_seq_ids_mapping = get_all_seq_ids_and_request_ids(
+            seq_group_metadata_list)
+
         num_logprobs_per_seq = get_all_num_logprobs(seq_group_metadata_list)
 
         # Serialize all tensors to CPU Python lists.
@@ -608,7 +616,6 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
             for sequence_index in range(batch_size):
                 # Each sequence may have a different num_logprobs; retrieve it.
                 num_logprobs = num_logprobs_per_seq[sequence_index]
-
                 step_output_token_ids.append(
                     create_sequence_group_output(
                         token_id=accepted_token_ids_by_step[step_index]
@@ -623,18 +630,48 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
                         topk_logprobs=topk_logprobs_by_step[step_index]
                         [sequence_index][:num_logprobs],
                     ))
-
             sampler_output_list.append(
                 SamplerOutput(outputs=step_output_token_ids))
 
+        # Populate the data structures needed to keep track of sequences with
+        # bonus tokens.
+        self._track_sequences_with_bonus_tokens(seq_ids,
+                                                request_ids_seq_ids_mapping,
+                                                accepted_token_ids_by_step)
         maybe_rejsample_metrics = (
             self._metrics.maybe_collect_rejsample_metrics(k))
         if maybe_rejsample_metrics is not None:
             sampler_output_list[
                 0].spec_decode_worker_metrics = maybe_rejsample_metrics
-
         return sampler_output_list
 
+    def _track_finished_requests(self, execute_model_req: ExecuteModelRequest):
+        """
+        Removes the finished requests and their associated sequence ids from
+        internal book keeping data structures.
+        """
+        for finished_request in execute_model_req.finished_requests_ids:
+            for seq_id in self._request_id_seq_id_mapping[finished_request]:
+                self._seq_with_bonus_token_in_last_step.discard(seq_id)
+            del self._request_id_seq_id_mapping[finished_request]
+
+    def _track_sequences_with_bonus_tokens(
+            self, seq_ids: List[int],
+            request_ids_seq_ids_mapping: Dict[str, Set[int]],
+            accepted_token_ids_by_step: List[List[int]]):
+        """
+        Updates the internal data structures which keep track of sequences
+        which have been assigned bonus tokens in their last forward pass.
+        """
+        for seq_index, seq_id in enumerate(seq_ids):
+            last_token_id = accepted_token_ids_by_step[-1][seq_index]
+            if last_token_id == -1:
+                self._seq_with_bonus_token_in_last_step.discard(seq_id)
+            else:
+                self._seq_with_bonus_token_in_last_step.add(seq_id)
+        for request_id, sequences in request_ids_seq_ids_mapping.items():
+            self._request_id_seq_id_mapping[request_id].update(sequences)
+
     @cached_property
     def _vocab_size(self) -> int:
         """Get the vocab size of the model and make sure it's consistent between
diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py
index d3e280e68..7b34b5d34 100644
--- a/vllm/spec_decode/top1_proposer.py
+++ b/vllm/spec_decode/top1_proposer.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Tuple
+from typing import List, Optional, Set, Tuple
 
 import torch
 
@@ -42,6 +42,7 @@ class Top1Proposer(SpeculativeProposer):
     def get_spec_proposals(
         self,
         execute_model_req: ExecuteModelRequest,
+        seq_ids_with_bonus_token_in_last_step: Set[int],
     ) -> SpeculativeProposals:
         """Get speculative proposals given the input batch.
 
@@ -76,6 +77,8 @@ class Top1Proposer(SpeculativeProposer):
             maybe_sampler_output, transposed = self._worker.sampler_output(
                 execute_model_req=nonzero_execute_model_req,
                 sample_len=proposal_len,
+                seq_ids_with_bonus_token_in_last_step=\
+                    seq_ids_with_bonus_token_in_last_step,
             )
             (
                 proposal_lens,
-- 
GitLab


From 997df46a32f3b2c2debe3e17730895cef0d94d2a Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 10 Jul 2024 16:39:02 -0700
Subject: [PATCH 306/376] [Bugfix][Neuron] Fix soft prompt method error in
 NeuronExecutor (#6313)

---
 vllm/executor/neuron_executor.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
index 53107dada..6b2cb3e24 100644
--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -70,6 +70,22 @@ class NeuronExecutor(ExecutorBase):
     def list_loras(self) -> Set[int]:
         return self.driver_worker.list_loras()
 
+    def add_prompt_adapter(self, prompt_adapter_request) -> bool:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the Neuron backend.")
+
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the Neuron backend.")
+
+    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the Neuron backend.")
+
+    def list_prompt_adapters(self) -> Set[int]:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the Neuron backend.")
+
     def check_health(self) -> None:
         # NeuronExecutor will always be healthy as long as
         # it's running.
-- 
GitLab


From 99ded1e1c4dc00baa77beae74602ebafe4921176 Mon Sep 17 00:00:00 2001
From: daquexian <daquexian566@gmail.com>
Date: Thu, 11 Jul 2024 01:05:26 +0100
Subject: [PATCH 307/376] [Doc] Remove comments incorrectly copied from another
 project (#6286)

---
 vllm/model_executor/layers/linear.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 1dda5d374..7100fe142 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -743,7 +743,6 @@ class RowParallelLinear(LinearBase):
         param_data.copy_(loaded_weight)
 
     def forward(self, input_):
-        # Set up backprop all-reduce.
         if self.input_is_parallel:
             input_parallel = input_
         else:
-- 
GitLab


From 439c84581aaf45917c6f77805a3511f1efc052bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= <jiefu@tencent.com>
Date: Thu, 11 Jul 2024 12:15:29 +0800
Subject: [PATCH 308/376] [Doc] Update description of vLLM support for CPUs
 (#6003)

---
 README.md                                        | 2 +-
 docs/source/getting_started/cpu-installation.rst | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 3e0da945d..cced85f17 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,7 @@ vLLM is flexible and easy to use with:
 - Tensor parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support NVIDIA GPUs, AMD GPUs, Intel CPUs and GPUs
+- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs
 - (Experimental) Prefix caching support
 - (Experimental) Multi-lora support
 
diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst
index a9544e8a5..1c97515db 100644
--- a/docs/source/getting_started/cpu-installation.rst
+++ b/docs/source/getting_started/cpu-installation.rst
@@ -20,7 +20,7 @@ Requirements
 
 * OS: Linux
 * Compiler: gcc/g++>=12.3.0 (optional, recommended)
-* Instruction set architecture (ISA) requirement: AVX512 is required.
+* Instruction set architecture (ISA) requirement: AVX512 (optional, recommended)
 
 .. _cpu_backend_quick_start_dockerfile:
 
-- 
GitLab


From fc17110bbef4e78703abffac51133a2fb71e9f79 Mon Sep 17 00:00:00 2001
From: Lim Xiang Yang <xiangyang95@gmail.com>
Date: Thu, 11 Jul 2024 12:37:11 +0800
Subject: [PATCH 309/376] [BugFix]: set outlines pkg version (#6262)

---
 requirements-common.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index e874c4af4..b750f9a1b 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -18,7 +18,7 @@ prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer == 0.10.1
-outlines >= 0.0.43 # Requires torch >= 2.1.0
+outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
 typing_extensions
 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
 pyzmq
-- 
GitLab


From c4774eb8418864390341d35103aa747fc411b59c Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Thu, 11 Jul 2024 00:04:05 -0700
Subject: [PATCH 310/376] [Bugfix] Fix snapshot download in serving benchmark
 (#6318)

---
 benchmarks/backend_request_func.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index fe29c6708..fbab547d0 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -390,17 +390,17 @@ def remove_prefix(text: str, prefix: str) -> str:
     return text
 
 
-def get_model(pretrained_model_name_or_path: str):
+def get_model(pretrained_model_name_or_path: str) -> str:
     if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
         from modelscope import snapshot_download
-    else:
-        from huggingface_hub import snapshot_download
-
-    model_path = snapshot_download(
-        model_id=pretrained_model_name_or_path,
-        local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-        ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
-    return model_path
+
+        model_path = snapshot_download(
+            model_id=pretrained_model_name_or_path,
+            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+            ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
+
+        return model_path
+    return pretrained_model_name_or_path
 
 
 def get_tokenizer(
-- 
GitLab


From 3963a5335bb4106f2ecd1139527e3568d2151933 Mon Sep 17 00:00:00 2001
From: aniaan <hi@aniaan.dev>
Date: Thu, 11 Jul 2024 17:39:07 +0800
Subject: [PATCH 311/376] [Misc] refactor(config): clean up unused code (#6320)

---
 vllm/config.py                  | 6 ++----
 vllm/worker/xpu_model_runner.py | 3 ---
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 68ca81a2e..d333a042f 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -138,12 +138,10 @@ class ModelConfig:
         self.quantization = quantization
         self.quantization_param_path = quantization_param_path
         self.enforce_eager = enforce_eager
-        self.max_context_len_to_capture = max_context_len_to_capture
-        if self.max_context_len_to_capture is not None:
+        if max_context_len_to_capture is not None:
             raise ValueError("`max_context_len_to_capture` is deprecated. "
                              "Use `max_seq_len_to_capture` instead.")
-        self.max_seq_len_to_capture = (max_seq_len_to_capture
-                                       or max_context_len_to_capture)
+        self.max_seq_len_to_capture = max_seq_len_to_capture
         self.max_logprobs = max_logprobs
         self.disable_sliding_window = disable_sliding_window
         self.skip_tokenizer_init = skip_tokenizer_init
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index e03f24fdf..876abb3bf 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -109,9 +109,6 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
 
         self.kv_cache_dtype = kv_cache_dtype
         self.block_size = cache_config.block_size
-        self.max_context_len_to_capture = (
-            self.model_config.max_context_len_to_capture
-            if self.model_config is not None else 0)
 
         self.attn_backend = get_attn_backend(
             self.model_config.get_num_attention_heads(self.parallel_config),
-- 
GitLab


From 546b101fa05043feb470513a778c31114ea3aa05 Mon Sep 17 00:00:00 2001
From: pushan <62173185+pushan01@users.noreply.github.com>
Date: Thu, 11 Jul 2024 21:46:31 +0800
Subject: [PATCH 312/376] [BugFix]: fix engine timeout due to request abort
 (#6255)

Signed-off-by: yatta zhang <ytzhang01@foxmail.com>
Signed-off-by: zhangyuntao.dev <zhangyuntao.dev@bytedance.com>
Co-authored-by: zhangyuntao.dev <zhangyuntao.dev@bytedance.com>
---
 vllm/engine/async_llm_engine.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 9b4ef48b0..f3c8d69e4 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -553,11 +553,13 @@ class AsyncLLMEngine:
             request_outputs = await self.engine.step_async(virtual_engine)
 
         # Put the outputs into the corresponding streams.
+        finished = True
         for request_output in request_outputs:
             self._request_tracker.process_request_output(
                 request_output, verbose=self.log_requests)
+            finished = finished and request_output.finished
 
-        return len(request_outputs) > 0
+        return not finished
 
     async def _engine_abort(self, request_ids: Iterable[str]):
         if self.engine_use_ray:
-- 
GitLab


From 8a1415cf776b2b902f6429ecfc325877b57cbefe Mon Sep 17 00:00:00 2001
From: Thomas Parnell <tpa@zurich.ibm.com>
Date: Thu, 11 Jul 2024 16:05:59 +0200
Subject: [PATCH 313/376] [Bugfix] GPTBigCodeForCausalLM: Remove lm_head from
 supported_lora_modules. (#6326)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Travis Johnson <tsjohnso@us.ibm.com>
---
 vllm/model_executor/models/gpt_bigcode.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index cc42413d5..fc4e13bbb 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -235,7 +235,7 @@ class GPTBigCodeModel(nn.Module):
 class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA):
     packed_modules_mapping = {"c_attn": ["c_attn"]}
 
-    supported_lora_modules = ["c_fc", "c_proj", "wte", "lm_head", "c_attn"]
+    supported_lora_modules = ["c_fc", "c_proj", "wte", "c_attn"]
 
     embedding_modules = {
         "wte": "input_embeddings",
-- 
GitLab


From 55f692b46ef35ed4a9e199dfe60a9eefe800e4b0 Mon Sep 17 00:00:00 2001
From: Mor Zusman <mor.zusmann@gmail.com>
Date: Thu, 11 Jul 2024 17:40:20 +0300
Subject: [PATCH 314/376] [BugFix] get_and_reset only when scheduler outputs
 are not empty (#6266)

---
 vllm/engine/async_llm_engine.py | 4 ++--
 vllm/engine/llm_engine.py       | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index f3c8d69e4..93bf8793d 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -225,11 +225,11 @@ class _AsyncLLMEngine(LLMEngine):
         """
         seq_group_metadata_list, scheduler_outputs = self.scheduler[
             virtual_engine].schedule()
-        finished_requests_ids = self.scheduler[
-            virtual_engine].get_and_reset_finished_requests_ids()
 
         if not scheduler_outputs.is_empty():
             # Execute the model.
+            finished_requests_ids = self.scheduler[
+                virtual_engine].get_and_reset_finished_requests_ids()
             execute_model_req = ExecuteModelRequest(
                 seq_group_metadata_list=seq_group_metadata_list,
                 blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index b476594fc..d354218cf 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -871,10 +871,10 @@ class LLMEngine:
                 "as performance will be severely degraded otherwise.")
         seq_group_metadata_list, scheduler_outputs = self.scheduler[
             0].schedule()
-        finished_requests_ids = self.scheduler[
-            0].get_and_reset_finished_requests_ids()
 
         if not scheduler_outputs.is_empty():
+            finished_requests_ids = self.scheduler[
+                0].get_and_reset_finished_requests_ids()
             execute_model_req = ExecuteModelRequest(
                 seq_group_metadata_list=seq_group_metadata_list,
                 blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
-- 
GitLab


From b675069d7486129dbed7847f420b7a927691f16b Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Thu, 11 Jul 2024 11:40:11 -0400
Subject: [PATCH 315/376] [ Misc ] Refactor Marlin Python Utilities (#6082)

Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
---
 benchmarks/kernels/benchmark_marlin.py        |  10 +-
 tests/kernels/test_marlin_gemm.py             |  46 +-
 tests/quantization/test_compressed_tensors.py |  23 +-
 .../schemes/compressed_tensors_wNa16.py       | 151 +++---
 .../model_executor/layers/quantization/fp8.py |   2 +-
 .../layers/quantization/gptq_marlin.py        | 263 +++--------
 .../quantization/utils/marlin_24_perms.py     |  60 ---
 .../layers/quantization/utils/marlin_perms.py |  60 ---
 .../layers/quantization/utils/marlin_utils.py | 439 ++++++------------
 .../quantization/utils/marlin_utils_fp8.py    | 109 +++++
 .../quantization/utils/marlin_utils_test.py   | 120 +++++
 .../{format_24.py => marlin_utils_test_24.py} | 163 ++++++-
 12 files changed, 704 insertions(+), 742 deletions(-)
 delete mode 100644 vllm/model_executor/layers/quantization/utils/marlin_24_perms.py
 delete mode 100644 vllm/model_executor/layers/quantization/utils/marlin_perms.py
 create mode 100644 vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
 create mode 100644 vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
 rename vllm/model_executor/layers/quantization/utils/{format_24.py => marlin_utils_test_24.py} (71%)

diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
index 261f58296..3da4cecd7 100644
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -5,14 +5,16 @@ import torch.utils.benchmark as benchmark
 from benchmark_shapes import WEIGHT_SHAPES
 
 from vllm import _custom_ops as ops
-from vllm.model_executor.layers.quantization.gptq_marlin import (
-    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
-    GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
 from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
     GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
     GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    MarlinWorkspace, marlin_24_quantize, marlin_quantize)
+    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
+    GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
+    MarlinWorkspace, marlin_quantize)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
+    marlin_24_quantize)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     gptq_pack, quantize_weights, sort_weights)
 from vllm.utils import FlexibleArgumentParser
diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py
index 92ddcb209..3bd6680cf 100644
--- a/tests/kernels/test_marlin_gemm.py
+++ b/tests/kernels/test_marlin_gemm.py
@@ -5,19 +5,21 @@ Run `pytest tests/kernels/marlin/test_marlin_gemm.py`.
 import pytest
 import torch
 
+from tests.quantization.utils import is_quant_method_supported
 from vllm import _custom_ops as ops
-from vllm.model_executor.layers.quantization.gptq_marlin import (
-    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
-    GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS,
-    marlin_permute_scales)
 from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
     GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
     GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
-from vllm.model_executor.layers.quantization.utils.marlin_perms import (
-    marlin_perm)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    MarlinWorkspace, compute_max_diff, is_marlin_supported, marlin_24_quantize,
-    marlin_quantize, marlin_weights, pack_fp8_to_int32)
+    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
+    GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS,
+    marlin_permute_scales)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
+    pack_fp8_to_int32)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
+    MarlinWorkspace, get_weight_perm, marlin_quantize, marlin_weights)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
+    marlin_24_quantize)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     gptq_pack, quantize_weights, sort_weights)
 
@@ -42,11 +44,16 @@ MNK_FACTORS = [
 DTYPES = [torch.float16, torch.bfloat16]
 
 
+def compute_max_diff(output, output_ref):
+    return torch.mean(torch.abs(output - output_ref)) / torch.mean(
+        torch.abs(output_ref))
+
+
 def rand_data(shape, dtype=torch.float16):
     return torch.randn(shape, dtype=dtype, device="cuda")
 
 
-@pytest.mark.skipif(not is_marlin_supported(),
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                     reason="Marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
 @pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
@@ -93,8 +100,8 @@ def test_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order,
         q_w, g_idx, sort_indices = sort_weights(q_w, g_idx)
 
     # Pack to Marlin format
-    marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, num_bits,
-                                  marlin_perm[num_bits])
+    weight_perm = get_weight_perm(num_bits)
+    marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, num_bits, weight_perm)
 
     # Run Marlin repack GPU kernel
     marlin_q_w_2 = ops.gptq_marlin_repack(
@@ -109,7 +116,7 @@ def test_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order,
     assert torch.allclose(marlin_q_w_1, marlin_q_w_2)
 
 
-@pytest.mark.skipif(not is_marlin_supported(),
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                     reason="Marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
 @pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
@@ -174,7 +181,7 @@ def test_marlin_gemm(
     assert max_diff < 0.04
 
 
-@pytest.mark.skipif(not is_marlin_supported(),
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                     reason="Marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("k_chunk", MARLIN_24_K_CHUNKS)
 @pytest.mark.parametrize("n_chunk", MARLIN_24_N_CHUNKS)
@@ -222,7 +229,7 @@ def test_marlin_24_gemm(k_chunk, n_chunk, num_bits, group_size, mnk_factors):
     assert max_diff < 0.04
 
 
-@pytest.mark.skipif(not is_marlin_supported(),
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                     reason="Marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
 @pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
@@ -268,13 +275,10 @@ def test_fp8_marlin_gemm(
     # expand it to channelwise
     scales = weight_scale.repeat(1, size_n).to(a_input.dtype).to("cuda")
     # Permute scales
-    marlin_scales = marlin_permute_scales(
-        s=scales,
-        size_k=size_k,
-        size_n=size_n,
-        group_size=-1,
-        num_bits=8,
-    )
+    marlin_scales = marlin_permute_scales(s=scales,
+                                          size_k=size_k,
+                                          size_n=size_n,
+                                          group_size=-1)
 
     workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
                                 GPTQ_MARLIN_MAX_PARALLEL)
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 96223a247..888e20e51 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -6,7 +6,6 @@ Run `pytest tests/quantization/test_compressed_tensors.py`.
 import pytest
 import torch
 
-from vllm import SamplingParams
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
     CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
     CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
@@ -57,12 +56,14 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
         assert qkv_proj.weight_scale.dtype is torch.float32
         assert qkv_proj.input_scale.dtype is torch.float32
 
+        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        assert output
+
 
 def test_compressed_tensors_no_enforce_eager(vllm_runner):
     model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
     with vllm_runner(model_path) as llm:
-        sampling_params = SamplingParams()
-        output = llm.generate("Hello world!", sampling_params=sampling_params)
+        output = llm.generate_greedy("Hello my name is", max_tokens=20)
         assert output
 
 
@@ -84,13 +85,16 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args):
         assert qkv_proj.scheme.strategy == strategy
         assert qkv_proj.weight.dtype is torch.int8
 
+        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        assert output
+
 
 @pytest.mark.parametrize(
     "wNa16_args",
     [("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8),
      ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8),
      ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4)])
-def test_compressed_tensors_w4a16(vllm_runner, wNa16_args):
+def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
     model, strategy, group, pack_factor = wNa16_args
     with vllm_runner(model) as llm:
         model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
@@ -101,12 +105,15 @@ def test_compressed_tensors_w4a16(vllm_runner, wNa16_args):
         assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16)
 
         assert qkv_proj.scheme.strategy == strategy
-        assert qkv_proj.scheme.group_size == group
+        assert qkv_proj.scheme.group_size == (-1 if group is None else group)
 
         assert qkv_proj.weight_packed.dtype is torch.int32
         assert qkv_proj.weight_scale.dtype is torch.float16
         assert qkv_proj.weight_packed.pack_factor == pack_factor
 
+        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        assert output
+
 
 def test_compressed_tensors_w4a16_marlin24(vllm_runner):
     model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
@@ -120,8 +127,7 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner):
         assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24)
         assert qkv_proj.weight_packed.dtype is torch.int32
 
-        sampling_params = SamplingParams()
-        output = llm.generate("Hello world!", sampling_params=sampling_params)
+        output = llm.generate_greedy("Hello my name is", max_tokens=20)
         assert output
 
 
@@ -142,6 +148,5 @@ def test_compressed_tensors_fp8(vllm_runner):
         assert len(qkv_proj.input_scale.shape) == 0
         assert len(qkv_proj.weight_scale.shape) == 0
 
-        sampling_params = SamplingParams()
-        output = llm.generate("Hello world!", sampling_params=sampling_params)
+        output = llm.generate_greedy("Hello my name is", max_tokens=20)
         assert output
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index 224326005..ed9fa73c1 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -6,9 +6,10 @@ from torch.nn import Parameter
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.gptq_marlin import (
-    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, GPTQMarlinState,
-    marlin_permute_scales)
+from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+    apply_marlin_linear, marlin_make_empty_g_idx, marlin_make_workspace,
+    marlin_permute_scales, replace_tensor, verify_marlin_supported,
+    verify_marlin_supports_shape)
 from vllm.model_executor.utils import set_weight_attrs
 
 __all__ = ["CompressedTensorsWNA16"]
@@ -22,29 +23,40 @@ class CompressedTensorsWNA16(CompressedTensorsScheme):
                  num_bits: int,
                  group_size: Optional[int] = None):
         self.num_bits = num_bits
+        self.pack_factor = 32 // self.num_bits
         self.strategy = strategy
-        self.group_size = group_size
 
-        if self.strategy == "group" and self.group_size is None:
-            raise ValueError(
-                "group_size must be given when using strategy group")
+        self.group_size: int
+        if group_size is None:
+            if self.strategy != "channel":
+                raise ValueError(
+                    "Marlin kernels require group quantization or "
+                    "channelwise quantization, but found no group "
+                    "size and strategy is not channelwise.")
+            self.group_size = -1
+        else:
+            self.group_size = group_size
 
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        pass
+        # Verify supported on platform.
+        verify_marlin_supported(num_bits=self.num_bits,
+                                group_size=self.group_size,
+                                is_sym=True)
 
     def create_weights(self, layer: torch.nn.Module, input_size: int,
                        output_partition_sizes: List[int],
                        input_size_per_partition: int,
                        params_dtype: torch.dtype, weight_loader: Callable,
                        **kwargs):
-
-        pack_factor = 32 // self.num_bits
         output_size_per_partition = sum(output_partition_sizes)
 
-        if self.group_size is not None:
-            group_size = self.group_size
-        else:
-            group_size = input_size
+        # If group_size is -1, we are in channelwise case.
+        group_size = input_size if self.group_size == -1 else self.group_size
+
+        verify_marlin_supports_shape(
+            output_size_per_partition=output_size_per_partition,
+            input_size_per_partition=input_size_per_partition,
+            input_size=input_size,
+            group_size=group_size)
 
         weight_scale_dim = None
         scales_and_zp_size = input_size // group_size
@@ -57,7 +69,7 @@ class CompressedTensorsWNA16(CompressedTensorsScheme):
         weight = Parameter(
             torch.empty(
                 output_size_per_partition,
-                input_size_per_partition // pack_factor,
+                input_size_per_partition // self.pack_factor,
                 dtype=torch.int32,
             ),
             requires_grad=False,
@@ -68,7 +80,7 @@ class CompressedTensorsWNA16(CompressedTensorsScheme):
                 "input_dim": 1,
                 "output_dim": 0,
                 "packed_dim": 1,
-                "pack_factor": pack_factor,
+                "pack_factor": self.pack_factor,
                 "weight_loader": weight_loader
             })
         layer.register_parameter("weight_packed", weight)
@@ -103,73 +115,48 @@ class CompressedTensorsWNA16(CompressedTensorsScheme):
 
         layer.input_size_per_partition = input_size_per_partition
         layer.output_size_per_partition = output_size_per_partition
-
         layer.input_size = input_size
-        layer.marlin_state = GPTQMarlinState.REPACK
-        layer.is_k_full = True
         layer.group_size = group_size
 
-        max_workspace_size = (
-            output_size_per_partition //
-            GPTQ_MARLIN_MIN_THREAD_N) * GPTQ_MARLIN_MAX_PARALLEL
-
-        workspace = torch.zeros(max_workspace_size,
-                                dtype=torch.int,
-                                requires_grad=False)
-        layer.workspace = workspace
+    # Checkpoints are serialized in compressed-tensors format, which is
+    # different from marlin format. Handle repacking here.
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        device = layer.weight_packed.device
+
+        # Allocate marlin workspace.
+        layer.workspace = marlin_make_workspace(
+            layer.output_size_per_partition, device)
+
+        # Act-order not supported in compressed-tensors yet, so set to empty.
+        layer.g_idx = marlin_make_empty_g_idx(device)
+        layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
+
+        # Repack weights from compressed-tensors format to marlin format.
+        marlin_qweight = ops.gptq_marlin_repack(
+            layer.weight_packed.t().contiguous(),
+            perm=layer.g_idx_sort_indices,
+            size_k=layer.input_size_per_partition,
+            size_n=layer.output_size_per_partition,
+            num_bits=self.num_bits)
+        replace_tensor(layer, "weight_packed", marlin_qweight)
+
+        # Permute scales from compressed-tensors format to marlin format.
+        marlin_scales = marlin_permute_scales(
+            layer.weight_scale.squeeze().t().contiguous(),
+            size_k=layer.input_size_per_partition,
+            size_n=layer.output_size_per_partition,
+            group_size=layer.group_size)
+        replace_tensor(layer, "weight_scale", marlin_scales)
 
     def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
-        reshaped_x = x.reshape(-1, x.shape[-1])
-
-        size_m = reshaped_x.shape[0]
-        part_size_n = layer.output_size_per_partition
-        part_size_k = layer.input_size_per_partition
-
-        out_shape = x.shape[:-1] + (part_size_n, )
-
-        if layer.marlin_state == GPTQMarlinState.REPACK:
-            layer.marlin_state = GPTQMarlinState.READY
-
-            # Newly generated tensors need to replace existing tensors that are
-            # already registered as parameters by vLLM (and won't be freed)
-            def replace_tensor(name, new_t):
-                # It is important to use resize_() here since it ensures
-                # the same buffer is reused
-                getattr(layer, name).resize_(new_t.shape)
-                getattr(layer, name).copy_(new_t)
-                del new_t
-
-            cur_device = layer.weight_packed.device
-
-            # Reset g_idx related tensors
-            layer.g_idx = Parameter(torch.empty(0,
-                                                dtype=torch.int,
-                                                device=cur_device),
-                                    requires_grad=False)
-            layer.g_idx_sort_indices = Parameter(torch.empty(
-                0, dtype=torch.int, device=cur_device),
-                                                 requires_grad=False)
-
-            # Repack weights
-            marlin_qweight = ops.gptq_marlin_repack(
-                layer.weight_packed.t().contiguous(), layer.g_idx_sort_indices,
-                part_size_k, part_size_n, self.num_bits)
-
-            replace_tensor("weight_packed", marlin_qweight)
-
-            # Permute scales
-            scales_size_k = part_size_k
-            scales_size_n = part_size_n
-
-            marlin_scales = marlin_permute_scales(
-                layer.weight_scale.squeeze().t().contiguous(), scales_size_k,
-                scales_size_n, layer.group_size, self.num_bits)
-            replace_tensor("weight_scale", marlin_scales)
-
-        output = ops.gptq_marlin_gemm(reshaped_x, layer.weight_packed,
-                                      layer.weight_scale, layer.g_idx,
-                                      layer.g_idx_sort_indices,
-                                      layer.workspace, self.num_bits, size_m,
-                                      part_size_n, part_size_k,
-                                      layer.is_k_full)
-        return output.reshape(out_shape)
+        return apply_marlin_linear(
+            input=x,
+            weight=layer.weight_packed,
+            weight_scale=layer.weight_scale,
+            g_idx=layer.g_idx,
+            g_idx_sort_indices=layer.g_idx_sort_indices,
+            workspace=layer.workspace,
+            num_bits=self.num_bits,
+            output_size_per_partition=layer.output_size_per_partition,
+            input_size_per_partition=layer.input_size_per_partition,
+            is_k_full=True)
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 8dba9019f..0c2d2bd3f 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -11,7 +11,7 @@ from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
-from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     all_close_1d, apply_fp8_linear, create_per_tensor_scale_param,
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 6b971f73d..7b808f521 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -1,5 +1,3 @@
-import enum
-from enum import Enum
 from typing import Any, Dict, List, Optional
 
 import torch
@@ -12,46 +10,14 @@ from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_K,
-    GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_SUPPORTED_GROUP_SIZES,
-    GPTQ_MARLIN_SUPPORTED_NUM_BITS, GPTQ_MARLIN_SUPPORTED_SYM,
-    GPTQ_MARLIN_TILE)
+    check_marlin_supported, marlin_make_empty_g_idx, marlin_make_workspace,
+    marlin_permute_scales, marlin_sort_g_idx, replace_tensor,
+    verify_marlin_supported, verify_marlin_supports_shape)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
-from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
 
 
-# Permutations for Marlin scale shuffling
-def get_scale_perms(num_bits: int):
-    scale_perm: List[int] = []
-    for i in range(8):
-        scale_perm.extend([i + 8 * j for j in range(8)])
-    scale_perm_single: List[int] = []
-    for i in range(4):
-        scale_perm_single.extend(
-            [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
-    return scale_perm, scale_perm_single
-
-
-def get_pack_factor(num_bits: int):
-    assert (num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS
-            ), f"Unsupported num_bits = {num_bits}"
-    return 32 // num_bits
-
-
-def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
-                          group_size: int, num_bits: int):
-    scale_perm, scale_perm_single = get_scale_perms(num_bits)
-    if group_size < size_k and group_size != -1:
-        s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
-    else:
-        s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
-    s = s.reshape((-1, size_n)).contiguous()
-
-    return s
-
-
 class GPTQMarlinConfig(QuantizationConfig):
     """Config class for GPTQ Marlin"""
 
@@ -63,33 +29,16 @@ class GPTQMarlinConfig(QuantizationConfig):
             desc_act = False
 
         self.weight_bits = weight_bits
+        self.pack_factor = 32 // self.weight_bits  # packed into int32
         self.group_size = group_size
         self.desc_act = desc_act
         self.is_sym = is_sym
         self.lm_head_quantized = lm_head_quantized
 
-        # Verify
-        if self.weight_bits not in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
-            raise ValueError(
-                f"Marlin does not support weight_bits = {self.weight_bits}. "
-                f"Only weight_bits = {GPTQ_MARLIN_SUPPORTED_NUM_BITS} "
-                "are supported.")
-        if self.group_size not in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
-            raise ValueError(
-                f"Marlin does not support group_size = {self.group_size}. "
-                f"Only group_sizes = {GPTQ_MARLIN_SUPPORTED_GROUP_SIZES} "
-                "are supported.")
-        if self.is_sym not in GPTQ_MARLIN_SUPPORTED_SYM:
-            raise ValueError(
-                f"Marlin does not support is_sym = {self.is_sym}. "
-                f"Only sym = {GPTQ_MARLIN_SUPPORTED_SYM} are supported.")
-
-        # Init
-        self.pack_factor = get_pack_factor(weight_bits)
-        self.tile_size = GPTQ_MARLIN_TILE
-        self.min_thread_n = GPTQ_MARLIN_MIN_THREAD_N
-        self.min_thread_k = GPTQ_MARLIN_MIN_THREAD_K
-        self.max_parallel = GPTQ_MARLIN_MAX_PARALLEL
+        # Verify supported on platform.
+        verify_marlin_supported(num_bits=self.weight_bits,
+                                group_size=self.group_size,
+                                is_sym=self.is_sym)
 
     def __repr__(self) -> str:
         return (f"GPTQMarlinConfig(weight_bits={self.weight_bits}, "
@@ -168,21 +117,10 @@ class GPTQMarlinConfig(QuantizationConfig):
                 or desc_act is None):
             return False
 
-        # If the capability of the device is too low, cannot convert.
-        major, minor = current_platform.get_device_capability()
-        device_capability = major * 10 + minor
-        if device_capability < cls.get_min_capability():
-            return False
-
-        # Otherwise, can convert if model satisfies marlin constraints.
-        return (num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS
-                and group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES
-                and sym in GPTQ_MARLIN_SUPPORTED_SYM)
-
-
-class GPTQMarlinState(Enum):
-    REPACK = enum.auto()
-    READY = enum.auto()
+        return check_marlin_supported(num_bits=num_bits,
+                                      group_size=group_size,
+                                      is_sym=sym,
+                                      min_capability=cls.get_min_capability())
 
 
 class GPTQMarlinLinearMethod(LinearMethodBase):
@@ -206,6 +144,7 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
         **extra_weight_attrs,
     ) -> None:
         del output_size
+        output_size_per_partition = sum(output_partition_sizes)
 
         # Normalize group_size
         if self.quant_config.group_size != -1:
@@ -213,31 +152,11 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
         else:
             group_size = input_size
 
-        # Validate dtype
-        if params_dtype not in [torch.float16, torch.bfloat16]:
-            raise ValueError(f"The params dtype must be float16 "
-                             f"or bfloat16, but got {params_dtype}")
-
-        # Validate output_size_per_partition
-        output_size_per_partition = sum(output_partition_sizes)
-        if output_size_per_partition % self.quant_config.min_thread_n != 0:
-            raise ValueError(
-                f"Weight output_size_per_partition = "
-                f"{output_size_per_partition} is not divisible by "
-                f" min_thread_n = {self.quant_config.min_thread_n}.")
-
-        # Validate input_size_per_partition
-        if input_size_per_partition % self.quant_config.min_thread_k != 0:
-            raise ValueError(
-                f"Weight input_size_per_partition = "
-                f"{input_size_per_partition} is not divisible "
-                f"by min_thread_k = {self.quant_config.min_thread_k}.")
-
-        if (group_size < input_size
-                and input_size_per_partition % group_size != 0):
-            raise ValueError(
-                f"Weight input_size_per_partition = {input_size_per_partition}"
-                f" is not divisible by group_size = {group_size}.")
+        verify_marlin_supports_shape(
+            output_size_per_partition=output_size_per_partition,
+            input_size_per_partition=input_size_per_partition,
+            input_size=input_size,
+            group_size=group_size)
 
         # Detect sharding of scales/zp
 
@@ -303,11 +222,6 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
             },
         )
 
-        g_idx_sort_indices = torch.empty(
-            g_idx.shape,
-            dtype=torch.int32,
-        )
-
         # Scales
         scales = Parameter(
             torch.empty(
@@ -347,25 +261,50 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
             },
         )
 
-        # Allocate marlin workspace
-        max_workspace_size = (
-            output_size_per_partition //
-            self.quant_config.min_thread_n) * self.quant_config.max_parallel
-        workspace = torch.zeros(max_workspace_size,
-                                dtype=torch.int,
-                                requires_grad=False)
-
         layer.register_parameter("qweight", qweight)
         layer.register_parameter("g_idx", g_idx)
         layer.register_parameter("scales", scales)
         layer.register_parameter("qzeros", qzeros)
-        layer.g_idx_sort_indices = g_idx_sort_indices
-        layer.workspace = workspace
         layer.input_size_per_partition = input_size_per_partition
         layer.output_size_per_partition = output_size_per_partition
         layer.input_size = input_size
         layer.is_k_full = is_k_full
-        layer.marlin_state = GPTQMarlinState.REPACK
+
+    # Checkpoints are serialized in AutoGPTQ format, which is different from the
+    # marlin format. This function is called after the weights are loaded.
+    # Here, we handle the repacking, including the activation reordering case.
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        device = layer.qweight.device
+        # Allocate marlin workspace
+        layer.workspace = marlin_make_workspace(
+            layer.output_size_per_partition, device)
+
+        # Handle sorting for activation reordering if needed.
+        if self.quant_config.desc_act:
+            g_idx, g_idx_sort_indices = marlin_sort_g_idx(layer.g_idx)
+            layer.g_idx_sort_indices = g_idx_sort_indices
+            replace_tensor(layer, "g_idx", g_idx)
+        else:
+            layer.g_idx = marlin_make_empty_g_idx(device)
+            layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
+
+        # Repack weights from autogptq format to marlin format.
+        marlin_qweight = ops.gptq_marlin_repack(
+            layer.qweight,
+            perm=layer.g_idx_sort_indices,
+            size_k=layer.input_size_per_partition,
+            size_n=layer.output_size_per_partition,
+            num_bits=self.quant_config.weight_bits)
+        replace_tensor(layer, "qweight", marlin_qweight)
+
+        # Permute scales from autogptq format to marlin format.
+        marlin_scales = marlin_permute_scales(
+            layer.scales,
+            size_k=(layer.input_size if self.quant_config.desc_act else
+                    layer.input_size_per_partition),
+            size_n=layer.output_size_per_partition,
+            group_size=self.quant_config.group_size)
+        replace_tensor(layer, "scales", marlin_scales)
 
     def apply(
         self,
@@ -374,87 +313,19 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         reshaped_x = x.reshape(-1, x.shape[-1])
-
-        size_m = reshaped_x.shape[0]
-        part_size_n = layer.output_size_per_partition
-        part_size_k = layer.input_size_per_partition
-        full_size_k = layer.input_size
-
-        out_shape = x.shape[:-1] + (part_size_n, )
-
-        if layer.marlin_state == GPTQMarlinState.REPACK:
-            layer.marlin_state = GPTQMarlinState.READY
-
-            # Newly generated tensors need to replace existing tensors that are
-            # already registered as parameters by vLLM (and won't be freed)
-            def replace_tensor(name, new_t):
-                # It is important to use resize_() here since it ensures
-                # the same buffer is reused
-                getattr(layer, name).resize_(new_t.shape)
-                getattr(layer, name).copy_(new_t)
-                del new_t
-
-            cur_device = layer.qweight.device
-
-            # Process act_order
-            if self.quant_config.desc_act:
-                # Get sorting based on g_idx
-                g_idx_sort_indices = torch.argsort(layer.g_idx).to(torch.int)
-
-                sorted_g_idx = layer.g_idx[g_idx_sort_indices]
-
-                replace_tensor("g_idx", sorted_g_idx)
-                replace_tensor("g_idx_sort_indices", g_idx_sort_indices)
-
-            else:
-                # Reset g_idx related tensors
-                layer.g_idx = Parameter(
-                    torch.empty(0, dtype=torch.int, device=cur_device),
-                    requires_grad=False,
-                )
-                layer.g_idx_sort_indices = Parameter(
-                    torch.empty(0, dtype=torch.int, device=cur_device),
-                    requires_grad=False,
-                )
-
-            # Repack weights
-            marlin_qweight = ops.gptq_marlin_repack(
-                layer.qweight,
-                layer.g_idx_sort_indices,
-                part_size_k,
-                part_size_n,
-                self.quant_config.weight_bits,
-            )
-            replace_tensor("qweight", marlin_qweight)
-
-            # Permute scales
-            scales_size_k = part_size_k
-            scales_size_n = part_size_n
-            if self.quant_config.desc_act:
-                scales_size_k = full_size_k
-
-            marlin_scales = marlin_permute_scales(
-                layer.scales,
-                scales_size_k,
-                scales_size_n,
-                self.quant_config.group_size,
-                self.quant_config.weight_bits,
-            )
-            replace_tensor("scales", marlin_scales)
-
-        output = ops.gptq_marlin_gemm(
-            reshaped_x,
-            layer.qweight,
-            layer.scales,
-            layer.g_idx,
-            layer.g_idx_sort_indices,
-            layer.workspace,
-            self.quant_config.weight_bits,
-            size_m,
-            part_size_n,
-            part_size_k,
-            layer.is_k_full,
-        )
+        out_shape = x.shape[:-1] + (layer.output_size_per_partition, )
+
+        output = ops.gptq_marlin_gemm(reshaped_x,
+                                      layer.qweight,
+                                      layer.scales,
+                                      g_idx=layer.g_idx,
+                                      perm=layer.g_idx_sort_indices,
+                                      workspace=layer.workspace,
+                                      num_bits=self.quant_config.weight_bits,
+                                      size_m=reshaped_x.shape[0],
+                                      size_n=layer.output_size_per_partition,
+                                      size_k=layer.input_size_per_partition,
+                                      is_k_full=layer.is_k_full)
 
         if bias is not None:
             output.add_(bias)  # In-place add
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py b/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py
deleted file mode 100644
index 93f65a20d..000000000
--- a/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py
+++ /dev/null
@@ -1,60 +0,0 @@
-"""This file is used for /tests and /benchmarks"""
-from typing import Dict, List
-
-import numpy
-import torch
-
-
-# Precompute permutations for Marlin24 weight and scale shuffling # noqa: E501
-#
-# Marlin works on [16*2,64] tiles. The goal of the permutations is to reorder the weight data so that it is compatible noqa: # noqa: E501
-# with the tensor-core format that is described here:
-# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501
-#
-# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
-# (without the need to use ldmatrix instructions) # noqa: E501
-def get_perms_24(num_bits: int):
-    perm_list: List[int] = []
-    for i in range(32):
-        perm1: List[int] = []
-        col = i // 4
-        col_o = col // 2
-        for block in [0, 1]:
-            for row in [
-                    2 * (i % 4),
-                    2 * (i % 4) + 1,
-                    2 * (i % 4 + 4),
-                    2 * (i % 4 + 4) + 1,
-            ]:
-                perm1.append(16 * row + col_o * 256 + 8 * (col % 2) +
-                             4 * block)
-        for j in range(4):
-            perm_list.extend([p + 1 * j for p in perm1])
-    perm = numpy.array(perm_list)
-
-    if num_bits == 4:
-        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
-    elif num_bits == 8:
-        interleave = numpy.array([0, 2, 1, 3])
-    else:
-        raise ValueError("num_bits must be 4 or 8, got {}".format(num_bits))
-
-    perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
-    perm = torch.from_numpy(perm)
-    scale_perm: List[int] = []
-    for i in range(8):
-        scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]])
-    scale_perm_single: List[int] = []
-    for i in range(8):
-        scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]])
-    return perm, scale_perm, scale_perm_single
-
-
-marlin_24_perm: Dict[int, torch.Tensor] = {}
-marlin_24_scale_perm: Dict[int, List[int]] = {}
-marlin_24_scale_perm_single: Dict[int, List[int]] = {}
-for num_bits in [4, 8]:
-    perm_24, scale_perm_24, scale_perm_single_24 = get_perms_24(num_bits)
-    marlin_24_perm[num_bits] = perm_24
-    marlin_24_scale_perm[num_bits] = scale_perm_24
-    marlin_24_scale_perm_single[num_bits] = scale_perm_single_24
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_perms.py b/vllm/model_executor/layers/quantization/utils/marlin_perms.py
deleted file mode 100644
index db5e6857a..000000000
--- a/vllm/model_executor/layers/quantization/utils/marlin_perms.py
+++ /dev/null
@@ -1,60 +0,0 @@
-"""This file is used for /tests and /benchmarks"""
-from typing import Dict, List
-
-import numpy
-import torch
-
-
-# Precompute permutations for Marlin weight and scale shuffling # noqa: E501
-#
-# Marlin works on [16,64] tiles. The goal of the permutations is to reorder the weight data so that it is compatible noqa: # noqa: E501
-# with the tensor-core format that is described here:
-# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501
-#
-# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
-# (without the need to use ldmatrix instructions) # noqa: E501
-def get_perms(num_bits: int):
-    perm_list: List[int] = []
-    for i in range(32):
-        perm1: List[int] = []
-        col = i // 4
-        for block in [0, 1]:
-            for row in [
-                    2 * (i % 4),
-                    2 * (i % 4) + 1,
-                    2 * (i % 4 + 4),
-                    2 * (i % 4 + 4) + 1,
-            ]:
-                perm1.append(16 * row + col + 8 * block)
-        for j in range(4):
-            perm_list.extend([p + 256 * j for p in perm1])
-
-    perm = numpy.array(perm_list)
-
-    if num_bits == 4:
-        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
-    elif num_bits == 8:
-        interleave = numpy.array([0, 2, 1, 3])
-    else:
-        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))
-
-    perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
-    perm = torch.from_numpy(perm)
-    scale_perm: List[int] = []
-    for i in range(8):
-        scale_perm.extend([i + 8 * j for j in range(8)])
-    scale_perm_single: List[int] = []
-    for i in range(4):
-        scale_perm_single.extend(
-            [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
-    return perm, scale_perm, scale_perm_single
-
-
-marlin_perm: Dict[int, torch.Tensor] = {}
-marlin_scale_perm: Dict[int, List[int]] = {}
-marlin_scale_perm_single: Dict[int, List[int]] = {}
-for num_bits in [4, 8]:
-    perm, scale_perm, scale_perm_single = get_perms(num_bits)
-    marlin_perm[num_bits] = perm
-    marlin_scale_perm[num_bits] = scale_perm
-    marlin_scale_perm_single[num_bits] = scale_perm_single
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index 988624526..612c5fd20 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -1,21 +1,9 @@
-"""This file is used for /tests and /benchmarks"""
-import random
-from typing import Optional
+from typing import List, Optional, Tuple
 
-import numpy
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.model_executor.layers.quantization.utils.format_24 import (
-    mask_creator, sparse_semi_structured_from_dense_cutlass)
-from vllm.model_executor.layers.quantization.utils.marlin_24_perms import (
-    marlin_24_perm, marlin_24_scale_perm, marlin_24_scale_perm_single)
-from vllm.model_executor.layers.quantization.utils.marlin_perms import (
-    marlin_perm, marlin_scale_perm, marlin_scale_perm_single)
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    get_pack_factor, quantize_weights, sort_weights)
 from vllm.platforms import current_platform
-from vllm.utils import print_warning_once
 
 GPTQ_MARLIN_TILE = 16
 GPTQ_MARLIN_MIN_THREAD_N = 64
@@ -25,135 +13,110 @@ GPTQ_MARLIN_MAX_PARALLEL = 16
 GPTQ_MARLIN_SUPPORTED_NUM_BITS = [4, 8]
 GPTQ_MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
 GPTQ_MARLIN_SUPPORTED_SYM = [True]
-
-
-def is_marlin_supported():
-    capability = current_platform.get_device_capability()
-    return capability[0] >= 8
-
-
-def apply_fp8_marlin_linear(
-    input: torch.Tensor,
-    weight: torch.Tensor,
-    weight_scale: torch.Tensor,
-    workspace: torch.Tensor,
-    size_n: int,
-    size_k: int,
-    bias: Optional[torch.Tensor],
-) -> torch.Tensor:
-    # For GPUs that lack FP8 hardware support, we can leverage the
-    # Marlin kernel for fast weight-only FP8 quantization
-
-    reshaped_x = input.reshape(-1, input.shape[-1])
-    out_shape = input.shape[:-1] + (size_n, )
-
-    output = ops.fp8_marlin_gemm(
-        a=reshaped_x,
-        b_q_weight=weight,
-        b_scales=weight_scale,
-        workspace=workspace,
-        num_bits=8,
-        size_m=reshaped_x.shape[0],
-        size_n=size_n,
-        size_k=size_k,
-    )
-
-    if bias is not None:
-        output.add_(bias)  # In-place add
-
-    return output.reshape(out_shape)
-
-
-def prepare_fp8_layer_for_marlin(layer: torch.nn.Module) -> None:
-    print_warning_once(
-        "Your GPU does not have native support for FP8 computation but "
-        "FP8 quantization is being used. Weight-only FP8 compression will "
-        "be used leveraging the Marlin kernel. This may degrade "
-        "performance for compute-heavy workloads.")
-
-    part_size_n = layer.output_size_per_partition
-    part_size_k = layer.input_size_per_partition
-
-    device = layer.weight.device
-
-    # WEIGHTS
-    # Repack weights to gptq format (packed int32 elements)
-    packed_gptq_qweight = pack_fp8_to_int32(layer.weight)
-
-    # Repack weights to marlin format
-    marlin_qweight = ops.gptq_marlin_repack(
-        b_q_weight=packed_gptq_qweight,
-        perm=torch.empty(0, dtype=torch.int, device=device),
-        size_k=part_size_k,
-        size_n=part_size_n,
-        num_bits=8,
-    )
-    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
-
-    # WEIGHT SCALES
-    # Currently Marlin doesn't support per-tensor scales, so we
-    # expand it to channelwise
-    scales = layer.weight_scale.repeat(1, part_size_n).to(
-        layer.orig_dtype).to(device)
-    # Permute scales
-    num_bits = 8
-    marlin_scales = marlin_permute_scales(
-        s=scales,
-        size_k=part_size_k,
-        size_n=part_size_n,
-        group_size=-1,
-        scale_perm=marlin_scale_perm[num_bits],
-        scale_perm_single=marlin_scale_perm_single[num_bits])
-    layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False)
-
-    # Allocate marlin workspace
-    max_workspace_size = (part_size_n //
+GTPQ_MARLIN_UNSUPPORTED_GROUP_SIZE_ACT_ORDER = [-1]
+
+
+def check_marlin_supported(num_bits: int, group_size: int, is_sym: bool,
+                           min_capability: int) -> bool:
+
+    # If the capability of the device is too low, cannot convert.
+    major, minor = current_platform.get_device_capability()
+    device_capability = major * 10 + minor
+    if device_capability < min_capability:
+        return False
+
+    return (device_capability >= min_capability
+            and num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS
+            and group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES
+            and is_sym in GPTQ_MARLIN_SUPPORTED_SYM)
+
+
+def verify_marlin_supported(num_bits: int, group_size: Optional[int],
+                            is_sym: bool) -> None:
+
+    if num_bits not in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
+        raise ValueError(
+            f"Marlin does not support weight_bits = {num_bits}. "
+            f"Only weight_bits = {GPTQ_MARLIN_SUPPORTED_NUM_BITS} "
+            "are supported.")
+    if (group_size is None
+            or group_size not in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES):
+        raise ValueError(
+            f"Marlin does not support group_size = {group_size}. "
+            f"Only group_sizes = {GPTQ_MARLIN_SUPPORTED_GROUP_SIZES} "
+            "are supported.")
+    if is_sym not in GPTQ_MARLIN_SUPPORTED_SYM:
+        raise ValueError(
+            f"Marlin does not support is_sym = is_sym. "
+            f"Only sym = {GPTQ_MARLIN_SUPPORTED_SYM} are supported.")
+
+
+def verify_marlin_supports_shape(output_size_per_partition: int,
+                                 input_size_per_partition: int,
+                                 input_size: int, group_size: int) -> None:
+
+    # Validate output_size_per_partition
+    if output_size_per_partition % GPTQ_MARLIN_MIN_THREAD_N != 0:
+        raise ValueError(f"Weight output_size_per_partition = "
+                         f"{output_size_per_partition} is not divisible by "
+                         f" min_thread_n = {GPTQ_MARLIN_MIN_THREAD_N}. "
+                         "Consider reducing tensor_parallel_size or running "
+                         "with --quantization gptq.")
+
+    # Validate input_size_per_partition
+    if input_size_per_partition % GPTQ_MARLIN_MIN_THREAD_K != 0:
+        raise ValueError(f"Weight input_size_per_partition = "
+                         f"{input_size_per_partition} is not divisible "
+                         f"by min_thread_k = {GPTQ_MARLIN_MIN_THREAD_K}. "
+                         "Consider reducing tensor_parallel_size or running "
+                         "with --quantization gptq.")
+
+    if (group_size < input_size
+            and input_size_per_partition % group_size != 0):
+        raise ValueError(
+            f"Weight input_size_per_partition = {input_size_per_partition}"
+            f" is not divisible by group_size = {group_size}."
+            "Consider reducing tensor_parallel_size or running "
+            "with --quantization gptq.")
+
+
+def marlin_make_workspace(output_size_per_partition: int,
+                          device: torch.device) -> torch.Tensor:
+    max_workspace_size = (output_size_per_partition //
                           GPTQ_MARLIN_MIN_THREAD_N) * GPTQ_MARLIN_MAX_PARALLEL
-    workspace = torch.zeros(max_workspace_size,
-                            dtype=torch.int,
-                            device=device,
-                            requires_grad=False)
-
-    layer.workspace = workspace
-
-
-def marlin_permute_weights(q_w, size_k, size_n, perm, tile=GPTQ_MARLIN_TILE):
-    assert q_w.shape == (size_k, size_n)
-    assert size_k % tile == 0, f"size_k = {size_k}, tile = {tile}"
-    assert size_n % tile == 0, f"size_k = {size_n}, tile = {tile}"
-
-    # Permute weights to 16x64 marlin tiles
-    q_w = q_w.reshape((size_k // tile, tile, size_n // tile, tile))
-    q_w = q_w.permute((0, 2, 1, 3))
-    q_w = q_w.reshape((size_k // tile, size_n * tile))
 
-    q_w = q_w.reshape((-1, perm.numel()))[:, perm].reshape(q_w.shape)
+    return torch.zeros(max_workspace_size,
+                       dtype=torch.int,
+                       device=device,
+                       requires_grad=False)
 
-    return q_w
 
+def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
+    return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
+                              requires_grad=False)
 
-def marlin_weights(q_w, size_k, size_n, num_bits, perm):
-    # Permute
-    q_w = marlin_permute_weights(q_w, size_k, size_n, perm)
 
-    # Pack
-    pack_factor = get_pack_factor(num_bits)
-    orig_device = q_w.device
+def marlin_sort_g_idx(
+        g_idx: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
+    return g_idx[g_idx_sort_indices], g_idx_sort_indices
 
-    q_w = q_w.cpu().numpy().astype(numpy.uint32)
 
-    q_packed = numpy.zeros((q_w.shape[0], q_w.shape[1] // pack_factor),
-                           dtype=numpy.uint32)
-    for i in range(pack_factor):
-        q_packed |= q_w[:, i::pack_factor] << num_bits * i
+def get_scale_perms():
+    scale_perm: List[int] = []
+    for i in range(8):
+        scale_perm.extend([i + 8 * j for j in range(8)])
+    scale_perm_single: List[int] = []
+    for i in range(4):
+        scale_perm_single.extend(
+            [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
+    return scale_perm, scale_perm_single
 
-    q_packed = torch.from_numpy(q_packed.astype(numpy.int32)).to(orig_device)
 
-    return q_packed
+def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
+                          group_size: int) -> torch.Tensor:
 
-
-def marlin_permute_scales(s, size_k, size_n, group_size, scale_perm,
-                          scale_perm_single):
+    scale_perm, scale_perm_single = get_scale_perms()
     if group_size < size_k and group_size != -1:
         s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
     else:
@@ -163,180 +126,44 @@ def marlin_permute_scales(s, size_k, size_n, group_size, scale_perm,
     return s
 
 
-def marlin_quantize(
-    w: torch.Tensor,
-    num_bits: int,
-    group_size: int,
-    act_order: bool,
-):
-    size_k, size_n = w.shape
-
-    # Normalize group_size
-    if group_size == -1:
-        group_size = size_k
-    assert group_size <= size_k
-
-    # Quantize (and apply act_order if provided)
-    w_ref, q_w, s, g_idx, rand_perm = quantize_weights(w, num_bits, group_size,
-                                                       act_order)
-
-    # For act_order, sort the "weights" and "g_idx" so that group ids are
-    # increasing
-    sort_indices = torch.empty(0, dtype=torch.int, device=w.device)
-    if act_order:
-        q_w, g_idx, sort_indices = sort_weights(q_w, g_idx)
-
-    # Reformat to marlin
-    marlin_q_w = marlin_weights(q_w, size_k, size_n, num_bits,
-                                marlin_perm[num_bits])
-    marlin_s = marlin_permute_scales(s, size_k, size_n, group_size,
-                                     marlin_scale_perm[num_bits],
-                                     marlin_scale_perm_single[num_bits])
-
-    # Create result
-    res_list = [w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm]
-    for i in range(len(res_list)):
-        res_list[i] = res_list[i].to(w.device)
-
-    return res_list
-
-
-def inject_24(w, size_k, size_n):
-    assert w.shape == (size_k, size_n)
-
-    mask = mask_creator(w.t()).t().cuda().bool()
-
-    return (mask * w).contiguous(), mask.contiguous()
-
-
-def check_24(w, num_rows_to_sample=50, _verbose=False):
-    BLOCK_SIZE = 4
-    MAX_NON_ZEROS = 2
-
-    w = w.t().contiguous()
-
-    print("check_24: w.shape = {}".format(w.shape))
-
-    num_rows, num_cols = w.shape
-    sampled_row_idxs = random.choices(range(num_rows), k=num_rows_to_sample)
-    if _verbose:
-        print(f"Sampled row idxs = {sampled_row_idxs}")
-
-    total_segments = 0
-    non_24_segments = 0
-    for i in sampled_row_idxs:
-        for j in range(0, num_cols - BLOCK_SIZE, BLOCK_SIZE):
-            total_segments += 1
-            block = w[i, j:j + BLOCK_SIZE]
-            num_nonzero = torch.count_nonzero(block)
-            if num_nonzero > MAX_NON_ZEROS:
-                print("i = {} j = {} block = {}".format(i, j, block))
-                non_24_segments += 1
-
-    print(f"{non_24_segments} / {total_segments} do not have 2:4 structure.")
-
-
-def compress_quantized_24_weight(q_24, size_k, size_n, num_bits):
-    assert q_24.shape == (size_k, size_n)
-
-    # Remove zp to normalize over 0
-    max_q_val = (1 << num_bits) - 1
-    zp = (max_q_val + 1) // 2
-    q_24_no_zp = q_24 - zp
-
-    # Compress
-    q_24_no_zp = q_24_no_zp.t().contiguous()
-    q_24_no_zp_comp, meta = sparse_semi_structured_from_dense_cutlass(
-        q_24_no_zp)
-    q_24_no_zp_comp = q_24_no_zp_comp.t().contiguous()
-
-    # Restore zp
-    q_24_comp = q_24_no_zp_comp + zp
-
-    # Resize meta to its actual shape (without moving any data)
-    meta = meta.resize_(meta.shape[1] // 2, meta.shape[0] * 2)
-
-    return q_24_comp, meta
-
-
-def marlin_24_quantize(
-    w: torch.Tensor,
-    num_bits: int,
-    group_size: int,
-):
-    size_k, size_n = w.shape
-
-    # Normalize group_size
-    if group_size == -1:
-        group_size = size_k
-    assert group_size <= size_k
-
-    # Inject 2:4 sparsity
-    w_24, mask_24 = inject_24(w, size_k, size_n)
-
-    # Quantize
-    w_24_ref, q_w_24, s, g_idx, rand_perm = quantize_weights(w_24,
-                                                             num_bits,
-                                                             group_size,
-                                                             act_order=False)
-
-    # Compress quantized weight
-    q_w_24_comp, meta = compress_quantized_24_weight(q_w_24, size_k, size_n,
-                                                     num_bits)
-    size_k_comp = size_k // 2
-
-    # Reformat to marlin
-    marlin_24_q_w_comp = marlin_weights(q_w_24_comp, size_k_comp, size_n,
-                                        num_bits, marlin_24_perm[num_bits])
-    marlin_24_s = marlin_permute_scales(s, size_k, size_n, group_size,
-                                        marlin_24_scale_perm[num_bits],
-                                        marlin_24_scale_perm_single[num_bits])
-
-    # Create result
-    res_list = [w_24_ref, marlin_24_q_w_comp, meta, marlin_24_s]
-    for i in range(len(res_list)):
-        res_list[i] = res_list[i].to(w.device)
-
-    return res_list
-
-
-def compute_max_diff(output, output_ref):
-    return torch.mean(torch.abs(output - output_ref)) / torch.mean(
-        torch.abs(output_ref))
-
-
-class MarlinWorkspace:
-
-    def __init__(self, out_features, min_thread_n, max_parallel):
-        assert (out_features % min_thread_n == 0), (
-            "out_features = {} is undivisible by min_thread_n = {}".format(
-                out_features, min_thread_n))
-
-        max_workspace_size = ((out_features // min_thread_n) * max_parallel)
-
-        self.scratch = torch.zeros(max_workspace_size,
-                                   dtype=torch.int,
-                                   device="cuda")
-
-
-def pack_fp8_to_int32(fp8_tensor: torch.Tensor) -> torch.Tensor:
-    """
-    Repack FP8 weights to gptq format (packed int32 elements)
-    """
-    assert fp8_tensor.dtype == torch.float8_e4m3fn
-    assert fp8_tensor.shape[0] % 4 == 0
-
-    # Reshape to prepare for packing
-    reshaped = fp8_tensor.reshape(-1, 4, *fp8_tensor.shape[1:])
-
-    # Convert fp8 to uint8 (byte) representation
-    byte_tensor = reshaped.view(torch.uint8)
+# Newly generated tensors need to replace existing tensors that are
+# already registered as parameters by vLLM (and won't be freed)
+def replace_tensor(layer: torch.nn.Module, name: str,
+                   new_t: torch.Tensor) -> None:
+    # It is important to use resize_() here since it ensures
+    # the same buffer is reused
+    getattr(layer, name).resize_(new_t.shape)
+    getattr(layer, name).copy_(new_t)
+    del new_t
+
+
+def apply_marlin_linear(input: torch.Tensor,
+                        weight: torch.Tensor,
+                        weight_scale: torch.Tensor,
+                        g_idx: torch.Tensor,
+                        g_idx_sort_indices: torch.Tensor,
+                        workspace: torch.Tensor,
+                        num_bits: int,
+                        output_size_per_partition: int,
+                        input_size_per_partition: int,
+                        is_k_full: bool,
+                        bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    reshaped_x = input.reshape(-1, input.shape[-1])
+    out_shape = input.shape[:-1] + (output_size_per_partition, )
+
+    output = ops.gptq_marlin_gemm(reshaped_x,
+                                  weight,
+                                  weight_scale,
+                                  g_idx,
+                                  g_idx_sort_indices,
+                                  workspace,
+                                  num_bits,
+                                  size_m=reshaped_x.shape[0],
+                                  size_n=output_size_per_partition,
+                                  size_k=input_size_per_partition,
+                                  is_k_full=is_k_full)
 
-    # Pack 4 uint8 values into one int32
-    packed = (byte_tensor[:, 0].to(torch.int32) |
-              (byte_tensor[:, 1].to(torch.int32) << 8) |
-              (byte_tensor[:, 2].to(torch.int32) << 16) |
-              (byte_tensor[:, 3].to(torch.int32) << 24))
+    if bias is not None:
+        output.add_(bias)  # In-place add
 
-    return packed.view(fp8_tensor.shape[0] // 4,
-                       *fp8_tensor.shape[1:]).contiguous()
+    return output.reshape(out_shape)
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
new file mode 100644
index 000000000..e93eb747b
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
@@ -0,0 +1,109 @@
+from typing import Optional
+
+import torch
+
+import vllm._custom_ops as ops
+from vllm.platforms import current_platform
+from vllm.utils import print_warning_once
+
+from .marlin_utils import marlin_make_workspace, marlin_permute_scales
+
+
+def is_fp8_marlin_supported():
+    capability = current_platform.get_device_capability()
+    return capability[0] >= 8
+
+
+def apply_fp8_marlin_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    workspace: torch.Tensor,
+    size_n: int,
+    size_k: int,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+    # For GPUs that lack FP8 hardware support, we can leverage the
+    # Marlin kernel for fast weight-only FP8 quantization
+
+    reshaped_x = input.reshape(-1, input.shape[-1])
+    out_shape = input.shape[:-1] + (size_n, )
+
+    output = ops.fp8_marlin_gemm(
+        a=reshaped_x,
+        b_q_weight=weight,
+        b_scales=weight_scale,
+        workspace=workspace,
+        num_bits=8,
+        size_m=reshaped_x.shape[0],
+        size_n=size_n,
+        size_k=size_k,
+    )
+
+    if bias is not None:
+        output.add_(bias)  # In-place add
+
+    return output.reshape(out_shape)
+
+
+def prepare_fp8_layer_for_marlin(layer: torch.nn.Module) -> None:
+    print_warning_once(
+        "Your GPU does not have native support for FP8 computation but "
+        "FP8 quantization is being used. Weight-only FP8 compression will "
+        "be used leveraging the Marlin kernel. This may degrade "
+        "performance for compute-heavy workloads.")
+
+    part_size_n = layer.output_size_per_partition
+    part_size_k = layer.input_size_per_partition
+
+    device = layer.weight.device
+
+    # WORKSPACE
+    layer.workspace = marlin_make_workspace(part_size_n, device)
+
+    # WEIGHT
+    # Repack weights to marlin format
+    marlin_qweight = ops.gptq_marlin_repack(b_q_weight=pack_fp8_to_int32(
+        layer.weight),
+                                            perm=torch.empty(0,
+                                                             dtype=torch.int,
+                                                             device=device),
+                                            size_k=part_size_k,
+                                            size_n=part_size_n,
+                                            num_bits=8)
+    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
+
+    # WEIGHT SCALES
+    # Currently Marlin doesn't support per-tensor scales, so we
+    # expand it to channelwise
+    scales = layer.weight_scale.repeat(1, part_size_n).to(
+        layer.orig_dtype).to(device)
+    # Permute scales
+    marlin_scales = marlin_permute_scales(s=scales,
+                                          size_k=part_size_k,
+                                          size_n=part_size_n,
+                                          group_size=-1)
+    layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False)
+
+
+def pack_fp8_to_int32(fp8_tensor: torch.Tensor) -> torch.Tensor:
+    """
+    Repack FP8 weights to gptq format (packed int32 elements)
+    """
+    assert fp8_tensor.dtype == torch.float8_e4m3fn
+    assert fp8_tensor.shape[0] % 4 == 0
+
+    # Reshape to prepare for packing
+    reshaped = fp8_tensor.reshape(-1, 4, *fp8_tensor.shape[1:])
+
+    # Convert fp8 to uint8 (byte) representation
+    byte_tensor = reshaped.view(torch.uint8)
+
+    # Pack 4 uint8 values into one int32
+    packed = (byte_tensor[:, 0].to(torch.int32) |
+              (byte_tensor[:, 1].to(torch.int32) << 8) |
+              (byte_tensor[:, 2].to(torch.int32) << 16) |
+              (byte_tensor[:, 3].to(torch.int32) << 24))
+
+    return packed.view(fp8_tensor.shape[0] // 4,
+                       *fp8_tensor.shape[1:]).contiguous()
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
new file mode 100644
index 000000000..1773748a0
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
@@ -0,0 +1,120 @@
+"""Utility functions used for tests and benchmarks"""
+
+from typing import List
+
+import numpy
+import torch
+
+from .marlin_utils import GPTQ_MARLIN_TILE, marlin_permute_scales
+from .quant_utils import get_pack_factor, quantize_weights, sort_weights
+
+
+class MarlinWorkspace:
+
+    def __init__(self, out_features, min_thread_n, max_parallel):
+        assert (out_features % min_thread_n == 0), (
+            "out_features = {} is undivisible by min_thread_n = {}".format(
+                out_features, min_thread_n))
+
+        max_workspace_size = ((out_features // min_thread_n) * max_parallel)
+
+        self.scratch = torch.zeros(max_workspace_size,
+                                   dtype=torch.int,
+                                   device="cuda")
+
+
+def marlin_permute_weights(q_w, size_k, size_n, perm, tile=GPTQ_MARLIN_TILE):
+    assert q_w.shape == (size_k, size_n)
+    assert size_k % tile == 0, f"size_k = {size_k}, tile = {tile}"
+    assert size_n % tile == 0, f"size_k = {size_n}, tile = {tile}"
+
+    # Permute weights to 16x64 marlin tiles
+    q_w = q_w.reshape((size_k // tile, tile, size_n // tile, tile))
+    q_w = q_w.permute((0, 2, 1, 3))
+    q_w = q_w.reshape((size_k // tile, size_n * tile))
+
+    q_w = q_w.reshape((-1, perm.numel()))[:, perm].reshape(q_w.shape)
+
+    return q_w
+
+
+def marlin_weights(q_w, size_k, size_n, num_bits, perm):
+    # Permute
+    q_w = marlin_permute_weights(q_w, size_k, size_n, perm)
+
+    # Pack
+    pack_factor = get_pack_factor(num_bits)
+    orig_device = q_w.device
+
+    q_w = q_w.cpu().numpy().astype(numpy.uint32)
+
+    q_packed = numpy.zeros((q_w.shape[0], q_w.shape[1] // pack_factor),
+                           dtype=numpy.uint32)
+    for i in range(pack_factor):
+        q_packed |= q_w[:, i::pack_factor] << num_bits * i
+
+    q_packed = torch.from_numpy(q_packed.astype(numpy.int32)).to(orig_device)
+
+    return q_packed
+
+
+def get_weight_perm(num_bits: int):
+    perm_list: List[int] = []
+    for i in range(32):
+        perm1: List[int] = []
+        col = i // 4
+        for block in [0, 1]:
+            for row in [
+                    2 * (i % 4),
+                    2 * (i % 4) + 1,
+                    2 * (i % 4 + 4),
+                    2 * (i % 4 + 4) + 1,
+            ]:
+                perm1.append(16 * row + col + 8 * block)
+        for j in range(4):
+            perm_list.extend([p + 256 * j for p in perm1])
+
+    perm = numpy.array(perm_list)
+
+    if num_bits == 4:
+        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
+    elif num_bits == 8:
+        interleave = numpy.array([0, 2, 1, 3])
+    else:
+        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))
+
+    perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
+    perm = torch.from_numpy(perm)
+    return perm
+
+
+def marlin_quantize(w: torch.Tensor, num_bits: int, group_size: int,
+                    act_order: bool):
+    size_k, size_n = w.shape
+
+    # Normalize group_size
+    if group_size == -1:
+        group_size = size_k
+    assert group_size <= size_k
+
+    # Quantize (and apply act_order if provided)
+    w_ref, q_w, s, g_idx, rand_perm = quantize_weights(w, num_bits, group_size,
+                                                       act_order)
+
+    # For act_order, sort the "weights" and "g_idx" so that group ids are
+    # increasing
+    sort_indices = torch.empty(0, dtype=torch.int, device=w.device)
+    if act_order:
+        q_w, g_idx, sort_indices = sort_weights(q_w, g_idx)
+
+    # Reformat to marlin
+    weight_perm = get_weight_perm(num_bits)
+    marlin_q_w = marlin_weights(q_w, size_k, size_n, num_bits, weight_perm)
+    marlin_s = marlin_permute_scales(s, size_k, size_n, group_size)
+
+    # Create result
+    res_list = [w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm]
+    for i in range(len(res_list)):
+        res_list[i] = res_list[i].to(w.device)
+
+    return res_list
diff --git a/vllm/model_executor/layers/quantization/utils/format_24.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
similarity index 71%
rename from vllm/model_executor/layers/quantization/utils/format_24.py
rename to vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
index 01c8cf789..648c32249 100644
--- a/vllm/model_executor/layers/quantization/utils/format_24.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
@@ -1,9 +1,14 @@
-#
-# Modified by Roberto Lopez Castro (roberto.lopez.castro@udc.es).
-#
+"""Utility functions used for tests and benchmarks"""
 
+import random
+from typing import List
+
+import numpy
 import torch
 
+from .marlin_utils_test import marlin_weights
+from .quant_utils import quantize_weights
+
 
 # This is PyTorch implementation of main part of reorder_meta()
 # function, from tools/util/include/cutlass/util/host_reorder.h file
@@ -306,3 +311,155 @@ def mask_creator(tensor):
     mask = w_b.scatter_(dim=1, index=index, value=0).reshape(tensor.shape)
 
     return mask
+
+
+def inject_24(w, size_k, size_n):
+    assert w.shape == (size_k, size_n)
+
+    mask = mask_creator(w.t()).t().cuda().bool()
+
+    return (mask * w).contiguous(), mask.contiguous()
+
+
+def check_24(w, num_rows_to_sample=50, _verbose=False):
+    BLOCK_SIZE = 4
+    MAX_NON_ZEROS = 2
+
+    w = w.t().contiguous()
+
+    print("check_24: w.shape = {}".format(w.shape))
+
+    num_rows, num_cols = w.shape
+    sampled_row_idxs = random.choices(range(num_rows), k=num_rows_to_sample)
+    if _verbose:
+        print(f"Sampled row idxs = {sampled_row_idxs}")
+
+    total_segments = 0
+    non_24_segments = 0
+    for i in sampled_row_idxs:
+        for j in range(0, num_cols - BLOCK_SIZE, BLOCK_SIZE):
+            total_segments += 1
+            block = w[i, j:j + BLOCK_SIZE]
+            num_nonzero = torch.count_nonzero(block)
+            if num_nonzero > MAX_NON_ZEROS:
+                print("i = {} j = {} block = {}".format(i, j, block))
+                non_24_segments += 1
+
+    print(f"{non_24_segments} / {total_segments} do not have 2:4 structure.")
+
+
+def compress_quantized_24_weight(q_24, size_k, size_n, num_bits):
+    assert q_24.shape == (size_k, size_n)
+
+    # Remove zp to normalize over 0
+    max_q_val = (1 << num_bits) - 1
+    zp = (max_q_val + 1) // 2
+    q_24_no_zp = q_24 - zp
+
+    # Compress
+    q_24_no_zp = q_24_no_zp.t().contiguous()
+    q_24_no_zp_comp, meta = sparse_semi_structured_from_dense_cutlass(
+        q_24_no_zp)
+    q_24_no_zp_comp = q_24_no_zp_comp.t().contiguous()
+
+    # Restore zp
+    q_24_comp = q_24_no_zp_comp + zp
+
+    # Resize meta to its actual shape (without moving any data)
+    meta = meta.resize_(meta.shape[1] // 2, meta.shape[0] * 2)
+
+    return q_24_comp, meta
+
+
+def get_scale_perms_24():
+    scale_perm: List[int] = []
+    for i in range(8):
+        scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]])
+    scale_perm_single: List[int] = []
+    for i in range(8):
+        scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]])
+    return scale_perm, scale_perm_single
+
+
+def get_weight_perm_24(num_bits: int):
+    perm_list: List[int] = []
+    for i in range(32):
+        perm1: List[int] = []
+        col = i // 4
+        col_o = col // 2
+        for block in [0, 1]:
+            for row in [
+                    2 * (i % 4),
+                    2 * (i % 4) + 1,
+                    2 * (i % 4 + 4),
+                    2 * (i % 4 + 4) + 1,
+            ]:
+                perm1.append(16 * row + col_o * 256 + 8 * (col % 2) +
+                             4 * block)
+        for j in range(4):
+            perm_list.extend([p + 1 * j for p in perm1])
+    perm = numpy.array(perm_list)
+
+    if num_bits == 4:
+        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
+    elif num_bits == 8:
+        interleave = numpy.array([0, 2, 1, 3])
+    else:
+        raise ValueError("num_bits must be 4 or 8, got {}".format(num_bits))
+
+    perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
+    perm = torch.from_numpy(perm)
+    return perm
+
+
+def marlin_permute_scales_24(s: torch.Tensor, size_k: int, size_n: int,
+                             group_size: int) -> torch.Tensor:
+
+    scale_perm, scale_perm_single = get_scale_perms_24()
+    if group_size < size_k and group_size != -1:
+        s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
+    else:
+        s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
+    s = s.reshape((-1, size_n)).contiguous()
+
+    return s
+
+
+def marlin_24_quantize(
+    w: torch.Tensor,
+    num_bits: int,
+    group_size: int,
+):
+    size_k, size_n = w.shape
+
+    # Normalize group_size
+    if group_size == -1:
+        group_size = size_k
+    assert group_size <= size_k
+
+    # Inject 2:4 sparsity
+    w_24, mask_24 = inject_24(w, size_k, size_n)
+
+    # Quantize
+    w_24_ref, q_w_24, s, g_idx, rand_perm = quantize_weights(w_24,
+                                                             num_bits,
+                                                             group_size,
+                                                             act_order=False)
+
+    # Compress quantized weight
+    q_w_24_comp, meta = compress_quantized_24_weight(q_w_24, size_k, size_n,
+                                                     num_bits)
+    size_k_comp = size_k // 2
+
+    # Reformat to marlin
+    weight_perm = get_weight_perm_24(num_bits)
+    marlin_24_q_w_comp = marlin_weights(q_w_24_comp, size_k_comp, size_n,
+                                        num_bits, weight_perm)
+    marlin_24_s = marlin_permute_scales_24(s, size_k, size_n, group_size)
+
+    # Create result
+    res_list = [w_24_ref, marlin_24_q_w_comp, meta, marlin_24_s]
+    for i in range(len(res_list)):
+        res_list[i] = res_list[i].to(w.device)
+
+    return res_list
-- 
GitLab


From 52b7fcb35a6f8b57429431e929884c05d8266023 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Thu, 11 Jul 2024 09:17:07 -0700
Subject: [PATCH 316/376] Benchmark: add H100 suite (#6047)

---
 .../benchmark-pipeline.yaml                   | 35 +++++++++----------
 .../run-benchmarks-suite.sh                   | 28 ++++++++++++---
 2 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
index 2b25c954b..02c0ee534 100644
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -11,7 +11,7 @@ steps:
             - sh
             - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
   - wait
-  - label: "A100 Benchmark"
+  - label: "A100"
     agents:
       queue: A100
     plugins:
@@ -42,21 +42,20 @@ steps:
           - name: devshm
             emptyDir:
               medium: Memory
-  # - label: "H100: NVIDIA SMI"
-  #   agents:
-  #     queue: H100
-  #   plugins:
-  #   - docker#v5.11.0:
-  #       image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-  #       command:
-  #       - bash
-  #       - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
-  #       mount-buildkite-agent: true
-  #       propagate-environment: true
-  #       propagate-uid-gid: false
-  #       ipc: host
-  #       gpus: all
-  #       environment:
-  #       - VLLM_USAGE_SOURCE
-  #       - HF_TOKEN
+  - label: "H100"
+    agents:
+      queue: H100
+    plugins:
+    - docker#v5.11.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: all
+        environment:
+        - VLLM_USAGE_SOURCE
+        - HF_TOKEN
 
diff --git a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
index 021473f76..04b02adf3 100644
--- a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
@@ -54,7 +54,7 @@ wait_for_server() {
   # wait for vllm server to start
   # return 1 if vllm server crashes
   timeout 1200 bash -c '
-    until curl localhost:8000/v1/completions; do
+    until curl -X POST localhost:8000/v1/completions; do
       sleep 1
     done' && return 0 || return 1
 }
@@ -73,8 +73,17 @@ kill_gpu_processes() {
       echo "All GPU processes have been killed."
   fi
 
+  # Sometimes kill with pid doesn't work properly, we can also kill all process running python or python3
+  # since we are in container anyway
+  pkill -9 -f python
+  pkill -9 -f python3
+
   # waiting for GPU processes to be fully killed
-  sleep 10
+  # loop while nvidia-smi returns any processes
+  while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
+    sleep 1
+    echo "Waiting for GPU processes to be killed"
+  done
 
   # remove vllm config file
   rm -rf ~/.config/vllm
@@ -90,12 +99,19 @@ upload_to_buildkite() {
   # upload the benchmarking results to buildkite
 
   # if the agent binary is not found, skip uploading the results, exit 0
-  if [ ! -f /workspace/buildkite-agent ]; then
+  # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
+  if command -v buildkite-agent >/dev/null 2>&1; then
+    BUILDKITE_AGENT_COMMAND="buildkite-agent"
+  elif [ -f /workspace/buildkite-agent ]; then
+    BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
+  else
     echo "buildkite-agent binary not found. Skip uploading the results."
     return 0
   fi
-  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
-  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+
+  # Use the determined command to annotate and upload artifacts
+  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
+  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }
 
 run_latency_tests() {
@@ -269,6 +285,7 @@ run_serving_tests() {
     echo "Running test case $test_name"
     echo "Server command: $server_command"
     eval "$server_command" &
+    server_pid=$!
 
     # wait until the server is alive
     wait_for_server
@@ -318,6 +335,7 @@ run_serving_tests() {
     done
 
     # clean up
+    kill -9 $server_pid
     kill_gpu_processes
   done
 }
-- 
GitLab


From 1df43de9bb2cceecdc0dc2dc5c650a327aeabe0f Mon Sep 17 00:00:00 2001
From: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com>
Date: Thu, 11 Jul 2024 10:21:10 -0700
Subject: [PATCH 317/376] [bug fix] Fix llava next feature size calculation.
 (#6339)

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
---
 tests/models/test_llava_next.py          | 14 +++++++++++++-
 vllm/model_executor/models/llava_next.py | 18 ++++++++++--------
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 581cbcf90..163741a57 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -1,8 +1,10 @@
 from typing import List, Optional, Tuple
 
 import pytest
-from transformers import AutoTokenizer
+from transformers import AutoConfig, AutoTokenizer
 
+from vllm.model_executor.models.llava_next import (
+    get_llava_next_image_feature_size)
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 
@@ -120,3 +122,13 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
             name_0="hf",
             name_1="vllm",
         )
+
+
+@pytest.mark.parametrize("height_and_width_and_result", [(1669, 2560, 2144),
+                                                         (183, 488, 776)])
+def test_image_feature_size(height_and_width_and_result):
+    height, width, result = height_and_width_and_result
+    config = AutoConfig.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
+    assert get_llava_next_image_feature_size(config,
+                                             input_height=height,
+                                             input_width=width) == result
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 7e06f1e95..9369ec89f 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -74,19 +74,21 @@ def _get_llava_next_num_unpadded_features(
 ) -> Tuple[int, int]:
     current_height = npatches * num_patch_height
     current_width = npatches * num_patch_width
+    current_height = torch.tensor(current_height).to("cuda")
+    current_width = torch.tensor(current_width).to("cuda")
 
     aspect_ratio: float = width / height
     current_aspect_ratio: float = current_width / current_height
     if aspect_ratio > current_aspect_ratio:
-        new_height = (height * current_width) // width
-        if new_height % 2 == 1:
-            new_height += 1
-        current_height = new_height
+        scale_factor = current_width / width
+        new_height = int(height * scale_factor)
+        padding = (current_height - new_height) // 2
+        current_height -= padding * 2
     else:
-        new_width = (width * current_height) // height
-        if new_width % 2 == 1:
-            new_width += 1
-        current_width = new_width
+        scale_factor = current_height / height
+        new_width = int(width * scale_factor)
+        padding = (current_width - new_width) // 2
+        current_width -= padding * 2
 
     unpadded_features = current_height * current_width
     newline_features = current_height
-- 
GitLab


From 2d23b42d9255f724f955a1cf91ed78c983854737 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 11 Jul 2024 11:38:40 -0700
Subject: [PATCH 318/376] [doc] update pipeline parallel in readme (#6347)

---
 README.md             | 2 +-
 docs/source/index.rst | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index cced85f17..dac4b513c 100644
--- a/README.md
+++ b/README.md
@@ -56,7 +56,7 @@ vLLM is flexible and easy to use with:
 
 - Seamless integration with popular Hugging Face models
 - High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
-- Tensor parallelism support for distributed inference
+- Tensor parallelism and pipieline parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
 - Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 67c039f25..174d91b8d 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -38,7 +38,7 @@ vLLM is flexible and easy to use with:
 
 * Seamless integration with popular HuggingFace models
 * High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
-* Tensor parallelism support for distributed inference
+* Tensor parallelism and pipieline parallelism support for distributed inference
 * Streaming outputs
 * OpenAI-compatible API server
 * Support NVIDIA GPUs and AMD GPUs
-- 
GitLab


From a4feba929b61f287ae0b7407c3d615c6dec193d6 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 11 Jul 2024 13:28:38 -0700
Subject: [PATCH 319/376] [CI/Build] Add nightly benchmarking for tgi,
 tensorrt-llm and lmdeploy (#5362)

---
 .buildkite/nightly-benchmarks/README.md       |   1 +
 .../nightly-benchmarks/kickoff-pipeline.sh    |  27 ---
 .../nightly-descriptions.md                   |  45 ++++
 .../nightly-benchmarks/nightly-pipeline.yaml  | 120 ++++++++++
 .../nightly-benchmarks/run-nightly-suite.sh   |  76 ++++++
 .../scripts/download-tokenizer.py             |  26 +++
 .../scripts/get-lmdeploy-modelname.py         |   6 +
 .../scripts/launch-trt-server.sh              | 102 ++++++++
 .../scripts/nightly-annotate.sh               |  40 ++++
 .../scripts/plot-nightly-results.py           | 135 +++++++++++
 .../scripts/run-lmdeploy-nightly.sh           | 218 +++++++++++++++++
 .../scripts/run-tgi-nightly.sh                | 216 +++++++++++++++++
 .../scripts/run-trt-nightly.sh                | 214 +++++++++++++++++
 .../scripts/run-vllm-nightly.sh               | 221 ++++++++++++++++++
 .../scripts/summary-nightly-results.py        |  76 ++++++
 .../tests/nightly-tests.json                  | 116 +++++++++
 README.md                                     |   2 +
 benchmarks/benchmark_serving.py               |   9 +
 18 files changed, 1623 insertions(+), 27 deletions(-)
 delete mode 100755 .buildkite/nightly-benchmarks/kickoff-pipeline.sh
 create mode 100644 .buildkite/nightly-benchmarks/nightly-descriptions.md
 create mode 100644 .buildkite/nightly-benchmarks/nightly-pipeline.yaml
 create mode 100644 .buildkite/nightly-benchmarks/run-nightly-suite.sh
 create mode 100644 .buildkite/nightly-benchmarks/scripts/download-tokenizer.py
 create mode 100644 .buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
 create mode 100644 .buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
 create mode 100644 .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
 create mode 100644 .buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
 create mode 100644 .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
 create mode 100644 .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
 create mode 100644 .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
 create mode 100644 .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
 create mode 100644 .buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
 create mode 100644 .buildkite/nightly-benchmarks/tests/nightly-tests.json

diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index 4036b32a4..c84e15093 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -1,5 +1,6 @@
 # vLLM benchmark suite
 
+
 ## Introduction
 
 This directory contains the performance benchmarking CI for vllm.
diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
deleted file mode 100755
index 15d411feb..000000000
--- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env bash
-
-# NOTE(simon): this script runs inside a buildkite agent with CPU only access.
-set -euo pipefail
-
-# Install system packages
-apt update
-apt install -y curl jq
-
-# Install minijinja for templating
-curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
-source $HOME/.cargo/env
-
-# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
-if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
-  PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
-
-  if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
-    echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks."
-  else
-    echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks."
-    exit 0
-  fi
-fi
-
-# Upload sample.yaml
-buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
new file mode 100644
index 000000000..c3d3cbf47
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -0,0 +1,45 @@
+
+# Nightly benchmark
+
+The main goal of this benchmarking is two-fold:
+- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance in what workload.
+- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md]().
+
+
+## Docker images
+
+We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
+- vllm/vllm-openai:v0.5.0.post1
+- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+- openmmlab/lmdeploy:v0.5.0
+- ghcr.io/huggingface/text-generation-inference:2.1
+
+<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->
+
+
+## Hardware
+
+One AWS node with 8x NVIDIA A100 GPUs.
+
+
+## Workload description
+
+We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
+
+- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed).
+- Output length: the corresponding output length of these 500 prompts.
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
+- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
+
+<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
+
+## Plots
+
+In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed.
+
+<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
+
+## Results
+
+{nightly_results_benchmarking_table}
diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
new file mode 100644
index 000000000..6e399bb93
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -0,0 +1,120 @@
+common_pod_spec: &common_pod_spec
+  priorityClassName: perf-benchmark
+  nodeSelector:
+    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  volumes:
+    - name: devshm
+      emptyDir:
+        medium: Memory
+    - name: hf-cache
+      hostPath:
+        path: /root/.cache/huggingface
+        type: Directory
+
+common_container_settings: &common_container_settings
+  command:
+    - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+  resources:
+    limits:
+      nvidia.com/gpu: 8
+  volumeMounts:
+    - name: devshm
+      mountPath: /dev/shm
+    - name: hf-cache
+      mountPath: /root/.cache/huggingface
+  env:
+    - name: VLLM_USAGE_SOURCE
+      value: ci-test
+    - name: HF_HOME
+      value: /root/.cache/huggingface
+    - name: VLLM_SOURCE_CODE_LOC
+      value: /workspace/build/buildkite/vllm/performance-benchmark
+    - name: HF_TOKEN
+      valueFrom:
+        secretKeyRef:
+          name: hf-token-secret
+          key: token
+
+steps:
+  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
+  - label: "A100 trt benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+                <<: *common_container_settings
+
+  - label: "A100 lmdeploy benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: openmmlab/lmdeploy:v0.5.0
+                <<: *common_container_settings
+  
+
+  - label: "A100 vllm benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: vllm/vllm-openai:latest 
+                <<: *common_container_settings
+
+  - label: "A100 tgi benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: ghcr.io/huggingface/text-generation-inference:2.1 
+                <<: *common_container_settings
+        
+  - wait
+
+  - label: "Plot"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+            - image: vllm/vllm-openai:v0.5.0.post1
+              command:
+              - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+              resources:
+                limits:
+                  nvidia.com/gpu: 8
+              volumeMounts:
+              - name: devshm
+                mountPath: /dev/shm
+              env:
+              - name: VLLM_USAGE_SOURCE
+                value: ci-test
+              - name: VLLM_SOURCE_CODE_LOC
+                value: /workspace/build/buildkite/vllm/performance-benchmark
+              - name: HF_TOKEN
+                valueFrom:
+                  secretKeyRef:
+                    name: hf-token-secret
+                    key: token
+
+  - wait
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
new file mode 100644
index 000000000..627a3e697
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+
+set -o pipefail
+set -x
+
+check_gpus() {
+    # check the number of GPUs and GPU type.
+    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+    if [[ $gpu_count -gt 0 ]]; then
+        echo "GPU found."
+    else
+        echo "Need at least 1 GPU to run benchmarking."
+        exit 1
+    fi
+    declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+    echo "GPU type is $gpu_type"
+}
+
+check_hf_token() {
+    # check if HF_TOKEN is available and valid
+    if [[ -z "$HF_TOKEN" ]]; then
+        echo "Error: HF_TOKEN is not set."
+        exit 1
+    elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
+        echo "Error: HF_TOKEN does not start with 'hf_'."
+        exit 1
+    else
+        echo "HF_TOKEN is set and valid."
+    fi
+}
+
+main() {
+
+    check_gpus
+    check_hf_token
+
+    df -h
+
+    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+    (which jq) || (apt-get update && apt-get -y install jq)
+
+    cd $VLLM_SOURCE_CODE_LOC/benchmarks
+    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+    
+
+    # run lmdeploy
+    if which lmdeploy >/dev/null; then
+        echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
+        bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+        exit 0
+    fi
+
+    # run tgi
+    if [ -e /tgi-entrypoint.sh ]; then
+        echo "tgi is available, redirect to run-tgi-nightly.sh"
+        bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+        exit 0
+    fi
+
+    # run trt
+    if which trtllm-build >/dev/null; then
+        echo "trtllm is available, redirect to run-trt-nightly.sh"
+        bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+        exit 0
+    fi
+
+    # run vllm
+    if [ -e /vllm-workspace ]; then
+        echo "vllm is available, redirect to run-vllm-nightly.sh"
+        bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+        exit 0
+    fi
+
+}
+
+main "$@"
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
new file mode 100644
index 000000000..68ac5909e
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@@ -0,0 +1,26 @@
+import argparse
+
+from transformers import AutoTokenizer
+
+
+def main(model, cachedir):
+    # Load the tokenizer and save it to the specified directory
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    tokenizer.save_pretrained(cachedir)
+    print(f"Tokenizer saved to {cachedir}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Download and save Hugging Face tokenizer")
+    parser.add_argument("--model",
+                        type=str,
+                        required=True,
+                        help="Name of the model")
+    parser.add_argument("--cachedir",
+                        type=str,
+                        required=True,
+                        help="Directory to save the tokenizer")
+
+    args = parser.parse_args()
+    main(args.model, args.cachedir)
diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
new file mode 100644
index 000000000..18bcc3a87
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@@ -0,0 +1,6 @@
+from lmdeploy.serve.openai.api_client import APIClient
+
+api_client = APIClient("http://localhost:8000")
+model_name = api_client.available_models[0]
+
+print(model_name)
diff --git a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
new file mode 100644
index 000000000..f8262653a
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+
+
+server_params=$1
+common_params=$2
+
+
+
+model_path=$(echo "$common_params" | jq -r '.model')
+model_name="${model_path#*/}"
+model_type=$(echo "$server_params" | jq -r '.model_type')
+model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
+model_tp_size=$(echo "$common_params" | jq -r '.tp')
+max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
+max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
+max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
+trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
+
+cd ~
+rm -rf models
+mkdir -p models
+cd models
+models_dir=$(pwd)
+trt_model_path=${models_dir}/${model_name}-trt-ckpt
+trt_engine_path=${models_dir}/${model_name}-trt-engine
+
+cd ~
+rm -rf tensorrt-demo
+git clone https://github.com/neuralmagic/tensorrt-demo.git
+cd tensorrt-demo
+tensorrt_demo_dir=$(pwd)
+
+# make sure the parameter inside tensorrt_demo is consistent to envvar
+sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
+sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
+
+
+cd /
+rm -rf tensorrtllm_backend
+git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
+git lfs install
+cd tensorrtllm_backend
+git checkout $trt_llm_version
+tensorrtllm_backend_dir=$(pwd)
+git submodule update --init --recursive
+cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
+
+cd /tensorrtllm_backend
+cd ./tensorrt_llm/examples/${model_type}
+
+
+if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
+
+    echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
+    echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
+    python ../quantization/quantize.py \
+        --model_dir ${model_path} \
+        --dtype ${model_dtype} \
+        --tp_size ${model_tp_size} \
+        --output_dir ${trt_model_path} \
+        --qformat fp8 \
+        --kv_cache_dtype fp8 \
+        --calib_size 2
+
+else
+
+    echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
+    python3 convert_checkpoint.py \
+        --model_dir ${model_path} \
+        --dtype ${model_dtype} \
+        --tp_size ${model_tp_size} \
+        --output_dir ${trt_model_path}
+
+fi
+
+
+
+trtllm-build \
+--checkpoint_dir=${trt_model_path} \
+--gpt_attention_plugin=${model_dtype} \
+--gemm_plugin=${model_dtype} \
+--remove_input_padding=enable \
+--paged_kv_cache=enable \
+--tp_size=${model_tp_size} \
+--max_batch_size=${max_batch_size} \
+--max_input_len=${max_input_len} \
+--max_output_len=${max_output_len} \
+--max_num_tokens=${max_output_len} \
+--opt_num_tokens=${max_output_len} \
+--output_dir=${trt_engine_path} 
+
+cd /tensorrtllm_backend/triton_model_repo
+rm -rf ./tensorrt_llm/1/*
+cp -r ${trt_engine_path}/* ./tensorrt_llm/1
+cd /tensorrtllm_backend
+python3 scripts/launch_triton_server.py \
+--world_size=${model_tp_size} \
+--model_repo=/tensorrtllm_backend/triton_model_repo &
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
new file mode 100644
index 000000000..1168912c6
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+set -ex
+set -o pipefail
+
+
+main() {
+
+    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+    (which jq) || (apt-get update && apt-get -y install jq)
+
+    if [ ! -f /workspace/buildkite-agent ]; then
+        echo "buildkite-agent binary not found. Skip plotting the results."
+        exit 0
+    fi
+
+    # initial annotation
+    description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
+
+    # download results
+    cd $VLLM_SOURCE_CODE_LOC/benchmarks
+    mkdir -p results/
+    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
+    ls
+    ls results/
+
+    # generate figures
+    python3 -m pip install tabulate pandas matplotlib
+    python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+        --description $description \
+        --results-folder results/
+    
+    # upload results and figures
+    /workspace/buildkite-agent artifact upload "nightly_results.png"
+    /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+    /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
+}
+
+main "$@"
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
new file mode 100644
index 000000000..e5cfcc64a
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -0,0 +1,135 @@
+import argparse
+import json
+import math
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import pandas as pd
+from tabulate import tabulate
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description=
+        'Parse command line arguments for summary-nightly-results script.')
+    parser.add_argument('--results-folder',
+                        type=str,
+                        required=True,
+                        help='The folder where the results are stored.')
+    parser.add_argument('--description',
+                        type=str,
+                        required=True,
+                        help='Description of the results.')
+
+    args = parser.parse_args()
+    return args
+
+
+def main(args):
+    bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
+    results_folder = Path(args.results_folder)
+
+    results = []
+
+    # collect results
+    for test_file in results_folder.glob("*_nightly_results.json"):
+        with open(test_file, "r") as f:
+            results = results + json.loads(f.read())
+
+    # generate markdown table
+    df = pd.DataFrame.from_dict(results)
+
+    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
+
+    with open(args.description, "r") as f:
+        description = f.read()
+
+    description = description.format(
+        nightly_results_benchmarking_table=md_table)
+
+    with open("nightly_results.md", "w") as f:
+        f.write(description)
+
+    plt.rcParams.update({'font.size': 20})
+
+    # plot results
+    fig, axes = plt.subplots(3, 3, figsize=(16, 14))
+    fig.subplots_adjust(hspace=1)
+    methods = ["vllm", "trt", "lmdeploy", "tgi"]
+    for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
+        for j, metric in enumerate(["TTFT", "ITL"]):
+            means, stds = [], []
+            for method in methods:
+                target = df['Test name'].str.contains(model)
+                target = target & df['Engine'].str.contains(method)
+                filtered_df = df[target]
+
+                if filtered_df.empty:
+                    means.append(0.)
+                    stds.append(0.)
+                else:
+                    means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
+                    std = filtered_df[f"Std {metric} (ms)"].values[0]
+                    success = filtered_df["Successful req."].values[0]
+                    stds.append(std / math.sqrt(success))
+
+            print(model, metric)
+            print(means, stds)
+
+            ax = axes[i, j + 1]
+
+            bars = ax.bar(
+                ["vllm", "trt", "lmdeploy", "tgi"],
+                means,
+                yerr=stds,
+                capsize=10,
+            )
+            for idx, bar in enumerate(bars):
+                bar.set_color(bar_colors[idx])
+            ax.set_ylim(bottom=0)
+
+            ax.set_ylabel(f"{metric} (ms)")
+            ax.set_title(f"{model} {metric}")
+            ax.grid(axis='y')
+
+        metric = "Tput"
+        j = 0
+        if True:
+            tputs = []
+            for method in methods:
+                target = df['Test name'].str.contains(model)
+                target = target & df['Engine'].str.contains(method)
+                filtered_df = df[target]
+
+                if filtered_df.empty:
+                    tputs.append(0.)
+                else:
+                    input_tput = filtered_df["Input Tput (tok/s)"].values[0]
+                    output_tput = filtered_df["Output Tput (tok/s)"].values[0]
+                    tputs.append(input_tput + output_tput)
+
+            print(model, metric)
+            print(tputs)
+
+            ax = axes[i, j]
+
+            bars = ax.bar(
+                ["vllm", "trt", "lmdeploy", "tgi"],
+                tputs,
+            )
+            for idx, bar in enumerate(bars):
+                bar.set_color(bar_colors[idx])
+
+            ax.set_ylim(bottom=0)
+
+            ax.set_ylabel("Tput (token/s)")
+            ax.set_title(f"{model} {metric}")
+            ax.grid(axis='y')
+
+    fig.tight_layout()
+    fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)
+
+
+if __name__ == '__main__':
+    args = parse_arguments()
+    main(args)
diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
new file mode 100644
index 000000000..d6f112aaa
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -0,0 +1,218 @@
+#!/bin/bash
+
+set -o pipefail
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+  echo "GPU type is $gpu_type"
+}
+
+kill_gpu_processes() {
+  pkill lmdeploy || true
+  # waiting for GPU processes to be fully killed
+  sleep 10
+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  # wait for vllm server to start
+  # return 1 if vllm server crashes
+  timeout 1200 bash -c '
+    until curl -s localhost:8000/v1/completions > /dev/null; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+    
+    # append lmdeploy to the test name
+    test_name=lmdeploy_$test_name
+    
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+
+    
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
+    client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
+    server_args=$(json2args "$server_params")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    # prepare tokenizer
+    rm -rf /tokenizer_cache
+    mkdir /tokenizer_cache
+    python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
+      --model "$model" \
+      --cachedir /tokenizer_cache
+
+    server_command="lmdeploy serve api_server $model \
+      --tp $tp \
+      --server-port $port \
+      $server_args"
+
+    # run the server
+    echo "Running test case $test_name"
+    echo "Server command: $server_command"
+    bash -c "$server_command" &
+
+    # wait until the server is alive
+    wait_for_server
+    if [ $? -eq 0 ]; then
+      echo ""
+      echo "lmdeploy server is up and running."
+    else
+      echo ""
+      echo "lmdeploy failed to start within the timeout period."
+      break
+    fi
+
+    # get model name
+    model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 benchmark_serving.py \
+        --backend lmdeploy \
+        --tokenizer /tokenizer_cache \
+        --dataset-name $dataset_name \
+        --dataset-path $dataset_path \
+        --num-prompts $num_prompts \
+        --port $port \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        --model \"$model_name\" \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "lmdeploy" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up
+    kill_gpu_processes
+    rm -rf /root/.cache/huggingface/*
+  done
+}
+
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+
+main() {
+
+  check_gpus
+  # enter vllm directory
+  cd $VLLM_SOURCE_CODE_LOC/benchmarks
+
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+  python -m pip install transformers==4.41.2
+
+  export CURRENT_LLM_SERVING_ENGINE=lmdeploy
+  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+  python -m pip install tabulate pandas
+  python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  upload_to_buildkite
+
+}
+
+main "$@"
diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
new file mode 100644
index 000000000..fed03654f
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@@ -0,0 +1,216 @@
+#!/bin/bash
+
+set -o pipefail
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+  echo "GPU type is $gpu_type"
+}
+
+kill_gpu_processes() {
+  pkill text-generation || true
+  # waiting for GPU processes to be fully killed
+  sleep 10
+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  timeout 1200 bash -c '
+    until curl -s localhost:8000/generate_stream > /dev/null; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # append tgi to the test name
+    test_name=tgi_$test_name
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
+    client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
+    server_args=$(json2args "$server_params")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
+      echo "Key 'fp8' exists in common params."
+      server_command="/tgi-entrypoint.sh \
+        --model-id $model \
+        --num-shard $tp \
+        --port $port \
+        --quantize fp8 \
+        $server_args"
+    else
+      echo "Key 'fp8' does not exist in common params."
+      server_command="/tgi-entrypoint.sh \
+        --model-id $model \
+        --num-shard $tp \
+        --port $port \
+        $server_args"
+    fi
+
+
+    
+
+    # run the server
+    echo "Running test case $test_name"
+    echo "Server command: $server_command"
+    eval "$server_command" &
+
+    # wait until the server is alive
+    wait_for_server
+    if [ $? -eq 0 ]; then
+      echo ""
+      echo "tgi server is up and running."
+    else
+      echo ""
+      echo "tgi failed to start within the timeout period."
+      break
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 benchmark_serving.py \
+        --backend tgi \
+        --model $model \
+        --dataset-name $dataset_name \
+        --dataset-path $dataset_path \
+        --num-prompts $num_prompts \
+        --port $port \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "tgi" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up
+    kill_gpu_processes
+    rm -rf /root/.cache/huggingface/*
+  done
+}
+
+
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+main() {
+
+  check_gpus
+  # enter vllm directory
+  cd $VLLM_SOURCE_CODE_LOC/benchmarks
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+  export CURRENT_LLM_SERVING_ENGINE=tgi
+  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+  python -m pip install tabulate pandas
+  python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  upload_to_buildkite
+
+}
+
+main "$@"
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
new file mode 100644
index 000000000..4a82b9ec6
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -0,0 +1,214 @@
+#!/bin/bash
+
+set -o pipefail
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+  echo "GPU type is $gpu_type"
+}
+
+kill_gpu_processes() {
+  pkill tritonserver || true
+  # waiting for GPU processes to be fully killed
+  sleep 20
+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  timeout 1200 bash -c '
+    until curl -s localhost:8000/generate_stream > /dev/null; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # append trt to the test name
+    test_name=trt_$test_name
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.trt_server_parameters')
+    client_params=$(echo "$params" | jq -r '.trt_client_parameters')
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+
+
+    cd $VLLM_SOURCE_CODE_LOC/benchmarks
+
+
+    echo "Running test case $test_name"
+    bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
+
+    # wait until the server is alive
+    wait_for_server
+    if [ $? -eq 0 ]; then
+      echo ""
+      echo "trt server is up and running."
+    else
+      echo ""
+      echo "trt failed to start within the timeout period."
+      break
+    fi
+
+    # prepare tokenizer
+    cd $VLLM_SOURCE_CODE_LOC/benchmarks
+    rm -rf /tokenizer_cache
+    mkdir /tokenizer_cache
+    python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
+      --model "$model" \
+      --cachedir /tokenizer_cache
+    cd $VLLM_SOURCE_CODE_LOC/benchmarks
+    
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 benchmark_serving.py \
+        --backend tensorrt-llm \
+        --tokenizer /tokenizer_cache \
+        --model $model \
+        --dataset-name $dataset_name \
+        --dataset-path $dataset_path \
+        --num-prompts $num_prompts \
+        --port $port \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      server_command=""
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "trt" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up
+    kill_gpu_processes
+    rm -rf /root/.cache/huggingface/*
+  done
+}
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+
+main() {
+
+  check_gpus
+
+
+  # enter vllm directory
+  cd $VLLM_SOURCE_CODE_LOC/benchmarks
+
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+  # update transformers package, to make sure mixtral tokenizer is available
+  python -m pip install transformers -U
+
+  export CURRENT_LLM_SERVING_ENGINE=trt
+  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+  python -m pip install tabulate pandas
+  python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  upload_to_buildkite
+
+}
+
+main "$@"
diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
new file mode 100644
index 000000000..663045b8a
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@@ -0,0 +1,221 @@
+#!/bin/bash
+
+set -o pipefail
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+  echo "GPU type is $gpu_type"
+}
+
+kill_gpu_processes() {
+  # kill all processes on GPU.
+  pkill pt_main_thread
+  sleep 10
+
+  # remove vllm config file
+  rm -rf ~/.config/vllm
+
+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  # wait for vllm server to start
+  # return 1 if vllm server crashes
+  timeout 1200 bash -c '
+    until curl -s localhost:8000/v1/completions > /dev/null; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # append vllm to the test name
+    test_name=vllm_$test_name
+
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
+    client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
+    server_args=$(json2args "$server_params")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
+      echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
+      model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
+      server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
+        -tp $tp \
+        --model $model \
+        --port $port \
+        $server_args"
+    else
+      echo "Key 'fp8' does not exist in common params."
+      server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
+        -tp $tp \
+        --model $model \
+        --port $port \
+        $server_args"
+    fi
+
+    # run the server
+    echo "Running test case $test_name"
+    echo "Server command: $server_command"
+    eval "$server_command" &
+
+    # wait until the server is alive
+    wait_for_server
+    if [ $? -eq 0 ]; then
+      echo ""
+      echo "vllm server is up and running."
+    else
+      echo ""
+      echo "vllm failed to start within the timeout period."
+      break
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 benchmark_serving.py \
+        --backend vllm \
+        --model $model \
+        --dataset-name $dataset_name \
+        --dataset-path $dataset_path \
+        --num-prompts $num_prompts \
+        --port $port \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "vllm" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up
+    kill_gpu_processes
+    rm -rf /root/.cache/huggingface/*
+  done
+}
+
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+main() {
+
+  check_gpus
+  # enter vllm directory
+  cd $VLLM_SOURCE_CODE_LOC/benchmarks
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+  export CURRENT_LLM_SERVING_ENGINE=vllm
+  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+
+  python3 -m pip install tabulate pandas
+  python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  upload_to_buildkite
+
+}
+
+main "$@"
diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
new file mode 100644
index 000000000..782d1ef9a
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -0,0 +1,76 @@
+import datetime
+import json
+import os
+from pathlib import Path
+
+import pandas as pd
+from tabulate import tabulate
+
+results_folder = Path("results/")
+
+# serving results and the keys that will be printed into markdown
+serving_results = []
+serving_column_mapping = {
+    "test_name": "Test name",
+    "gpu_type": "GPU",
+    "completed": "Successful req.",
+    "request_throughput": "Tput (req/s)",
+    "mean_ttft_ms": "Mean TTFT (ms)",
+    "std_ttft_ms": "Std TTFT (ms)",
+    "mean_itl_ms": "Mean ITL (ms)",
+    "std_itl_ms": "Std ITL (ms)",
+    "input_throughput": "Input Tput (tok/s)",
+    "output_throughput": "Output Tput (tok/s)",
+    "engine": "Engine",
+}
+
+if __name__ == "__main__":
+
+    # collect results
+    for test_file in results_folder.glob("*.json"):
+
+        with open(test_file, "r") as f:
+            raw_result = json.loads(f.read())
+
+        # attach the benchmarking command to raw_result
+        with open(test_file.with_suffix(".commands"), "r") as f:
+            command = json.loads(f.read())
+        raw_result.update(command)
+
+        # update the test name of this result
+        raw_result.update({"test_name": test_file.stem})
+
+        # add the result to raw_result
+        serving_results.append(raw_result)
+        continue
+
+    serving_results = pd.DataFrame.from_dict(serving_results)
+
+    if not serving_results.empty:
+        serving_results = serving_results[list(
+            serving_column_mapping.keys())].rename(
+                columns=serving_column_mapping)
+
+    serving_md_table_with_headers = tabulate(serving_results,
+                                             headers='keys',
+                                             tablefmt='pipe',
+                                             showindex=False)
+    # remove the first line of header
+    serving_md_table_lines = serving_md_table_with_headers.split('\n')
+    serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
+
+    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
+
+    # document benchmarking results in markdown
+    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
+        # document results with header.
+        # for those who wants to reproduce our benchmark.
+        f.write(serving_md_table_with_headers)
+        f.write('\n')
+
+    # document benchmarking results in json
+    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
+
+        results = serving_results.to_dict(orient='records')
+        f.write(json.dumps(results))
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
new file mode 100644
index 000000000..f250833c6
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -0,0 +1,116 @@
+[
+    {
+        "test_name": "llama8B_tp1",
+        "qps_list": [4],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tp": 1,
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 500,
+            "port": 8000
+        },
+        "lmdeploy_server_parameters": {
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "float16",
+            "max_batch_size": 256,
+            "max_input_len": 4096,
+            "max_output_len": 4096,
+            "trt_llm_version": "r24.04"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": ""
+        },
+        "vllm_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama70B_tp4",
+        "qps_list": [2],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tp": 4,
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 500,
+            "port": 8000
+        },
+        "lmdeploy_server_parameters": {
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "float16",
+            "max_batch_size": 256,
+            "max_input_len": 4096,
+            "max_output_len": 4096,
+            "trt_llm_version": "r24.04"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": ""
+        },
+        "vllm_client_parameters": {
+        }
+    },
+    {
+        "test_name": "mixtral8x7B_tp2",
+        "qps_list": [2],
+        "common_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tp": 2,
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 500,
+            "port": 8000
+        },
+        "lmdeploy_server_parameters": {
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "float16",
+            "max_batch_size": 256,
+            "max_input_len": 4096,
+            "max_output_len": 4096,
+            "trt_llm_version": "r24.04"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": ""
+        },
+        "vllm_client_parameters": {
+        }
+    }
+]
\ No newline at end of file
diff --git a/README.md b/README.md
index dac4b513c..53285356a 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,8 @@ vLLM is fast with:
 - Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
 - Optimized CUDA kernels
 
+**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/3924) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
+
 vLLM is flexible and easy to use with:
 
 - Seamless integration with popular Hugging Face models
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 7ba977141..b2924b9e8 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -60,12 +60,15 @@ class BenchmarkMetrics:
     output_throughput: float
     mean_ttft_ms: float
     median_ttft_ms: float
+    std_ttft_ms: float
     p99_ttft_ms: float
     mean_tpot_ms: float
     median_tpot_ms: float
+    std_tpot_ms: float
     p99_tpot_ms: float
     mean_itl_ms: float
     median_itl_ms: float
+    std_itl_ms: float
     p99_itl_ms: float
 
 
@@ -274,12 +277,15 @@ def calculate_metrics(
         mean_ttft_ms=np.mean(ttfts or 0) *
         1000,  # ttfts is empty if streaming is not supported by backend
         median_ttft_ms=np.median(ttfts or 0) * 1000,
+        std_ttft_ms=np.std(ttfts or 0) * 1000,
         p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
         mean_tpot_ms=np.mean(tpots or 0) * 1000,
         median_tpot_ms=np.median(tpots or 0) * 1000,
+        std_tpot_ms=np.std(tpots or 0) * 1000,
         p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
         mean_itl_ms=np.mean(itls or 0) * 1000,
         median_itl_ms=np.median(itls or 0) * 1000,
+        std_itl_ms=np.std(itls or 0) * 1000,
         p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
     )
 
@@ -396,12 +402,15 @@ async def benchmark(
         "output_throughput": metrics.output_throughput,
         "mean_ttft_ms": metrics.mean_ttft_ms,
         "median_ttft_ms": metrics.median_ttft_ms,
+        "std_ttft_ms": metrics.std_ttft_ms,
         "p99_ttft_ms": metrics.p99_ttft_ms,
         "mean_tpot_ms": metrics.mean_tpot_ms,
         "median_tpot_ms": metrics.median_tpot_ms,
+        "std_tpot_ms": metrics.std_tpot_ms,
         "p99_tpot_ms": metrics.p99_tpot_ms,
         "mean_itl_ms": metrics.mean_itl_ms,
         "median_itl_ms": metrics.median_itl_ms,
+        "std_itl_ms": metrics.std_itl_ms,
         "p99_itl_ms": metrics.p99_itl_ms,
         "input_lens": [output.prompt_len for output in outputs],
         "output_lens": actual_output_lens,
-- 
GitLab


From 7ed6a4f0e1b39499675edf1dd6079d4bf21eb0fe Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Thu, 11 Jul 2024 18:02:29 -0400
Subject: [PATCH 320/376] [ BugFix ] Prompt Logprobs Detokenization (#6223)

Co-authored-by: Zifei Tong <zifeitong@gmail.com>
---
 .buildkite/test-pipeline.yaml               |   5 +-
 tests/tokenization/test_detokenize.py       | 109 ++++++++++++++++----
 vllm/engine/output_processor/single_step.py |  19 +++-
 vllm/transformers_utils/detokenizer.py      |  16 ++-
 4 files changed, 117 insertions(+), 32 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 8013fbb64..e09122ba6 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -87,7 +87,10 @@ steps:
 
 - label: Engine Test
   mirror_hardwares: [amd]
-  command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
+  commands: 
+  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
+  # OOM in the CI unless we run this separately
+  - pytest -v -s tokenization
 
 - label: Entrypoints Test
   mirror_hardwares: [amd]
diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 12e5ae85a..f4551ed42 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -1,4 +1,4 @@
-from typing import Dict, List
+from typing import Any, Dict, List, Optional
 
 import pytest
 from transformers import AutoTokenizer
@@ -139,6 +139,15 @@ def create_dummy_logprobs(
     } for token_id in complete_sequence_token_ids]
 
 
+def create_dummy_prompt_logprobs(
+        complete_sequence_token_ids: List[int]
+) -> List[Optional[Dict[int, Any]]]:
+    # logprob for the first prompt token is None.
+    logprobs: List[Optional[Dict[int, Any]]] = [None]
+    logprobs.extend(create_dummy_logprobs(complete_sequence_token_ids)[1:])
+    return logprobs
+
+
 @pytest.mark.parametrize("complete_sequence", TRUTH)
 @pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
 @pytest.mark.parametrize("skip_special_tokens", [True, False])
@@ -177,13 +186,10 @@ def test_decode_sequence_logprobs(complete_sequence: str,
 
 @pytest.mark.parametrize("complete_sequence", TRUTH)
 @pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
-@pytest.mark.parametrize("skip_special_tokens", [True])
-def test_decode_prompt_logprobs(complete_sequence: str,
-                                complete_sequence_token_ids: List[int],
-                                detokenizer: Detokenizer,
-                                skip_special_tokens: bool):
+def test_decode_prompt_logprobs(complete_sequence_token_ids: List[int],
+                                detokenizer: Detokenizer):
     """Verify Detokenizer decodes prompt logprobs correctly."""
-    sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens,
+    sampling_params = SamplingParams(skip_special_tokens=True,
                                      prompt_logprobs=1)
 
     # Run sequentially.
@@ -192,19 +198,78 @@ def test_decode_prompt_logprobs(complete_sequence: str,
                               seqs=[seq],
                               sampling_params=sampling_params,
                               arrival_time=0.0)
-    dummy_logprobs = create_dummy_logprobs(complete_sequence_token_ids)
-    detokenizer.decode_prompt_logprobs_inplace(seq_group, dummy_logprobs)
-    decoded_prompt_logprobs = dummy_logprobs
+    dummy_logprobs = create_dummy_prompt_logprobs(complete_sequence_token_ids)
+    detokenizer.decode_prompt_logprobs_inplace(seq_group,
+                                               dummy_logprobs,
+                                               position_offset=0)
+    # First logprob is None.
+    decoded_prompt_logprobs: List[Dict[int, Any]] = dummy_logprobs[
+        1:]  # type: ignore
 
-    if skip_special_tokens:
-        # Text for logprobs for the chosen token should be the same as the
-        # prompt text. Note that this will only be true if we skip
-        # special tokens.
-        assert complete_sequence == "".join([
-            logprobs[token_id].decoded_token for token_id, logprobs in zip(
-                complete_sequence_token_ids, decoded_prompt_logprobs)
-        ])
-        assert complete_sequence != "".join([
-            logprobs[token_id + 1].decoded_token for token_id, logprobs in zip(
-                complete_sequence_token_ids, decoded_prompt_logprobs)
-        ])
+    # decoded_prompt_logprobs doesn't contain the first token.
+    token_ids = complete_sequence_token_ids
+    tokenzier = detokenizer.get_tokenizer_for_seq(seq)
+    text_full = tokenzier.decode(token_ids, skip_special_tokens=True)
+    text_first = tokenzier.decode(token_ids[0], skip_special_tokens=True)
+    text = text_full[len(text_first):]
+
+    # Text for logprobs for the chosen token should be the same as the
+    # prompt text. Note that the first logprob is None.
+    assert text == "".join([
+        logprobs[token_id].decoded_token
+        for token_id, logprobs in zip(token_ids[1:], decoded_prompt_logprobs)
+    ])
+    assert text != "".join([
+        logprobs[token_id + 1].decoded_token
+        for token_id, logprobs in zip(token_ids[1:], decoded_prompt_logprobs)
+    ])
+
+
+@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 7, 16, -1])
+def test_decode_prompt_logprobs_chunked_prefill(
+    vllm_runner,
+    model,
+    chunked_prefill_token_size: int,
+    example_prompts,
+):
+    max_num_seqs = 256
+    enable_chunked_prefill = False
+    max_num_batched_tokens = None
+    if chunked_prefill_token_size != -1:
+        enable_chunked_prefill = True
+        max_num_seqs = min(chunked_prefill_token_size, max_num_seqs)
+        max_num_batched_tokens = chunked_prefill_token_size
+
+    with vllm_runner(model,
+                     dtype="half",
+                     max_logprobs=5,
+                     gpu_memory_utilization=0.5,
+                     enable_chunked_prefill=enable_chunked_prefill,
+                     max_num_batched_tokens=max_num_batched_tokens,
+                     max_num_seqs=max_num_seqs) as vllm_model:
+
+        vllm_sampling_params = SamplingParams(max_tokens=10,
+                                              logprobs=5,
+                                              prompt_logprobs=5,
+                                              temperature=0.0)
+        vllm_results = vllm_model.model.generate(
+            example_prompts, sampling_params=vllm_sampling_params)
+
+        for idx, result in enumerate(vllm_results):
+            assert result.prompt_logprobs is not None
+            assert result.prompt_logprobs[0] is None
+
+            # Compared detokenized prompts ids to original prompt.
+            generated_string = ""
+            for (prompt_token,
+                 prompt_logprobs) in zip(result.prompt_token_ids[1:],
+                                         result.prompt_logprobs[1:]):
+                # prompt_logprobs is a dict of the token_id: logprob
+                # We select the token_id corresponding to the actual prompt
+                # Decoded token in the detokenized string corresponding to this
+                # prompt token.
+                generated_string += prompt_logprobs[prompt_token].decoded_token
+
+            assert generated_string == example_prompts[idx], (
+                "Detokenized prompt logprobs do not match original prompt")
diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py
index fa672e1fe..4851897dd 100644
--- a/vllm/engine/output_processor/single_step.py
+++ b/vllm/engine/output_processor/single_step.py
@@ -60,14 +60,23 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
         assert len(outputs) == 1, ("Single step should only has 1 output.")
         output = outputs[0]
         prompt_logprobs = output.prompt_logprobs
+
+        # If this is the first (or only) "chunk" of the prefill, we need
+        # to prepend None to the list of prompt logprobs. The reason for this
+        # is that for N prompt tokens, the Sampler will generate N-1 total
+        # prompt logprobs during prefill since the token at idx 0 will not
+        # have a logprob associated with it.
         if prompt_logprobs is not None:
+            if not seq_group.prompt_logprobs:
+                prompt_logprobs = [None] + prompt_logprobs
+                seq_group.prompt_logprobs = []
+
             if seq_group.sampling_params.detokenize and self.detokenizer:
                 self.detokenizer.decode_prompt_logprobs_inplace(
-                    seq_group, prompt_logprobs)
-            if not seq_group.prompt_logprobs:
-                # The first prompt token's logprob is None because it doesn't
-                # have tokens that are precedent.
-                seq_group.prompt_logprobs = [None]
+                    seq_group,
+                    prompt_logprobs,
+                    position_offset=len(seq_group.prompt_logprobs))
+
             seq_group.prompt_logprobs.extend(prompt_logprobs)
 
     def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py
index e8e53f494..cc9a97130 100644
--- a/vllm/transformers_utils/detokenizer.py
+++ b/vllm/transformers_utils/detokenizer.py
@@ -21,14 +21,17 @@ class Detokenizer:
         """Returns the HF tokenizer to use for a given sequence."""
         return self.tokenizer_group.get_lora_tokenizer(sequence.lora_request)
 
-    def decode_prompt_logprobs_inplace(
-            self, seq_group: SequenceGroup,
-            prompt_logprobs: List[Optional[Dict[int, Logprob]]]) -> None:
+    def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup,
+                                       prompt_logprobs: List[Optional[Dict[
+                                           int, Logprob]]],
+                                       position_offset: int) -> None:
         """Decodes the logprobs for the prompt of a sequence group.
 
         Args:
             seq_group: The sequence group to decode.
             prompt_logprobs: The logprobs to decode.
+            position_offset: Offset of the first index of the logprobs 
+                relative to the start of the sequence (for chunked prefill).
         
         Returns:
             The prompt logprobs with the decoded tokens.
@@ -47,8 +50,13 @@ class Detokenizer:
         next_iter_tokens: List[str] = []
         prev_tokens = None
 
-        for token_position, prompt_logprobs_for_token in enumerate(
+        for token_position_in_logprob, prompt_logprobs_for_token in enumerate(
                 prompt_logprobs):
+
+            # Absolute token position equals the index in the logprobs
+            # list plus the offset of the entire logprobs list relative
+            # to the start of the sequence.
+            token_position = token_position_in_logprob + position_offset
             if not prompt_logprobs_for_token:
                 continue
             for token_id, sample_logprob in prompt_logprobs_for_token.items():
-- 
GitLab


From d6ab5289976fe219f943a7df4fb3a0ba1cb31a00 Mon Sep 17 00:00:00 2001
From: Lily Liu <lilyliupku@gmail.com>
Date: Thu, 11 Jul 2024 18:32:06 -0700
Subject: [PATCH 321/376] [Misc] Remove flashinfer warning, add flashinfer
 tests to CI (#6351)

---
 .buildkite/test-pipeline.yaml                     | 8 +++++---
 tests/basic_correctness/test_basic_correctness.py | 5 +++++
 vllm/attention/selector.py                        | 3 ---
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index e09122ba6..3e22310ea 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -19,8 +19,10 @@ steps:
 - label: Basic Correctness Test
   mirror_hardwares: [amd]
   commands:
+  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
+  - VLLM_ATTENTION_BACKEND=FLASHINFER pytest -v -s basic_correctness/test_basic_correctness.py
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
@@ -122,14 +124,14 @@ steps:
 - label: Kernels Test %N
   #mirror_hardwares: [amd]
   commands:
-    - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+    - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
     - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 4
 
 - label: Models Test
   #mirror_hardwares: [amd]
   commands:
-    - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+    - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
     - pytest -v -s models -m \"not vlm\"
 
 - label: Vision Language Models Test
@@ -240,7 +242,7 @@ steps:
   - pytest -v -s distributed/test_custom_all_reduce.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
   - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - pytest -v -s -x lora/test_mixtral.py
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index a7b0fef53..d3e74a4f8 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -2,11 +2,13 @@
 
 Run `pytest tests/basic_correctness/test_basic_correctness.py`.
 """
+import os
 import weakref
 
 import pytest
 
 from vllm import LLM
+from vllm.utils import is_hip
 
 from ..models.utils import check_outputs_equal
 
@@ -26,6 +28,9 @@ def test_vllm_gc_ed():
     assert weak_llm() is None
 
 
+@pytest.mark.skipif(is_hip()
+                    and os.getenv("VLLM_ATTENTION_BACKEND") == "FLASHINFER",
+                    reason="Flashinfer does not support ROCm/HIP.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index ae63eb1d4..084100f6c 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -77,9 +77,6 @@ def get_attn_backend(
         return IpexAttnBackend
     elif backend == _Backend.FLASHINFER:
         logger.info("Using Flashinfer backend.")
-        logger.warning(("Flashinfer will be stuck on llama-2-7b,"
-                        " please avoid using Flashinfer as the "
-                        "backend when running on llama-2-7b."))
         from vllm.attention.backends.flashinfer import FlashInferBackend
         return FlashInferBackend
     elif backend == _Backend.PALLAS:
-- 
GitLab


From 2b0fb534813e9835077403723a484b7c03d47259 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 11 Jul 2024 19:35:17 -0700
Subject: [PATCH 322/376] [distributed][misc] be consistent with pytorch for
 libcudart.so (#6346)

[distributed][misc] keep consistent with how pytorch finds libcudart.so (#6346)
---
 .../device_communicators/cuda_wrapper.py      | 27 ++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py
index 24308235c..5cac3c1d5 100644
--- a/vllm/distributed/device_communicators/cuda_wrapper.py
+++ b/vllm/distributed/device_communicators/cuda_wrapper.py
@@ -4,6 +4,9 @@ convenient for use when we just need to call a few functions.
 """
 
 import ctypes
+import glob
+import os
+import sys
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
 
@@ -33,6 +36,26 @@ class Function:
     argtypes: List[Any]
 
 
+def get_pytorch_default_cudart_library_path() -> str:
+    # code borrowed from https://github.com/pytorch/pytorch/blob/1cae60a87e5bdda8bcf55724a862eeed98a9747e/torch/__init__.py#L284 # noqa
+    lib_folder = "cuda_runtime"
+    lib_name = "libcudart.so.*[0-9]"
+    lib_path = None
+    for path in sys.path:
+        nvidia_path = os.path.join(path, "nvidia")
+        if not os.path.exists(nvidia_path):
+            continue
+        candidate_lib_paths = glob.glob(
+            os.path.join(nvidia_path, lib_folder, "lib", lib_name))
+        if candidate_lib_paths and not lib_path:
+            lib_path = candidate_lib_paths[0]
+        if lib_path:
+            break
+    if not lib_path:
+        raise ValueError(f"{lib_name} not found in the system path {sys.path}")
+    return lib_path
+
+
 class CudaRTLibrary:
     exported_functions = [
         # ​cudaError_t cudaSetDevice ( int  device )
@@ -77,9 +100,7 @@ class CudaRTLibrary:
 
     def __init__(self, so_file: Optional[str] = None):
         if so_file is None:
-            assert torch.version.cuda is not None
-            major_version = torch.version.cuda.split(".")[0]
-            so_file = f"libcudart.so.{major_version}"
+            so_file = get_pytorch_default_cudart_library_path()
         if so_file not in CudaRTLibrary.path_to_library_cache:
             lib = ctypes.CDLL(so_file)
             CudaRTLibrary.path_to_library_cache[so_file] = lib
-- 
GitLab


From adf32e0a0f58edafe0c71b5c235848a487be2a71 Mon Sep 17 00:00:00 2001
From: Helena Kloosterman <helena.kloosterman@intel.com>
Date: Fri, 12 Jul 2024 04:47:00 +0200
Subject: [PATCH 323/376] [Bugfix] Fix usage stats logging exception warning
 with OpenVINO (#6349)

---
 requirements-openvino.txt | 2 +-
 vllm/engine/llm_engine.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements-openvino.txt b/requirements-openvino.txt
index e555d5257..e32c76fb0 100644
--- a/requirements-openvino.txt
+++ b/requirements-openvino.txt
@@ -4,6 +4,6 @@
 # OpenVINO dependencies
 torch >= 2.1.2
 openvino ~= 2024.3.0.dev
-optimum-intel[openvino] >= 1.17.2
+optimum-intel[openvino] >= 1.18.1
 
 triton >= 2.2.0  # FIXME(woosuk): This is a hack to avoid import error.
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index d354218cf..622221d2d 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -284,7 +284,7 @@ class LLMEngine:
                     "quantization":
                     model_config.quantization,
                     "kv_cache_dtype":
-                    cache_config.cache_dtype,
+                    str(cache_config.cache_dtype),
 
                     # Feature flags
                     "enable_lora":
-- 
GitLab


From d59eb98489103877e9476ef5263305aa3e3f9e23 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Thu, 11 Jul 2024 22:47:17 -0400
Subject: [PATCH 324/376] [Model][Phi3-Small] Remove scipy from
 blocksparse_attention (#6343)

---
 .../ops/blocksparse_attention/utils.py        | 35 ++++++++++++++-----
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/vllm/attention/ops/blocksparse_attention/utils.py b/vllm/attention/ops/blocksparse_attention/utils.py
index b1808970d..78d752230 100644
--- a/vllm/attention/ops/blocksparse_attention/utils.py
+++ b/vllm/attention/ops/blocksparse_attention/utils.py
@@ -4,16 +4,35 @@
 
 from functools import lru_cache
 
+import numpy as np
 import torch
 import triton
 
-try:
-    from scipy import sparse
-except ImportError as err:
-    raise ImportError("Please install scipy via "
-                      "`pip install scipy` to use "
-                      "BlockSparseAttention in "
-                      "models such as Phi-3.") from err
+
+class csr_matrix:
+    """Simple implementation of CSR matrix conversion without scipy.
+    This replaced scipy.sparse.csr_matrix() previously used."""
+
+    def __init__(self, input_array):
+        if not isinstance(input_array, np.ndarray):
+            raise ValueError("Input must be a NumPy array")
+
+        self.shape = input_array.shape
+        rows, cols = self.shape
+        data = []
+        indices = []
+        indptr = [0]
+
+        for i in range(rows):
+            for j in range(cols):
+                if input_array[i, j]:
+                    data.append(input_array[i, j])
+                    indices.append(j)
+            indptr.append(len(indices))
+
+        self.data = np.array(data)
+        self.indices = np.array(indices)
+        self.indptr = np.array(indptr)
 
 
 def dense_to_crow_col(x: torch.Tensor):
@@ -26,7 +45,7 @@ def dense_to_crow_col(x: torch.Tensor):
     assert x.dim() in (2, 3)
     if x.dim() == 2:
         x = x[None]
-    x = [sparse.csr_matrix(xi.bool().cpu().numpy()) for xi in x]
+    x = [csr_matrix(xi.bool().cpu().numpy()) for xi in x]
     crows = torch.vstack([torch.from_numpy(xi.indptr) for xi in x])
     cols = [torch.from_numpy(xi.indices) for xi in x]
     max_cols = max(len(xi) for xi in cols)
-- 
GitLab


From d26a8b3f1fb53ec35008a83690ca7339c13ecd4e Mon Sep 17 00:00:00 2001
From: adityagoel14 <aditya.goel@amd.com>
Date: Fri, 12 Jul 2024 00:26:26 -0400
Subject: [PATCH 325/376] [CI/Build] (2/2) Switching AMD CI to store images in
 Docker Hub (#6350)

---
 .buildkite/run-amd-test.sh | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
index bde8ab618..5f5600437 100644
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -45,15 +45,10 @@ while true; do
         fi
 done
 
-echo "--- Building container"
-sha=$(git rev-parse --short HEAD)
-image_name=rocm_${sha}
-container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
-docker build \
-        -t ${image_name} \
-        -f Dockerfile.rocm \
-        --progress plain \
-        .
+echo "--- Pulling container" 
+image_name="rocmshared/vllm-ci:${BUILDKITE_COMMIT}"
+container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+docker pull ${image_name}
 
 remove_docker_container() {
    docker rm -f ${container_name} || docker image rm -f ${image_name} || true
-- 
GitLab


From b6c16cf8ff8d558ec943f1f17342c2c081f3f5af Mon Sep 17 00:00:00 2001
From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Date: Fri, 12 Jul 2024 00:30:46 -0400
Subject: [PATCH 326/376] [ROCm][AMD] unify CUDA_VISIBLE_DEVICES usage in
 cuda/rocm (#6352)

---
 Dockerfile.rocm                 | 14 +++++++-------
 tests/distributed/test_utils.py |  7 +------
 vllm/config.py                  |  9 +--------
 vllm/utils.py                   |  4 ----
 vllm/worker/worker_base.py      | 10 +---------
 5 files changed, 10 insertions(+), 34 deletions(-)

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 1b89b892b..befb0499f 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -52,25 +52,25 @@ RUN pip install --upgrade pip
 # Remove sccache so it doesn't interfere with ccache
 # TODO: implement sccache support across components
 RUN apt-get purge -y sccache; pip uninstall -y sccache; rm -f "$(which sccache)"
-# Install torch == 2.4.0 on ROCm
+# Install torch == 2.5.0 on ROCm
 RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
         *"rocm-5.7"*) \
             pip uninstall -y torch torchaudio torchvision \
             && pip install --no-cache-dir --pre \
-                torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
-                torchvision==0.19.0.dev20240612 \
+                torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \
+                torchvision==0.20.0.dev20240710 \
                --index-url https://download.pytorch.org/whl/nightly/rocm5.7;; \
         *"rocm-6.0"*) \
             pip uninstall -y torch torchaudio torchvision \
             && pip install --no-cache-dir --pre \
-                torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
-                torchvision==0.19.0.dev20240612 \
+                torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \
+                torchvision==0.20.0.dev20240710 \
                --index-url https://download.pytorch.org/whl/nightly/rocm6.0;; \
         *"rocm-6.1"*) \
             pip uninstall -y torch torchaudio torchvision \
             && pip install --no-cache-dir --pre \
-                torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
-                torchvision==0.19.0.dev20240612 \
+                torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \
+                torchvision==0.20.0.dev20240710 \
                --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
         *) ;; esac
 
diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py
index 9ff11b0d2..a51a9909f 100644
--- a/tests/distributed/test_utils.py
+++ b/tests/distributed/test_utils.py
@@ -1,7 +1,7 @@
 import ray
 
 import vllm.envs as envs
-from vllm.utils import (cuda_device_count_stateless, is_hip,
+from vllm.utils import (cuda_device_count_stateless,
                         update_environment_variables)
 
 
@@ -22,11 +22,6 @@ class _CUDADeviceCountStatelessTestActor:
 def test_cuda_device_count_stateless():
     """Test that cuda_device_count_stateless changes return value if
     CUDA_VISIBLE_DEVICES is changed."""
-    if is_hip():
-        # Set HIP_VISIBLE_DEVICES == CUDA_VISIBLE_DEVICES. Conversion
-        # is handled by `update_environment_variables`
-        update_environment_variables(
-            {"CUDA_VISIBLE_DEVICES": envs.CUDA_VISIBLE_DEVICES})
     actor = _CUDADeviceCountStatelessTestActor.options(  # type: ignore
         num_gpus=2).remote()
     assert sorted(ray.get(
diff --git a/vllm/config.py b/vllm/config.py
index d333a042f..de7bb3943 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Union
 import torch
 from transformers import PretrainedConfig
 
-import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.model_executor.models import ModelRegistry
@@ -14,7 +13,7 @@ from vllm.tracing import is_otel_installed
 from vllm.transformers_utils.config import get_config, get_hf_text_config
 from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu,
                         is_hip, is_neuron, is_openvino, is_tpu, is_xpu,
-                        print_warning_once, update_environment_variables)
+                        print_warning_once)
 
 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
@@ -695,12 +694,6 @@ class ParallelConfig:
             self.distributed_executor_backend = backend
             logger.info("Defaulting to use %s for distributed inference",
                         backend)
-        # If CUDA_VISIBLE_DEVICES is set on ROCm prior to vLLM init,
-        # propagate changes to HIP_VISIBLE_DEVICES (conversion handled by
-        # the update_environment_variables function)
-        if is_hip() and envs.CUDA_VISIBLE_DEVICES:
-            update_environment_variables(
-                {"CUDA_VISIBLE_DEVICES": envs.CUDA_VISIBLE_DEVICES})
 
         self._verify_args()
         self.rank = 0
diff --git a/vllm/utils.py b/vllm/utils.py
index a3d15d797..8be152823 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -386,10 +386,6 @@ def get_open_port() -> int:
 
 
 def update_environment_variables(envs: Dict[str, str]):
-    if is_hip() and "CUDA_VISIBLE_DEVICES" in envs:
-        # Propagate changes to CUDA_VISIBLE_DEVICES to
-        # ROCm's HIP_VISIBLE_DEVICES as well
-        envs["HIP_VISIBLE_DEVICES"] = envs["CUDA_VISIBLE_DEVICES"]
     for k, v in envs.items():
         if k in os.environ and os.environ[k] != v:
             logger.warning(
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index b082f4534..93ffea910 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -11,7 +11,7 @@ from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
                            SamplerOutput)
-from vllm.utils import (enable_trace_function_call_for_thread, is_hip,
+from vllm.utils import (enable_trace_function_call_for_thread,
                         update_environment_variables)
 from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase
 
@@ -309,14 +309,6 @@ class WorkerWrapperBase:
             # overwriting CUDA_VISIBLE_DEVICES is desired behavior
             # suppress the warning in `update_environment_variables`
             del os.environ[key]
-            if is_hip():
-                hip_env_var = "HIP_VISIBLE_DEVICES"
-                if hip_env_var in os.environ:
-                    logger.warning(
-                        "Ignoring pre-set environment variable `%s=%s` as "
-                        "%s has also been set, which takes precedence.",
-                        hip_env_var, os.environ[hip_env_var], key)
-                os.environ.pop(hip_env_var, None)
         update_environment_variables(envs)
 
     def init_worker(self, *args, **kwargs):
-- 
GitLab


From 6047187cd854eef114bd70c76469d5a839a07ef4 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Fri, 12 Jul 2024 01:06:09 -0400
Subject: [PATCH 327/376] [ Misc ] Remove separate bias add (#6353)

---
 vllm/model_executor/layers/linear.py | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 7100fe142..bc07d2b83 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -99,15 +99,7 @@ class LinearMethodBase(QuantizeMethodBase):
 
 
 class UnquantizedLinearMethod(LinearMethodBase):
-    """Linear method without quantization.
-
-    Args:
-        separate_bias_add: If true, add bias separately after matrix
-                           multiplication.
-    """
-
-    def __init__(self, separate_bias_add: bool = False):
-        self.separate_bias_add = separate_bias_add
+    """Linear method without quantization."""
 
     def create_weights(self, layer: torch.nn.Module,
                        input_size_per_partition: int,
@@ -126,12 +118,8 @@ class UnquantizedLinearMethod(LinearMethodBase):
               layer: torch.nn.Module,
               x: torch.Tensor,
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        weight = layer.weight
-        if self.separate_bias_add:
-            if bias is not None:
-                return F.linear(x, weight) + bias
-            return F.linear(x, weight)
-        return F.linear(x, weight, bias)
+
+        return F.linear(x, layer.weight, bias)
 
 
 class LinearBase(torch.nn.Module):
-- 
GitLab


From f7160d946a0a07703e72d81ba9ecf3913f192605 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Fri, 12 Jul 2024 01:40:07 -0700
Subject: [PATCH 328/376] [Misc][Bugfix] Update transformers for tokenizer
 issue (#6364)

---
 requirements-common.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index b750f9a1b..829e9a285 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -6,7 +6,7 @@ numpy < 2.0.0
 requests
 tqdm
 py-cpuinfo
-transformers >= 4.42.0  # Required for Gemma 2 and for additional chat template parameters.
+transformers >= 4.42.4  # Required for Gemma 2 and for additional chat template parameters.
 tokenizers >= 0.19.1  # Required for Llama 3.
 fastapi
 aiohttp
-- 
GitLab


From aea19f0989667968465087338b568694f61c6391 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Fri, 12 Jul 2024 11:11:29 -0400
Subject: [PATCH 329/376] [ Misc ] Support Models With Bias in
 `compressed-tensors` integration (#6356)

---
 .../Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml  | 11 +++++++++++
 .../Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml | 11 +++++++++++
 .buildkite/lm-eval-harness/configs/models-small.txt   |  1 +
 .../lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh  |  2 +-
 tests/models/test_compressed_tensors.py               |  3 +++
 .../compressed_tensors/compressed_tensors.py          |  5 +----
 .../schemes/compressed_tensors_scheme.py              |  7 +++++--
 .../schemes/compressed_tensors_unquantized.py         |  9 +++++----
 .../schemes/compressed_tensors_w4a16_24.py            |  8 +++++++-
 .../schemes/compressed_tensors_w8a8_int8.py           |  9 ++++++---
 .../schemes/compressed_tensors_wNa16.py               |  7 +++++--
 .../layers/quantization/utils/w8a8_utils.py           |  6 ++----
 12 files changed, 58 insertions(+), 21 deletions(-)
 create mode 100644 .buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
 create mode 100644 .buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml

diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
new file mode 100644
index 000000000..43ff2bc5c
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
+model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.593
+  - name: "exact_match,flexible-extract"
+    value: 0.588
+limit: 1000
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
new file mode 100644
index 000000000..259799ba8
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
+model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.595
+  - name: "exact_match,flexible-extract"
+    value: 0.582
+limit: 1000
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt
index 3300ca64f..3d1306f6b 100644
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@@ -2,3 +2,4 @@ Meta-Llama-3-8B-Instruct.yaml
 Meta-Llama-3-8B-Instruct-FP8.yaml
 Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
index 933733e9c..d68c6993e 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
 done
 
 lm_eval --model vllm \
-  --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true \
+  --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,distributed_executor_backend="ray" \
   --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
   --batch_size $BATCH_SIZE
diff --git a/tests/models/test_compressed_tensors.py b/tests/models/test_compressed_tensors.py
index 9a0054c5a..da47d5f3f 100644
--- a/tests/models/test_compressed_tensors.py
+++ b/tests/models/test_compressed_tensors.py
@@ -12,7 +12,10 @@ from tests.quantization.utils import is_quant_method_supported
 from .utils import check_logprobs_close
 
 MODELS = [
+    # No bias
     "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test",
+    # Bias
+    "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
 ]
 
 MAX_TOKENS = 32
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index c711fd14c..524b4c894 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -267,10 +267,7 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
 
         """
 
-        if bias is not None:
-            raise ValueError("bias is not supported for this linear method")
-
         scheme = layer.scheme
         if scheme is None:
             raise ValueError("A scheme must be defined for each layer")
-        return scheme.apply_weights(layer, x)
+        return scheme.apply_weights(layer, x, bias=bias)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
index 119f6cd91..3aa913078 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from typing import Optional
 
 import torch
 
@@ -20,14 +21,16 @@ class CompressedTensorsScheme(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+                      bias: Optional[torch.Tensor]):
         """
         Run the forward pass for the particular scheme. This is where 
         scheme-specific dequant/quant steps/kernels should be applied.
 
-        :param layer: toch.nn.Module with the registered weights and 
+        :param layer: torch.nn.Module with the registered weights and 
             other parameters relevant to the particular scheme. 
         :param x: input to the layer
+        :param bias: bias parameter
 
         """
         raise NotImplementedError
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py
index f5911bc3d..2c7fe3e0e 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py
@@ -1,4 +1,4 @@
-from typing import Callable, List
+from typing import Callable, List, Optional
 
 import torch
 import torch.nn.functional as F
@@ -37,6 +37,7 @@ class CompressedTensorsUnquantized(CompressedTensorsScheme):
         layer.register_parameter("weight", weight)
         set_weight_attrs(weight, {"weight_loader": weight_loader})
 
-    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
-        weight = layer.weight
-        return F.linear(x, weight)
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+                      bias: Optional[torch.Tensor]) -> torch.Tensor:
+
+        return F.linear(x, layer.weight, bias)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
index 3c07d6b6f..54bf85c09 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
@@ -118,7 +118,9 @@ class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme):
                               requires_grad=False)
         layer.workspace = workspace
 
-    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+                      bias: Optional[torch.Tensor]) -> torch.Tensor:
+
         qweight = layer.weight_packed
         meta = layer.meta
         scales = layer.scale_packed
@@ -135,4 +137,8 @@ class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme):
                                             size_n, size_k)
 
         output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], ))
+
+        if bias is not None:
+            output.add_(bias)  # In-place add
+
         return output
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index e70504ec5..6fec5d010 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -1,4 +1,4 @@
-from typing import Callable, List
+from typing import Callable, List, Optional
 
 import torch
 from torch.nn import Parameter
@@ -78,8 +78,11 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
                                                   **layer_kwargs)
             layer.register_parameter("input_scale", scale)
 
-    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+                      bias: Optional[torch.Tensor]) -> torch.Tensor:
+
         return apply_int8_linear(input=x,
                                  weight=layer.weight,
                                  weight_scale=layer.weight_scale,
-                                 input_scale=layer.input_scale)
+                                 input_scale=layer.input_scale,
+                                 bias=bias)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index ed9fa73c1..187a3f987 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -148,7 +148,9 @@ class CompressedTensorsWNA16(CompressedTensorsScheme):
             group_size=layer.group_size)
         replace_tensor(layer, "weight_scale", marlin_scales)
 
-    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+                      bias: Optional[torch.Tensor]) -> torch.Tensor:
+
         return apply_marlin_linear(
             input=x,
             weight=layer.weight_packed,
@@ -159,4 +161,5 @@ class CompressedTensorsWNA16(CompressedTensorsScheme):
             num_bits=self.num_bits,
             output_size_per_partition=layer.output_size_per_partition,
             input_size_per_partition=layer.input_size_per_partition,
-            is_k_full=True)
+            is_k_full=True,
+            bias=bias)
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index 81b7fdb78..30a82e1b5 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -148,9 +148,6 @@ def apply_int8_linear(
     input_scale: torch.Tensor,
     bias: Optional[torch.Tensor] = None,
 ):
-    if bias is not None:
-        raise NotImplementedError("W8A8 with int8 does not yet support bias.")
-
     # ops.scaled_int8_quant supports both dynamic and static quant.
     # * dynamic, layer.input_scale is None and x_scale computed from x.
     # * static, layer.input_scale is scalar and x_scale is input_scale.
@@ -160,4 +157,5 @@ def apply_int8_linear(
                                  weight,
                                  scale_a=x_scale,
                                  scale_b=weight_scale,
-                                 out_dtype=input.dtype)
+                                 out_dtype=input.dtype,
+                                 bias=bias)
-- 
GitLab


From 024ad87cdc00bb44beed409dbd90c4490284d73e Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 12 Jul 2024 23:22:18 +0800
Subject: [PATCH 330/376] [Bugfix] Fix dtype mismatch in PaliGemma (#6367)

---
 tests/models/test_paligemma.py          |  2 +-
 vllm/model_executor/models/gemma.py     |  1 +
 vllm/model_executor/models/paligemma.py | 14 ++++++++++----
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/tests/models/test_paligemma.py b/tests/models/test_paligemma.py
index 2b1d3c5b4..b0e7264e8 100644
--- a/tests/models/test_paligemma.py
+++ b/tests/models/test_paligemma.py
@@ -129,7 +129,7 @@ def run_test(
         [0.25, 0.5, 1.0],
     ],
 )
-@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["float", "half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 16548c6c1..7e0888b5f 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -277,6 +277,7 @@ class GemmaModel(nn.Module):
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         if inputs_embeds is not None:
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 2af2bedd8..8a2bacbd9 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -19,7 +19,7 @@ from vllm.model_executor.models.gemma import GemmaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import cached_get_tokenizer
-from vllm.sequence import SamplerOutput, SequenceData
+from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData
 
 from .interfaces import SupportsVision
 from .utils import merge_vision_embeddings
@@ -111,7 +111,7 @@ def input_processor_for_paligemma(ctx: InputContext, llm_inputs: LLMInputs):
     orig_prompt = llm_inputs.get("prompt")
     orig_prompt_ids = llm_inputs.get("prompt_token_ids")
 
-    if image_token_str in orig_prompt:
+    if orig_prompt is not None and image_token_str in orig_prompt:
         logger.warning(
             "The image token '%s' was detected in the prompt and "
             "will be removed. Please follow the proper prompt format"
@@ -214,7 +214,9 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsVision):
     def _image_pixels_to_features(self, vision_tower: SiglipVisionModel,
                                   pixel_values: torch.Tensor) -> torch.Tensor:
 
-        image_outputs = vision_tower(pixel_values, output_hidden_states=True)
+        target_dtype = vision_tower.get_input_embeddings().weight.dtype
+        image_outputs = vision_tower(pixel_values.to(dtype=target_dtype),
+                                     output_hidden_states=True)
 
         selected_image_features = image_outputs.last_hidden_state
 
@@ -236,9 +238,12 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsVision):
 
         return self.multi_modal_projector(image_features)
 
-    def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
+    def forward(self,
+                input_ids: torch.Tensor,
+                positions: torch.Tensor,
                 kv_caches: List[torch.Tensor],
                 attn_metadata: AttentionMetadata,
+                intermediate_tensors: Optional[IntermediateTensors] = None,
                 **kwargs: object) -> SamplerOutput:
 
         parsed_image_input = self._parse_and_validate_image_input(**kwargs)
@@ -263,6 +268,7 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsVision):
                                             positions,
                                             kv_caches,
                                             attn_metadata,
+                                            None,
                                             inputs_embeds=inputs_embeds)
 
         return hidden_states
-- 
GitLab


From f9d25c251907bc8be9b9c712adaf8d3ce631226e Mon Sep 17 00:00:00 2001
From: Alexei-V-Ivanov-AMD
 <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
Date: Fri, 12 Jul 2024 11:42:24 -0500
Subject: [PATCH 331/376] [Build/CI] Checking/Waiting for the GPU's clean state
 (#6379)

---
 .buildkite/run-amd-test.sh | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
index 5f5600437..363bc07fc 100644
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -2,6 +2,15 @@
 set -ex
 
 # Print ROCm version
+echo "--- Confirming Clean Initial State"
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done
+
 echo "--- ROCm info"
 rocminfo
 
-- 
GitLab


From b039cbbce37c1e6a1fdb284a41bd27cfef1b3783 Mon Sep 17 00:00:00 2001
From: Yihuan Bu <88394319+kevinbu233@users.noreply.github.com>
Date: Fri, 12 Jul 2024 12:55:39 -0400
Subject: [PATCH 332/376] [Misc] add fixture to guided processor tests (#6341)

---
 tests/entrypoints/openai/conftest.py          |  69 ++++++++++
 tests/entrypoints/openai/test_chat.py         | 119 ++++++------------
 tests/entrypoints/openai/test_completion.py   |  97 ++++----------
 .../openai/test_guided_processors.py          |  65 ++--------
 4 files changed, 144 insertions(+), 206 deletions(-)
 create mode 100644 tests/entrypoints/openai/conftest.py

diff --git a/tests/entrypoints/openai/conftest.py b/tests/entrypoints/openai/conftest.py
new file mode 100644
index 000000000..0837644f2
--- /dev/null
+++ b/tests/entrypoints/openai/conftest.py
@@ -0,0 +1,69 @@
+import pytest
+
+
+@pytest.fixture
+def sample_regex():
+    return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
+            r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
+
+
+@pytest.fixture
+def sample_json_schema():
+    return {
+        "type": "object",
+        "properties": {
+            "name": {
+                "type": "string"
+            },
+            "age": {
+                "type": "integer"
+            },
+            "skills": {
+                "type": "array",
+                "items": {
+                    "type": "string",
+                    "maxLength": 10
+                },
+                "minItems": 3
+            },
+            "work_history": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "company": {
+                            "type": "string"
+                        },
+                        "duration": {
+                            "type": "number"
+                        },
+                        "position": {
+                            "type": "string"
+                        }
+                    },
+                    "required": ["company", "position"]
+                }
+            }
+        },
+        "required": ["name", "age", "skills", "work_history"]
+    }
+
+
+@pytest.fixture
+def sample_guided_choice():
+    return [
+        "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript",
+        "Ruby", "Swift", "Kotlin"
+    ]
+
+
+@pytest.fixture
+def sample_sql_statements():
+    return ("""
+start: select_statement
+select_statement: "SELECT" column "from" table "where" condition
+column: "col_1" | "col_2"
+table: "table_1" | "table_2"
+condition: column "=" number
+number: "1" | "2"
+""")
\ No newline at end of file
diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index 3e80214f2..d6df82694 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -22,53 +22,6 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 # generation quality here
 LORA_NAME = "typeof/zephyr-7b-beta-lora"
 
-TEST_SCHEMA = {
-    "type": "object",
-    "properties": {
-        "name": {
-            "type": "string"
-        },
-        "age": {
-            "type": "integer"
-        },
-        "skills": {
-            "type": "array",
-            "items": {
-                "type": "string",
-                "maxLength": 10
-            },
-            "minItems": 3
-        },
-        "work history": {
-            "type": "array",
-            "items": {
-                "type": "object",
-                "properties": {
-                    "company": {
-                        "type": "string"
-                    },
-                    "duration": {
-                        "type": "string"
-                    },
-                    "position": {
-                        "type": "string"
-                    }
-                },
-                "required": ["company", "position"]
-            }
-        }
-    },
-    "required": ["name", "age", "skills", "work history"]
-}
-
-TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
-              r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
-
-TEST_CHOICE = [
-    "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby",
-    "Swift", "Kotlin"
-]
-
 
 @pytest.fixture(scope="module")
 def zephyr_lora_files():
@@ -408,7 +361,8 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
 async def test_guided_choice_chat(client: openai.AsyncOpenAI,
-                                  guided_decoding_backend: str):
+                                  guided_decoding_backend: str,
+                                  sample_guided_choice):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -422,10 +376,10 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
         model=MODEL_NAME,
         messages=messages,
         max_tokens=10,
-        extra_body=dict(guided_choice=TEST_CHOICE,
+        extra_body=dict(guided_choice=sample_guided_choice,
                         guided_decoding_backend=guided_decoding_backend))
     choice1 = chat_completion.choices[0].message.content
-    assert choice1 in TEST_CHOICE
+    assert choice1 in sample_guided_choice
 
     messages.append({"role": "assistant", "content": choice1})
     messages.append({
@@ -436,10 +390,10 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
         model=MODEL_NAME,
         messages=messages,
         max_tokens=10,
-        extra_body=dict(guided_choice=TEST_CHOICE,
+        extra_body=dict(guided_choice=sample_guided_choice,
                         guided_decoding_backend=guided_decoding_backend))
     choice2 = chat_completion.choices[0].message.content
-    assert choice2 in TEST_CHOICE
+    assert choice2 in sample_guided_choice
     assert choice1 != choice2
 
 
@@ -447,7 +401,8 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
 async def test_guided_json_chat(client: openai.AsyncOpenAI,
-                                guided_decoding_backend: str):
+                                guided_decoding_backend: str,
+                                sample_json_schema):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -456,18 +411,18 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
         "user",
         "content":
         f"Give an example JSON for an employee profile that "
-        f"fits this schema: {TEST_SCHEMA}"
+        f"fits this schema: {sample_json_schema}"
     }]
     chat_completion = await client.chat.completions.create(
         model=MODEL_NAME,
         messages=messages,
         max_tokens=1000,
-        extra_body=dict(guided_json=TEST_SCHEMA,
+        extra_body=dict(guided_json=sample_json_schema,
                         guided_decoding_backend=guided_decoding_backend))
     message = chat_completion.choices[0].message
     assert message.content is not None
     json1 = json.loads(message.content)
-    jsonschema.validate(instance=json1, schema=TEST_SCHEMA)
+    jsonschema.validate(instance=json1, schema=sample_json_schema)
 
     messages.append({"role": "assistant", "content": message.content})
     messages.append({
@@ -480,12 +435,12 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
         model=MODEL_NAME,
         messages=messages,
         max_tokens=1000,
-        extra_body=dict(guided_json=TEST_SCHEMA,
+        extra_body=dict(guided_json=sample_json_schema,
                         guided_decoding_backend=guided_decoding_backend))
     message = chat_completion.choices[0].message
     assert message.content is not None
     json2 = json.loads(message.content)
-    jsonschema.validate(instance=json2, schema=TEST_SCHEMA)
+    jsonschema.validate(instance=json2, schema=sample_json_schema)
     assert json1["name"] != json2["name"]
     assert json1["age"] != json2["age"]
 
@@ -494,7 +449,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
 async def test_guided_regex_chat(client: openai.AsyncOpenAI,
-                                 guided_decoding_backend: str):
+                                 guided_decoding_backend: str, sample_regex):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -502,17 +457,17 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
         "role":
         "user",
         "content":
-        f"Give an example IP address with this regex: {TEST_REGEX}"
+        f"Give an example IP address with this regex: {sample_regex}"
     }]
     chat_completion = await client.chat.completions.create(
         model=MODEL_NAME,
         messages=messages,
         max_tokens=20,
-        extra_body=dict(guided_regex=TEST_REGEX,
+        extra_body=dict(guided_regex=sample_regex,
                         guided_decoding_backend=guided_decoding_backend))
     ip1 = chat_completion.choices[0].message.content
     assert ip1 is not None
-    assert re.fullmatch(TEST_REGEX, ip1) is not None
+    assert re.fullmatch(sample_regex, ip1) is not None
 
     messages.append({"role": "assistant", "content": ip1})
     messages.append({"role": "user", "content": "Give me a different one"})
@@ -520,11 +475,11 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
         model=MODEL_NAME,
         messages=messages,
         max_tokens=20,
-        extra_body=dict(guided_regex=TEST_REGEX,
+        extra_body=dict(guided_regex=sample_regex,
                         guided_decoding_backend=guided_decoding_backend))
     ip2 = chat_completion.choices[0].message.content
     assert ip2 is not None
-    assert re.fullmatch(TEST_REGEX, ip2) is not None
+    assert re.fullmatch(sample_regex, ip2) is not None
     assert ip1 != ip2
 
 
@@ -553,7 +508,8 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
 async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
-                                           guided_decoding_backend: str):
+                                           guided_decoding_backend: str,
+                                           sample_guided_choice):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -569,7 +525,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
         max_tokens=10,
         logprobs=True,
         top_logprobs=5,
-        extra_body=dict(guided_choice=TEST_CHOICE,
+        extra_body=dict(guided_choice=sample_guided_choice,
                         guided_decoding_backend=guided_decoding_backend))
 
     assert chat_completion.choices[0].logprobs is not None
@@ -585,7 +541,8 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
 async def test_named_tool_use(client: openai.AsyncOpenAI,
-                              guided_decoding_backend: str):
+                              guided_decoding_backend: str,
+                              sample_json_schema):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -594,7 +551,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
         "user",
         "content":
         f"Give an example JSON for an employee profile that "
-        f"fits this schema: {TEST_SCHEMA}"
+        f"fits this schema: {sample_json_schema}"
     }]
 
     # non-streaming
@@ -608,7 +565,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
             "function": {
                 "name": "dummy_function_name",
                 "description": "This is a dummy function",
-                "parameters": TEST_SCHEMA
+                "parameters": sample_json_schema
             }
         }],
         tool_choice={
@@ -621,7 +578,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
     assert len(message.content) == 0
     json_string = message.tool_calls[0].function.arguments
     json1 = json.loads(json_string)
-    jsonschema.validate(instance=json1, schema=TEST_SCHEMA)
+    jsonschema.validate(instance=json1, schema=sample_json_schema)
 
     messages.append({"role": "assistant", "content": json_string})
     messages.append({
@@ -642,7 +599,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
             "function": {
                 "name": "dummy_function_name",
                 "description": "This is a dummy function",
-                "parameters": TEST_SCHEMA
+                "parameters": sample_json_schema
             }
         }],
         tool_choice={
@@ -667,7 +624,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
     # finish reason should only return in last block
     assert finish_reason_count == 1
     json2 = json.loads("".join(output))
-    jsonschema.validate(instance=json2, schema=TEST_SCHEMA)
+    jsonschema.validate(instance=json2, schema=sample_json_schema)
     assert json1["name"] != json2["name"]
     assert json1["age"] != json2["age"]
 
@@ -675,7 +632,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
 async def test_required_tool_use_not_yet_supported(
-        client: openai.AsyncOpenAI, guided_decoding_backend: str):
+        client: openai.AsyncOpenAI, guided_decoding_backend: str,
+        sample_json_schema):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -684,7 +642,7 @@ async def test_required_tool_use_not_yet_supported(
         "user",
         "content":
         f"Give an example JSON for an employee profile that "
-        f"fits this schema: {TEST_SCHEMA}"
+        f"fits this schema: {sample_json_schema}"
     }]
 
     with pytest.raises(openai.BadRequestError):
@@ -697,7 +655,7 @@ async def test_required_tool_use_not_yet_supported(
                 "function": {
                     "name": "dummy_function_name",
                     "description": "This is a dummy function",
-                    "parameters": TEST_SCHEMA
+                    "parameters": sample_json_schema
                 }
             }],
             tool_choice="required")
@@ -712,7 +670,7 @@ async def test_required_tool_use_not_yet_supported(
                 "function": {
                     "name": "dummy_function_name",
                     "description": "This is a dummy function",
-                    "parameters": TEST_SCHEMA
+                    "parameters": sample_json_schema
                 }
             }],
             tool_choice="auto")
@@ -720,8 +678,9 @@ async def test_required_tool_use_not_yet_supported(
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
-async def test_inconsistent_tool_choice_and_tools(
-        client: openai.AsyncOpenAI, guided_decoding_backend: str):
+async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
+                                                  guided_decoding_backend: str,
+                                                  sample_json_schema):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -730,7 +689,7 @@ async def test_inconsistent_tool_choice_and_tools(
         "user",
         "content":
         f"Give an example JSON for an employee profile that "
-        f"fits this schema: {TEST_SCHEMA}"
+        f"fits this schema: {sample_json_schema}"
     }]
 
     with pytest.raises(openai.BadRequestError):
@@ -755,7 +714,7 @@ async def test_inconsistent_tool_choice_and_tools(
                 "function": {
                     "name": "dummy_function_name",
                     "description": "This is a dummy function",
-                    "parameters": TEST_SCHEMA
+                    "parameters": sample_json_schema
                 }
             }],
             tool_choice={
diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index 52a848b78..d222981d2 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -24,53 +24,6 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 # generation quality here
 LORA_NAME = "typeof/zephyr-7b-beta-lora"
 
-TEST_SCHEMA = {
-    "type": "object",
-    "properties": {
-        "name": {
-            "type": "string"
-        },
-        "age": {
-            "type": "integer"
-        },
-        "skills": {
-            "type": "array",
-            "items": {
-                "type": "string",
-                "maxLength": 10
-            },
-            "minItems": 3
-        },
-        "work history": {
-            "type": "array",
-            "items": {
-                "type": "object",
-                "properties": {
-                    "company": {
-                        "type": "string"
-                    },
-                    "duration": {
-                        "type": "string"
-                    },
-                    "position": {
-                        "type": "string"
-                    }
-                },
-                "required": ["company", "position"]
-            }
-        }
-    },
-    "required": ["name", "age", "skills", "work history"]
-}
-
-TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
-              r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
-
-TEST_CHOICE = [
-    "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby",
-    "Swift", "Kotlin"
-]
-
 
 @pytest.fixture(scope="module")
 def zephyr_lora_files():
@@ -529,77 +482,71 @@ async def test_logits_bias(client: openai.AsyncOpenAI):
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
 async def test_guided_json_completion(client: openai.AsyncOpenAI,
-                                      guided_decoding_backend: str):
+                                      guided_decoding_backend: str,
+                                      sample_json_schema):
     completion = await client.completions.create(
         model=MODEL_NAME,
         prompt=f"Give an example JSON for an employee profile "
-        f"that fits this schema: {TEST_SCHEMA}",
+        f"that fits this schema: {sample_json_schema}",
         n=3,
         temperature=1.0,
         max_tokens=500,
-        extra_body=dict(guided_json=TEST_SCHEMA,
+        extra_body=dict(guided_json=sample_json_schema,
                         guided_decoding_backend=guided_decoding_backend))
 
     assert completion.id is not None
     assert len(completion.choices) == 3
     for i in range(3):
         output_json = json.loads(completion.choices[i].text)
-        jsonschema.validate(instance=output_json, schema=TEST_SCHEMA)
+        jsonschema.validate(instance=output_json, schema=sample_json_schema)
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
 async def test_guided_regex_completion(client: openai.AsyncOpenAI,
-                                       guided_decoding_backend: str):
+                                       guided_decoding_backend: str,
+                                       sample_regex):
     completion = await client.completions.create(
         model=MODEL_NAME,
-        prompt=f"Give an example IPv4 address with this regex: {TEST_REGEX}",
+        prompt=f"Give an example IPv4 address with this regex: {sample_regex}",
         n=3,
         temperature=1.0,
         max_tokens=20,
-        extra_body=dict(guided_regex=TEST_REGEX,
+        extra_body=dict(guided_regex=sample_regex,
                         guided_decoding_backend=guided_decoding_backend))
 
     assert completion.id is not None
     assert len(completion.choices) == 3
     for i in range(3):
-        assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None
+        assert re.fullmatch(sample_regex,
+                            completion.choices[i].text) is not None
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
 async def test_guided_choice_completion(client: openai.AsyncOpenAI,
-                                        guided_decoding_backend: str):
+                                        guided_decoding_backend: str,
+                                        sample_guided_choice):
     completion = await client.completions.create(
         model=MODEL_NAME,
         prompt="The best language for type-safe systems programming is ",
         n=2,
         temperature=1.0,
         max_tokens=10,
-        extra_body=dict(guided_choice=TEST_CHOICE,
+        extra_body=dict(guided_choice=sample_guided_choice,
                         guided_decoding_backend=guided_decoding_backend))
 
     assert completion.id is not None
     assert len(completion.choices) == 2
     for i in range(2):
-        assert completion.choices[i].text in TEST_CHOICE
+        assert completion.choices[i].text in sample_guided_choice
 
 
 @pytest.mark.asyncio
-async def test_guided_grammar(client: openai.AsyncOpenAI):
-    simple_sql_grammar = """
-start: select_statement
-
-select_statement: "SELECT" column "from" table "where" condition
-
-column: "col_1" | "col_2"
-table: "table_1" | "table_2"
-condition: column "=" number
-
-number: "1" | "2"
-"""
+async def test_guided_grammar(client: openai.AsyncOpenAI,
+                              sample_sql_statements):
 
     completion = await client.completions.create(
         model=MODEL_NAME,
@@ -607,13 +554,13 @@ number: "1" | "2"
                 "table_1 where it is equals to 1"),
         temperature=1.0,
         max_tokens=500,
-        extra_body=dict(guided_grammar=simple_sql_grammar))
+        extra_body=dict(guided_grammar=sample_sql_statements))
 
     content = completion.choices[0].text
 
     # use Lark to parse the output, and make sure it's a valid parse tree
     from lark import Lark
-    parser = Lark(simple_sql_grammar)
+    parser = Lark(sample_sql_statements)
     parser.parse(content)
 
     # remove spaces for comparison b/c we removed them in the grammar
@@ -661,7 +608,8 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
 async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
-                                          guided_decoding_backend: str):
+                                          guided_decoding_backend: str,
+                                          sample_json_schema, sample_regex):
     with pytest.raises(openai.BadRequestError):
         _ = await client.completions.create(
             model=MODEL_NAME,
@@ -673,7 +621,8 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
         _ = await client.completions.create(
             model=MODEL_NAME,
             prompt="Give an example string that fits this regex",
-            extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA))
+            extra_body=dict(guided_regex=sample_regex,
+                            guided_json=sample_json_schema))
 
 
 @pytest.mark.asyncio
diff --git a/tests/entrypoints/openai/test_guided_processors.py b/tests/entrypoints/openai/test_guided_processors.py
index 27568d3e7..85cb4d522 100644
--- a/tests/entrypoints/openai/test_guided_processors.py
+++ b/tests/entrypoints/openai/test_guided_processors.py
@@ -10,59 +10,17 @@ from vllm.model_executor.guided_decoding import (
 from vllm.model_executor.guided_decoding.outlines_logits_processors import (
     JSONLogitsProcessor, RegexLogitsProcessor)
 
-TEST_SCHEMA = {
-    "type": "object",
-    "properties": {
-        "name": {
-            "type": "string"
-        },
-        "age": {
-            "type": "integer"
-        },
-        "skills": {
-            "type": "array",
-            "items": {
-                "type": "string",
-                "maxLength": 10
-            },
-            "minItems": 3
-        },
-        "work history": {
-            "type": "array",
-            "items": {
-                "type": "object",
-                "properties": {
-                    "company": {
-                        "type": "string"
-                    },
-                    "duration": {
-                        "type": "string"
-                    },
-                    "position": {
-                        "type": "string"
-                    }
-                },
-                "required": ["company", "position"]
-            }
-        }
-    },
-    "required": ["name", "age", "skills", "work history"]
-}
 
-TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
-              r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
-
-
-def test_guided_logits_processors():
+def test_guided_logits_processors(sample_regex, sample_json_schema):
     """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
     tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
-    regex_LP = RegexLogitsProcessor(TEST_REGEX, tokenizer)
-    json_LP = JSONLogitsProcessor(TEST_SCHEMA,
+    regex_LP = RegexLogitsProcessor(sample_regex, tokenizer)
+    json_LP = JSONLogitsProcessor(sample_json_schema,
                                   tokenizer,
                                   whitespace_pattern=None)
 
     token_ids = tokenizer.encode(
-        f"Give an example IPv4 address with this regex: {TEST_REGEX}")
+        f"Give an example IPv4 address with this regex: {sample_regex}")
     tensor = torch.rand(32000)
     original_tensor = torch.clone(tensor)
     regex_LP(token_ids, tensor)
@@ -70,7 +28,8 @@ def test_guided_logits_processors():
     assert not torch.allclose(tensor, original_tensor)
 
     token_ids = tokenizer.encode(
-        f"Give an employee profile that fits this schema: {TEST_SCHEMA}")
+        f"Give an employee profile that fits this schema: {sample_json_schema}"
+    )
     tensor = torch.rand(32000)
     original_tensor = torch.clone(tensor)
     json_LP(token_ids, tensor)
@@ -80,13 +39,14 @@ def test_guided_logits_processors():
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("backend", ["outlines", "lm-format-enforcer"])
-async def test_guided_logits_processor_black_box(backend: str):
+async def test_guided_logits_processor_black_box(backend: str, sample_regex,
+                                                 sample_json_schema):
     tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
     token_ids = tokenizer.encode(
-        f"Give an example IPv4 address with this regex: {TEST_REGEX}")
+        f"Give an example IPv4 address with this regex: {sample_regex}")
     regex_request = CompletionRequest(model='test',
                                       prompt=token_ids,
-                                      guided_regex=TEST_REGEX)
+                                      guided_regex=sample_regex)
     regex_lp = await get_guided_decoding_logits_processor(
         backend, regex_request, tokenizer)
     assert regex_lp is not None
@@ -97,10 +57,11 @@ async def test_guided_logits_processor_black_box(backend: str):
     assert not torch.allclose(tensor, original_tensor)
 
     token_ids = tokenizer.encode(
-        f"Give an employee profile that fits this schema: {TEST_SCHEMA}")
+        f"Give an employee profile that fits this schema: {sample_json_schema}"
+    )
     json_request = CompletionRequest(model='test',
                                      prompt=token_ids,
-                                     guided_json=TEST_SCHEMA)
+                                     guided_json=sample_json_schema)
     json_lp = await get_guided_decoding_logits_processor(
         backend, json_request, tokenizer)
     assert json_lp is not None
-- 
GitLab


From b75bce1008fba05ae6e0dcb8060a61015a3c0129 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Fri, 12 Jul 2024 09:58:38 -0700
Subject: [PATCH 333/376] [ci] Add grouped tests & mark tests to run by default
 for fastcheck pipeline (#6365)

Signed-off-by: kevin <kevin@anyscale.com>
---
 .buildkite/test-pipeline.yaml | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 3e22310ea..9f388b6f8 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -7,8 +7,33 @@
 
 
 steps:
+- label: Async Engine, Inputs, Utils, Worker Test
+  fast_check: true
+  fast_check_only: true
+  commands:
+  - pytest -v -s async_engine # Async Engine
+  - bash ../.buildkite/download-images.sh # Inputs 
+  - pytest -v -s test_inputs.py
+  - pytest -v -s multimodal
+  - pytest -v -s test_utils.py # Utils
+  - pytest -v -s worker # Worker
+
+- label: Tensorizer, Metrics, Tracing Test
+  fast_check: true
+  fast_check_only: true
+  commands:
+  - apt-get install curl libsodium23 && pytest -v -s tensorizer_loader # Tensorizer
+  - pytest -v -s metrics # Metrics
+  - "pip install \
+      opentelemetry-sdk \
+      opentelemetry-api \
+      opentelemetry-exporter-otlp \
+      opentelemetry-semantic-conventions-ai" # Tracing
+  - pytest -v -s tracing
+
 - label: Regression Test
   mirror_hardwares: [amd]
+  fast_check: true
   command: pytest -v -s test_regression.py
   working_dir: "/vllm-workspace/tests" # optional
 
@@ -18,6 +43,7 @@ steps:
 
 - label: Basic Correctness Test
   mirror_hardwares: [amd]
+  fast_check: true
   commands:
   - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
@@ -29,6 +55,7 @@ steps:
 
 - label: Core Test
   mirror_hardwares: [amd]
+  fast_check: true
   commands: 
   - pytest -v -s core
   - pytest -v -s distributed/test_parallel_state.py
@@ -68,6 +95,7 @@ steps:
   #mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
+  fast_check: true
   commands:
   - pytest -v -s distributed/test_pynccl.py
   # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
@@ -95,6 +123,7 @@ steps:
   - pytest -v -s tokenization
 
 - label: Entrypoints Test
+  fast_check: true
   mirror_hardwares: [amd]
 
   commands:
@@ -228,6 +257,7 @@ steps:
 
 - label: Documentation Build
   working_dir: "/vllm-workspace/test_docs/docs"
+  fast_check: true
   no_gpu: True
   commands:
   - pip install -r requirements-docs.txt
-- 
GitLab


From 4dbebd03cc0ba514174648c65250ec3faac8ef69 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Fri, 12 Jul 2024 11:36:26 -0700
Subject: [PATCH 334/376] [ci] Add GHA workflows to enable full CI run (#6381)

Signed-off-by: kevin <kevin@anyscale.com>
---
 .github/workflows/add_label_automerge.yml     | 21 +++++++++++++++++
 .github/workflows/add_label_ready_comment.yml | 23 +++++++++++++++++++
 .github/workflows/reminder_comment.yml        | 21 +++++++++++++++++
 3 files changed, 65 insertions(+)
 create mode 100644 .github/workflows/add_label_automerge.yml
 create mode 100644 .github/workflows/add_label_ready_comment.yml
 create mode 100644 .github/workflows/reminder_comment.yml

diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml
new file mode 100644
index 000000000..cd53b764c
--- /dev/null
+++ b/.github/workflows/add_label_automerge.yml
@@ -0,0 +1,21 @@
+name: Add label on auto-merge enabled
+on:
+    pull_request_target:
+        types:
+            - auto_merge_enabled
+jobs:
+    add-label-on-auto-merge:
+        runs-on: ubuntu-latest
+        steps:
+            -   name: Add label
+                uses: actions/github-script@v5
+                with:
+                    script: |
+                        github.rest.issues.addLabels({
+                            owner: context.repo.owner,
+                            repo: context.repo.repo,
+                            issue_number: context.issue.number,
+                            labels: ['ready']
+                        })
+                env:
+                    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/add_label_ready_comment.yml b/.github/workflows/add_label_ready_comment.yml
new file mode 100644
index 000000000..729c1452a
--- /dev/null
+++ b/.github/workflows/add_label_ready_comment.yml
@@ -0,0 +1,23 @@
+name: Add Ready Label on Ready Comment
+
+on:
+  issue_comment:
+    types: [created]
+
+jobs:
+  add-ready-label:
+    runs-on: ubuntu-latest
+    if: github.event.issue.pull_request && contains(github.event.comment.body, '/ready')
+    steps:
+        -   name: Add label
+            uses: actions/github-script@v5
+            with:
+                script: |
+                    github.rest.issues.addLabels({
+                        owner: context.repo.owner,
+                        repo: context.repo.repo,
+                        issue_number: context.issue.number,
+                        labels: ['ready']
+                    })
+            env:
+                GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml
new file mode 100644
index 000000000..978d81ad5
--- /dev/null
+++ b/.github/workflows/reminder_comment.yml
@@ -0,0 +1,21 @@
+name: PR Reminder Comment Bot
+on:
+  pull_request_target:
+    types: [opened]
+
+jobs:
+  pr_reminder:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Remind to run full CI on PR
+        uses: actions/github-script@v6
+        with:
+          script: |
+            github.rest.issues.createComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+              body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only trigger `fastcheck` CI to run, which consists only a small and essential subset of tests to quickly catch small errors.\n\nFull CI run is still required to merge this PR so please make sure that you run full CI before merging or if you need more test signals.\n\n To run full CI, you can do one of these:\n- Add `ready` label to the PR\n- Comment `/ready` on the PR\n- Enable auto-merge.\n\n🚀'
+            })
+        env:
+                    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-- 
GitLab


From aa48e502fba074a3c3afeeba0267d0f9e9f205db Mon Sep 17 00:00:00 2001
From: Cody Yu <hao.yu.cody@gmail.com>
Date: Fri, 12 Jul 2024 12:04:26 -0700
Subject: [PATCH 335/376] [MISC] Upgrade dependency to PyTorch 2.3.1 (#5327)

---
 .github/workflows/publish.yml | 2 +-
 CMakeLists.txt                | 2 +-
 pyproject.toml                | 2 +-
 requirements-build.txt        | 2 +-
 requirements-cuda.txt         | 8 ++++----
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 9c35ede5f..15c2ec05b 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -49,7 +49,7 @@ jobs:
       matrix:
           os: ['ubuntu-20.04']
           python-version: ['3.8', '3.9', '3.10', '3.11']
-          pytorch-version: ['2.3.0']  # Must be the most recent version that meets requirements-cuda.txt.
+          pytorch-version: ['2.3.1']  # Must be the most recent version that meets requirements-cuda.txt.
           cuda-version: ['11.8', '12.1']
 
     steps:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 31f7a9738..ced73ca03 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,7 +32,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.3.1")
 set(TORCH_SUPPORTED_VERSION_ROCM "2.4.0")
 
 #
diff --git a/pyproject.toml b/pyproject.toml
index 790e01362..1ba1eacd9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "ninja",
     "packaging",
     "setuptools >= 49.4.0",
-    "torch == 2.3.0",
+    "torch == 2.3.1",
     "wheel",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements-build.txt b/requirements-build.txt
index 1a07a94e8..b05f38a0e 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -3,5 +3,5 @@ cmake>=3.21
 ninja
 packaging
 setuptools>=49.4.0
-torch==2.3.0
+torch==2.3.1
 wheel
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 10596ed85..3eb91212e 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -4,8 +4,8 @@
 # Dependencies for NVIDIA GPUs
 ray >= 2.9
 nvidia-ml-py # for pynvml package
-torch == 2.3.0
+torch == 2.3.1
 # These must be updated alongside torch
-torchvision == 0.18.0   # Required for phi3v processor, also see https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-xformers == 0.0.26.post1  # Requires PyTorch 2.3.0
-vllm-flash-attn == 2.5.9  # Requires PyTorch 2.3.0
+torchvision == 0.18.1   # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+xformers == 0.0.27  # Requires PyTorch 2.3.1
+vllm-flash-attn == 2.5.9.post1  # Requires PyTorch 2.3.1
-- 
GitLab


From d719ba24c5b4e669bf51b49293cab09f2ce7361c Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Fri, 12 Jul 2024 13:56:59 -0700
Subject: [PATCH 336/376] Build some nightly wheels by default (#6380)

---
 .buildkite/release-pipeline.yaml             | 34 ++++++++++++++++++--
 docs/source/getting_started/installation.rst | 14 ++++++++
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index c394f3fd7..c624c893d 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,5 +1,24 @@
 steps:
-  - block: "Build wheels"
+  - label: "Build wheel default - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}" 
+    agents:
+      queue: cpu_queue
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image -e CMAKE_BUILD_TYPE=Release --target build --progress plain ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host"
+      # rename the files to change linux -> manylinux1
+      - "for f in artifacts/dist/*.whl; do mv -- \"$f\" \"${f/linux/manylinux1}\"; done"
+      - "aws s3 cp --recursive --acl public-read artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
+      - "aws s3 cp --recursive --acl public-read artifacts/dist s3://vllm-wheels/nightly/"
+    matrix:
+      setup:
+        cuda_version:
+          - "12.1.0"
+        python_version:
+          - "3.10"
+          - "3.11"
+
+  - block: "Build wheels full"
 
   - label: "Build wheel - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}"
     agents:
@@ -8,7 +27,9 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host"
-      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
+      - "for f in artifacts/dist/*.whl; do mv -- \"$f\" \"${f/linux/manylinux1}\"; done
+      - "aws s3 cp --recursive --acl public-read artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
+      - "aws s3 cp --recursive --acl public-read artifacts/dist s3://vllm-wheels/nightly/"
     matrix:
       setup:
         cuda_version:
@@ -19,3 +40,12 @@ steps:
           - "3.9"
           - "3.10"
           - "3.11"
+      adjustments:
+      - with:
+          cuda_version: "12.1.0"
+          python_version: "3.10"
+        skip: true
+      - with:
+          cuda_version: "12.1.0"
+          python_version: "3.11"
+        skip: true
diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
index d458b0235..a9dfac8ff 100644
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -42,6 +42,20 @@ You can install vLLM using pip:
 
     Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions.
 
+.. note::
+
+    vLLM also publishes a subset of wheels (Python 3.10, 3.11 with CUDA 12) for every commit since v0.5.3. You can download them with the following command:
+
+    .. code-block:: console
+
+        $ export VLLM_VERSION=0.5.2 # vLLM's main branch version is currently set to latest released tag
+        $ export PYTHON_VERSION=310
+        $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-${VLLM_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl
+        $ # You can also access a specific commit
+        $ # export VLLM_COMMIT=...
+        $ # pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-${VLLM_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl
+
+
 .. _build_from_source:
 
 Build from source
-- 
GitLab


From bb1a784b05cde68a2a657431c4ab21839ca4c756 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Fri, 12 Jul 2024 14:00:57 -0700
Subject: [PATCH 337/376] Fix release-pipeline.yaml (#6388)

---
 .buildkite/release-pipeline.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index c624c893d..62483950d 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -27,7 +27,7 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host"
-      - "for f in artifacts/dist/*.whl; do mv -- \"$f\" \"${f/linux/manylinux1}\"; done
+      - "for f in artifacts/dist/*.whl; do mv -- \"$f\" \"${f/linux/manylinux1}\"; done"
       - "aws s3 cp --recursive --acl public-read artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
       - "aws s3 cp --recursive --acl public-read artifacts/dist s3://vllm-wheels/nightly/"
     matrix:
-- 
GitLab


From 07b35af86d984e529e4099b0d348b71ad820c23d Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Fri, 12 Jul 2024 14:03:39 -0700
Subject: [PATCH 338/376] Fix interpolation in release pipeline (#6389)

---
 .buildkite/release-pipeline.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 62483950d..1fa497420 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -7,7 +7,7 @@ steps:
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host"
       # rename the files to change linux -> manylinux1
-      - "for f in artifacts/dist/*.whl; do mv -- \"$f\" \"${f/linux/manylinux1}\"; done"
+      - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
       - "aws s3 cp --recursive --acl public-read artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
       - "aws s3 cp --recursive --acl public-read artifacts/dist s3://vllm-wheels/nightly/"
     matrix:
@@ -27,7 +27,7 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host"
-      - "for f in artifacts/dist/*.whl; do mv -- \"$f\" \"${f/linux/manylinux1}\"; done"
+      - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
       - "aws s3 cp --recursive --acl public-read artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
       - "aws s3 cp --recursive --acl public-read artifacts/dist s3://vllm-wheels/nightly/"
     matrix:
-- 
GitLab


From 21b2dcedabb84332a73e8cda3886a6988c52a7df Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Fri, 12 Jul 2024 14:08:04 -0700
Subject: [PATCH 339/376] Fix release pipeline's -e flag (#6390)

---
 .buildkite/release-pipeline.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 1fa497420..138281c8f 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -3,7 +3,7 @@ steps:
     agents:
       queue: cpu_queue
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image -e CMAKE_BUILD_TYPE=Release --target build --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host"
       # rename the files to change linux -> manylinux1
-- 
GitLab


From 75f64d8b94d012ea37dddde1058ce17e55001a4a Mon Sep 17 00:00:00 2001
From: Cody Yu <hao.yu.cody@gmail.com>
Date: Fri, 12 Jul 2024 14:33:33 -0700
Subject: [PATCH 340/376] [Bugfix] Fix illegal memory access in FP8 MoE kernel
 (#6382)

---
 vllm/model_executor/layers/fused_moe/fused_moe.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index a29622b7d..3c62008fb 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -492,12 +492,14 @@ def fused_experts(hidden_states: torch.Tensor,
         if tokens_in_chunk == 0:
             break
 
-        if tokens_in_chunk < CHUNK_SIZE:
-            # will only happen in the last chunk
+        if tokens_in_chunk < CHUNK_SIZE and chunk > 0:
+            # Adjust the intermediate cache size and config for the last
+            # chunk. Note that in most cases we only have one chunk
+            # so the cache size and config are already set correctly and
+            # do not need to be adjusted.
             intermediate_cache1 = intermediate_cache1[:tokens_in_chunk]
             intermediate_cache2 = intermediate_cache2[:tokens_in_chunk]
             intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]
-            # reload config to get better performance on the last chunk
             config = get_config_func(tokens_in_chunk)
 
         curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx]
-- 
GitLab


From 111fc6e7ecb8e5eb665606cf79209495e874479b Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Fri, 12 Jul 2024 18:52:15 -0400
Subject: [PATCH 341/376] [Misc] Add generated git commit hash as
 `vllm.__commit__` (#6386)

---
 .gitignore                    |  3 +++
 setup.py                      | 24 ++++++++++++++++++++++++
 tests/test_embedded_commit.py |  7 +++++++
 vllm/__init__.py              |  3 ++-
 vllm/version.py               | 11 +++++++++++
 5 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_embedded_commit.py

diff --git a/.gitignore b/.gitignore
index e077366d1..17184b191 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+# vllm commit id, generated by setup.py
+vllm/commit_id.py
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/setup.py b/setup.py
index 067ad13fe..485cfe36b 100644
--- a/setup.py
+++ b/setup.py
@@ -5,6 +5,7 @@ import os
 import re
 import subprocess
 import sys
+import warnings
 from shutil import which
 from typing import Dict, List
 
@@ -26,6 +27,29 @@ def load_module_from_path(module_name, path):
 ROOT_DIR = os.path.dirname(__file__)
 logger = logging.getLogger(__name__)
 
+
+def embed_commit_hash():
+    try:
+        commit_id = subprocess.check_output(["git", "rev-parse", "HEAD"],
+                                            encoding="utf-8").strip()
+        commit_contents = f'__commit__ = "{commit_id}"\n'
+
+        version_file = os.path.join(ROOT_DIR, "vllm", "commit_id.py")
+        with open(version_file, "w", encoding="utf-8") as f:
+            f.write(commit_contents)
+
+    except subprocess.CalledProcessError as e:
+        warnings.warn(f"Failed to get commit hash:\n{e}",
+                      RuntimeWarning,
+                      stacklevel=2)
+    except Exception as e:
+        warnings.warn(f"Failed to embed commit hash:\n{e}",
+                      RuntimeWarning,
+                      stacklevel=2)
+
+
+embed_commit_hash()
+
 # cannot import envs directly because it depends on vllm,
 #  which is not installed yet
 envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
diff --git a/tests/test_embedded_commit.py b/tests/test_embedded_commit.py
new file mode 100644
index 000000000..17b01651e
--- /dev/null
+++ b/tests/test_embedded_commit.py
@@ -0,0 +1,7 @@
+import vllm
+
+
+def test_embedded_commit_defined():
+    assert vllm.__commit__ != "COMMIT_HASH_PLACEHOLDER"
+    # 7 characters is the length of a short commit hash
+    assert len(vllm.__commit__) >= 7
diff --git a/vllm/__init__.py b/vllm/__init__.py
index e21705987..318f078fd 100644
--- a/vllm/__init__.py
+++ b/vllm/__init__.py
@@ -12,9 +12,10 @@ from vllm.outputs import (CompletionOutput, EmbeddingOutput,
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 
-from .version import __version__
+from .version import __commit__, __version__
 
 __all__ = [
+    "__commit__",
     "__version__",
     "LLM",
     "ModelRegistry",
diff --git a/vllm/version.py b/vllm/version.py
index dd9b22ccc..309f97954 100644
--- a/vllm/version.py
+++ b/vllm/version.py
@@ -1 +1,12 @@
+import warnings
+
+try:
+    import vllm.commit_id
+    __commit__ = vllm.commit_id.__commit__
+except Exception as e:
+    warnings.warn(f"Failed to read commit hash:\n{e}",
+                  RuntimeWarning,
+                  stacklevel=2)
+    __commit__ = "COMMIT_HASH_PLACEHOLDER"
+
 __version__ = "0.5.1"
-- 
GitLab


From 6bc9710f6e623adaa05b462171454f0bf543e9b0 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Fri, 12 Jul 2024 15:52:43 -0700
Subject: [PATCH 342/376] Fix release pipeline's dir permission (#6391)

---
 .buildkite/release-pipeline.yaml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 138281c8f..4a35ce925 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -5,11 +5,11 @@ steps:
     commands:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       # rename the files to change linux -> manylinux1
       - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
-      - "aws s3 cp --recursive --acl public-read artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
-      - "aws s3 cp --recursive --acl public-read artifacts/dist s3://vllm-wheels/nightly/"
+      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
+      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
     matrix:
       setup:
         cuda_version:
@@ -26,10 +26,10 @@ steps:
     commands:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
-      - "aws s3 cp --recursive --acl public-read artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
-      - "aws s3 cp --recursive --acl public-read artifacts/dist s3://vllm-wheels/nightly/"
+      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
+      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
     matrix:
       setup:
         cuda_version:
-- 
GitLab


From f8f9ff57ee365891fe9f54cd46df65cc9d5ccca0 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Fri, 12 Jul 2024 15:59:47 -0700
Subject: [PATCH 343/376] [Bugfix][TPU] Fix megacore setting for v5e-litepod
 (#6397)

---
 vllm/attention/backends/pallas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index 7a6954ceb..c45f7b28b 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -116,7 +116,7 @@ class PallasAttentionBackendImpl(AttentionImpl):
 
         self.megacore_mode = None
         tpu_type = torch_xla.tpu.get_tpu_env()["TYPE"].lower()
-        if not tpu_type.endswith("lite"):
+        if "lite" not in tpu_type:
             if self.num_kv_heads % 2 == 0:
                 self.megacore_mode = "kv_head"
             else:
-- 
GitLab


From 16ff6bd58ca20b008eaf7491d5422dfd80737570 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Fri, 12 Jul 2024 16:34:37 -0700
Subject: [PATCH 344/376] [ci] Fix wording for GH bot (#6398)

Signed-off-by: kevin <kevin@anyscale.com>
---
 .github/workflows/reminder_comment.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml
index 978d81ad5..d6924a30a 100644
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@@ -15,7 +15,7 @@ jobs:
               owner: context.repo.owner,
               repo: context.repo.repo,
               issue_number: context.issue.number,
-              body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only trigger `fastcheck` CI to run, which consists only a small and essential subset of tests to quickly catch small errors.\n\nFull CI run is still required to merge this PR so please make sure that you run full CI before merging or if you need more test signals.\n\n To run full CI, you can do one of these:\n- Add `ready` label to the PR\n- Comment `/ready` on the PR\n- Enable auto-merge.\n\n🚀'
+              body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only trigger `fastcheck` CI to run, which consists only a small and essential subset of tests to quickly catch errors with the flexibility to run extra individual tests on top (you can do this by unblocking test steps in the Buildkite run). \n\nFull CI run is still required to merge this PR so once the PR is ready to go, please make sure to run it. If you need all test signals in between PR commits, you can trigger full CI as well.\n\n To run full CI, you can do one of these:\n- Comment `/ready` on the PR\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
             })
         env:
-                    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-- 
GitLab


From a27f87da3429eafbbaf4f5372bf458ae40e30618 Mon Sep 17 00:00:00 2001
From: Saliya Ekanayake <esaliya@gmail.com>
Date: Fri, 12 Jul 2024 17:48:23 -0700
Subject: [PATCH 345/376] [Doc] Fix Typo in Doc (#6392)

Co-authored-by: Saliya Ekanayake <esaliya@d-matrix.ai>
---
 README.md             | 2 +-
 docs/source/index.rst | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 53285356a..6927c4a0d 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@ vLLM is flexible and easy to use with:
 
 - Seamless integration with popular Hugging Face models
 - High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
-- Tensor parallelism and pipieline parallelism support for distributed inference
+- Tensor parallelism and pipeline parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
 - Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 174d91b8d..2691805ed 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -38,7 +38,7 @@ vLLM is flexible and easy to use with:
 
 * Seamless integration with popular HuggingFace models
 * High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
-* Tensor parallelism and pipieline parallelism support for distributed inference
+* Tensor parallelism and pipeline parallelism support for distributed inference
 * Streaming outputs
 * OpenAI-compatible API server
 * Support NVIDIA GPUs and AMD GPUs
-- 
GitLab


From e1684a766ad3f2f1531c273fa8a056fc14c4c71e Mon Sep 17 00:00:00 2001
From: Thomas Parnell <tpa@zurich.ibm.com>
Date: Sat, 13 Jul 2024 03:30:54 +0200
Subject: [PATCH 346/376] [Bugfix] Fix hard-coded value of x in
 context_attention_fwd (#6373)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
---
 vllm/attention/ops/prefix_prefill.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index 4cd4976ad..70b544b60 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -718,7 +718,7 @@ if triton.__version__ >= "2.1.0":
                 b_ctx_len,
                 alibi_slopes,
                 v_cache.shape[3],
-                8,
+                k_cache.shape[4],
                 o,
                 b_loc.stride(0),
                 b_loc.stride(1),
@@ -768,7 +768,7 @@ if triton.__version__ >= "2.1.0":
             b_seq_len,
             b_ctx_len,
             v_cache.shape[3],
-            8,
+            k_cache.shape[4],
             o,
             b_loc.stride(0),
             b_loc.stride(1),
-- 
GitLab


From d80aef37764feda51e21065de9c669785fe1d94a Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Fri, 12 Jul 2024 19:36:53 -0700
Subject: [PATCH 347/376] [Docs] Clean up latest news (#6401)

---
 README.md | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index 6927c4a0d..b07bfcdd5 100644
--- a/README.md
+++ b/README.md
@@ -16,27 +16,12 @@ Easy, fast, and cheap LLM serving for everyone
 
 ---
 
-**Ray Summit CPF is Open (June 4th to June 20th)!**
-
-There will be a track for vLLM at the Ray Summit (09/30-10/02, SF) this year!
-If you have cool projects related to vLLM or LLM inference, we would love to see your proposals.
-This will be a great chance for everyone in the community to get together and learn.
-Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite)
-
----
-
 *Latest News* 🔥
 - [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
 - [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
-- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
-- [2024/01] Added ROCm 6.0 support to vLLM.
-- [2023/12] Added ROCm 5.7 support to vLLM.
-- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
-- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
-- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
+- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
+- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
 - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
-- [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command!
-- [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
 - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
 
 ---
-- 
GitLab


From 41708e50341c82668fd25ebc7777470cba6f5303 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 12 Jul 2024 21:51:48 -0700
Subject: [PATCH 348/376] [ci] try to add multi-node tests (#6280)

Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Co-authored-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
---
 .buildkite/run-multi-node-test.sh             | 52 ++++++++---
 .buildkite/test-pipeline.yaml                 | 16 +++-
 tests/async_engine/test_openapi_server_ray.py | 37 +++-----
 tests/distributed/test_pipeline_parallel.py   | 17 +---
 tests/distributed/test_same_node.py           |  1 +
 tests/entrypoints/openai/test_chat.py         | 57 +++++-------
 tests/entrypoints/openai/test_completion.py   | 57 +++++-------
 tests/entrypoints/openai/test_embedding.py    | 35 +++----
 tests/entrypoints/openai/test_models.py       | 57 +++++-------
 tests/entrypoints/openai/test_vision.py       | 33 +++----
 tests/tensorizer_loader/test_tensorizer.py    | 33 +++----
 tests/utils.py                                | 93 +++++++------------
 vllm/executor/ray_gpu_executor.py             | 17 ++--
 13 files changed, 230 insertions(+), 275 deletions(-)

diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh
index 0d94b2555..7ac4dcc4c 100755
--- a/.buildkite/run-multi-node-test.sh
+++ b/.buildkite/run-multi-node-test.sh
@@ -2,16 +2,17 @@
 
 set -euox pipefail
 
-if [[ $# -lt 3 ]]; then
-    echo "Please provide the number of nodes and GPU per node."
+if [[ $# -lt 4 ]]; then
+    echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
     exit 1
 fi
 
-NUM_NODES=$1
-NUM_GPUS=$2
-DOCKER_IMAGE=$3
+WORKING_DIR=$1
+NUM_NODES=$2
+NUM_GPUS=$3
+DOCKER_IMAGE=$4
 
-shift 3
+shift 4
 COMMANDS=("$@")
 if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
     echo "The number of commands must be equal to the number of nodes."
@@ -40,13 +41,40 @@ start_nodes() {
             fi
         done
         GPU_DEVICES+='"'
-        # echo "Starting node$node with GPU devices: $GPU_DEVICES"
-        docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE tail -f /dev/null
+
+        # start the container in detached mode
+        # things to note:
+        # 1. --shm-size=10.24gb is required. don't use --ipc=host
+        # 2. pass HF_TOKEN to the container
+        # 3. map the huggingface cache directory to the container
+        # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
+        #    starting from 192.168.10.11)
+        docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"
+
+        # organize containers into a ray cluster
+        if [ $node -eq 0 ]; then
+            # start the ray head node
+            docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
+            # wait for the head node to be ready
+            sleep 10
+        else
+            # start the ray worker nodes, and connect them to the head node
+            docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
+        fi
     done
+
+    # wait for the cluster to be ready
+    sleep 10
+
+    # print the cluster status
+    docker exec node0 /bin/bash -c "ray status"
 }
 
 run_nodes() {
-    for node in $(seq 0 $(($NUM_NODES-1))); do
+    # important: iterate in reverse order to start the head node last
+    # we start the worker nodes first, in detached mode, and then start the head node
+    # in the foreground, so that the output of the head node is visible in the buildkite logs
+    for node in $(seq $(($NUM_NODES - 1)) -1 0); do
         GPU_DEVICES='"device='
         for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
             DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
@@ -57,10 +85,10 @@ run_nodes() {
         done
         GPU_DEVICES+='"'
         echo "Running node$node with GPU devices: $GPU_DEVICES"
-        if [ $node -lt $(($NUM_NODES - 1)) ]; then
-            docker exec -d node$node /bin/bash -c "${COMMANDS[$node]}"
+        if [ $node -ne 0 ]; then
+            docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
         else
-            docker exec node$node /bin/bash -c "${COMMANDS[$node]}"
+            docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
         fi
     done
 }
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 9f388b6f8..c8f53224b 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -68,6 +68,17 @@ steps:
   - pytest -v -s distributed/test_comm_ops.py
   - pytest -v -s distributed/test_shm_broadcast.py
 
+- label: 2 Node Tests (4 GPUs in total)
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  num_nodes: 2
+  commands:
+  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+    - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
+  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+
 - label: Distributed Tests (2 GPUs)
   mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
@@ -213,7 +224,10 @@ steps:
 
 - label: Tensorizer Test
   #mirror_hardwares: [amd]
-  command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
+  commands:
+    - apt-get install curl libsodium23
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -v -s tensorizer_loader
 
 - label: Metrics Test
   mirror_hardwares: [amd]
diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py
index cc05d79e5..575f8f19b 100644
--- a/tests/async_engine/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server_ray.py
@@ -1,35 +1,26 @@
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
 
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "facebook/opt-125m"
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(ray_ctx):
-    return RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "float16",
-        "--max-model-len",
-        "2048",
-        "--enforce-eager",
-        "--engine-use-ray"
-    ])
+def server():
+    with RemoteOpenAIServer([
+            "--model",
+            MODEL_NAME,
+            # use half precision for speed and memory savings in CI environment
+            "--dtype",
+            "float16",
+            "--max-model-len",
+            "2048",
+            "--enforce-eager",
+            "--engine-use-ray"
+    ]) as remote_server:
+        yield remote_server
 
 
 @pytest.fixture(scope="module")
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index 6072a2dd7..2d9f63795 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -2,11 +2,8 @@ import os
 
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
 
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer
 
 # downloading lora to test lora requests
 
@@ -21,14 +18,7 @@ pytestmark = pytest.mark.asyncio
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(ray_ctx):
+def server():
     args = [
         "--model",
         MODEL_NAME,
@@ -50,7 +40,8 @@ def server(ray_ctx):
         args += [
             "--enforce-eager",
         ]
-    return RemoteOpenAIServer(args, num_gpus=PP_SIZE * TP_SIZE)
+    with RemoteOpenAIServer(args) as remote_server:
+        yield remote_server
 
 
 @pytest.fixture(scope="module")
diff --git a/tests/distributed/test_same_node.py b/tests/distributed/test_same_node.py
index 2d886eb56..07e84d0ad 100644
--- a/tests/distributed/test_same_node.py
+++ b/tests/distributed/test_same_node.py
@@ -10,3 +10,4 @@ test_result = all(
 
 expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
 assert test_result == expected, f"Expected {expected}, got {test_result}"
+print("Same node test passed!")
diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index d6df82694..d370c63c0 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -6,15 +6,12 @@ from typing import List
 import jsonschema
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
 import torch
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
 from openai import BadRequestError
 
-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -29,35 +26,29 @@ def zephyr_lora_files():
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files, ray_ctx):
-    return RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--enforce-eager",
-        # lora config below
-        "--enable-lora",
-        "--lora-modules",
-        f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_files}",
-        "--max-lora-rank",
-        "64",
-        "--max-cpu-loras",
-        "2",
-        "--max-num-seqs",
-        "128",
-    ])
+def server(zephyr_lora_files):
+    with RemoteOpenAIServer([
+            "--model",
+            MODEL_NAME,
+            # use half precision for speed and memory savings in CI environment
+            "--dtype",
+            "bfloat16",
+            "--max-model-len",
+            "8192",
+            "--enforce-eager",
+            # lora config below
+            "--enable-lora",
+            "--lora-modules",
+            f"zephyr-lora={zephyr_lora_files}",
+            f"zephyr-lora2={zephyr_lora_files}",
+            "--max-lora-rank",
+            "64",
+            "--max-cpu-loras",
+            "2",
+            "--max-num-seqs",
+            "128",
+    ]) as remote_server:
+        yield remote_server
 
 
 @pytest.fixture(scope="module")
diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index d222981d2..6e5fdebe7 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -6,9 +6,6 @@ from typing import List
 import jsonschema
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
 import requests
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
@@ -16,7 +13,7 @@ from openai import BadRequestError
 
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -31,35 +28,29 @@ def zephyr_lora_files():
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files, ray_ctx):
-    return RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--enforce-eager",
-        # lora config below
-        "--enable-lora",
-        "--lora-modules",
-        f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_files}",
-        "--max-lora-rank",
-        "64",
-        "--max-cpu-loras",
-        "2",
-        "--max-num-seqs",
-        "128",
-    ])
+def server(zephyr_lora_files):
+    with RemoteOpenAIServer([
+            "--model",
+            MODEL_NAME,
+            # use half precision for speed and memory savings in CI environment
+            "--dtype",
+            "bfloat16",
+            "--max-model-len",
+            "8192",
+            "--enforce-eager",
+            # lora config below
+            "--enable-lora",
+            "--lora-modules",
+            f"zephyr-lora={zephyr_lora_files}",
+            f"zephyr-lora2={zephyr_lora_files}",
+            "--max-lora-rank",
+            "64",
+            "--max-cpu-loras",
+            "2",
+            "--max-num-seqs",
+            "128",
+    ]) as remote_server:
+        yield remote_server
 
 
 @pytest.fixture(scope="module")
diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py
index f8aa1c914..4a32aadc8 100644
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -3,33 +3,26 @@ import base64
 import numpy as np
 import openai
 import pytest
-import ray
 
-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
 
 EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def embedding_server(ray_ctx):
-    return RemoteOpenAIServer([
-        "--model",
-        EMBEDDING_MODEL_NAME,
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "bfloat16",
-        "--enforce-eager",
-        "--max-model-len",
-        "8192",
-        "--enforce-eager",
-    ])
+def embedding_server():
+    with RemoteOpenAIServer([
+            "--model",
+            EMBEDDING_MODEL_NAME,
+            # use half precision for speed and memory savings in CI environment
+            "--dtype",
+            "bfloat16",
+            "--enforce-eager",
+            "--max-model-len",
+            "8192",
+            "--enforce-eager",
+    ]) as remote_server:
+        yield remote_server
 
 
 @pytest.mark.asyncio
diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py
index 914ef6e19..bf63f9a81 100644
--- a/tests/entrypoints/openai/test_models.py
+++ b/tests/entrypoints/openai/test_models.py
@@ -1,12 +1,9 @@
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
 
-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -21,35 +18,29 @@ def zephyr_lora_files():
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files, ray_ctx):
-    return RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--enforce-eager",
-        # lora config below
-        "--enable-lora",
-        "--lora-modules",
-        f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_files}",
-        "--max-lora-rank",
-        "64",
-        "--max-cpu-loras",
-        "2",
-        "--max-num-seqs",
-        "128",
-    ])
+def server(zephyr_lora_files):
+    with RemoteOpenAIServer([
+            "--model",
+            MODEL_NAME,
+            # use half precision for speed and memory savings in CI environment
+            "--dtype",
+            "bfloat16",
+            "--max-model-len",
+            "8192",
+            "--enforce-eager",
+            # lora config below
+            "--enable-lora",
+            "--lora-modules",
+            f"zephyr-lora={zephyr_lora_files}",
+            f"zephyr-lora2={zephyr_lora_files}",
+            "--max-lora-rank",
+            "64",
+            "--max-cpu-loras",
+            "2",
+            "--max-num-seqs",
+            "128",
+    ]) as remote_server:
+        yield remote_server
 
 
 @pytest.fixture(scope="module")
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index b86971760..563b68566 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -3,7 +3,6 @@ from typing import Dict, List
 import openai
 import pytest
 import pytest_asyncio
-import ray
 
 from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64
 
@@ -23,25 +22,19 @@ TEST_IMAGE_URLS = [
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(ray_ctx):
-    return RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "4096",
-        "--enforce-eager",
-        "--chat-template",
-        str(LLAVA_CHAT_TEMPLATE),
-    ])
+def server():
+    with RemoteOpenAIServer([
+            "--model",
+            MODEL_NAME,
+            "--dtype",
+            "bfloat16",
+            "--max-model-len",
+            "4096",
+            "--enforce-eager",
+            "--chat-template",
+            str(LLAVA_CHAT_TEMPLATE),
+    ]) as remote_server:
+        yield remote_server
 
 
 @pytest.fixture(scope="module")
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index b2ebcc15c..a43f91325 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -6,7 +6,6 @@ from unittest.mock import MagicMock, patch
 
 import openai
 import pytest
-import ray
 import torch
 from tensorizer import EncryptionParams
 
@@ -22,7 +21,7 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
                                                          tensorize_vllm_model)
 
 from ..conftest import VllmRunner, cleanup
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer
 
 # yapf conflicts with isort for this docstring
 
@@ -220,23 +219,21 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
         json.dumps(model_loader_extra_config),
     ]
 
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
+    with RemoteOpenAIServer(openai_args) as server:
+        print("Server ready.")
 
-    server = RemoteOpenAIServer(openai_args)
-    print("Server ready.")
+        client = server.get_client()
+        completion = client.completions.create(model=model_ref,
+                                            prompt="Hello, my name is",
+                                            max_tokens=5,
+                                            temperature=0.0)
 
-    client = server.get_client()
-    completion = client.completions.create(model=model_ref,
-                                           prompt="Hello, my name is",
-                                           max_tokens=5,
-                                           temperature=0.0)
-
-    assert completion.id is not None
-    assert len(completion.choices) == 1
-    assert len(completion.choices[0].text) >= 5
-    assert completion.choices[0].finish_reason == "length"
-    assert completion.usage == openai.types.CompletionUsage(
-        completion_tokens=5, prompt_tokens=6, total_tokens=11)
+        assert completion.id is not None
+        assert len(completion.choices) == 1
+        assert len(completion.choices[0].text) >= 5
+        assert completion.choices[0].finish_reason == "length"
+        assert completion.usage == openai.types.CompletionUsage(
+            completion_tokens=5, prompt_tokens=6, total_tokens=11)
 
 
 def test_raise_value_error_on_invalid_load_format(vllm_runner):
@@ -282,7 +279,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
     base_model.model.llm_engine.model_executor.shutdown()
     del base_model
     cleanup()
-    ray.shutdown()
 
     # load model with two shards and serialize with encryption
     model_path = str(tmp_path / (model_ref + "-%02d.tensors"))
@@ -305,7 +301,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
     assert os.path.isfile(model_path % 0), "Serialization subprocess failed"
     assert os.path.isfile(model_path % 1), "Serialization subprocess failed"
     cleanup()
-    ray.shutdown()
 
     loaded_vllm_model = vllm_runner(
         model_ref,
diff --git a/tests/utils.py b/tests/utils.py
index ad4d097b0..50f723b0b 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -49,53 +49,7 @@ class RemoteOpenAIServer:
     DUMMY_API_KEY = "token-abc123"  # vLLM's OpenAI server does not need API key
     MAX_SERVER_START_WAIT_S = 600  # wait for server to start for 60 seconds
 
-    class _RemoteRunner:
-
-        def __init__(self, cli_args: List[str], *, wait_url: str,
-                     wait_timeout: float) -> None:
-            env = os.environ.copy()
-            env["PYTHONUNBUFFERED"] = "1"
-            self.proc = subprocess.Popen(
-                [
-                    sys.executable, "-m", "vllm.entrypoints.openai.api_server",
-                    *cli_args
-                ],
-                env=env,
-                stdout=sys.stdout,
-                stderr=sys.stderr,
-            )
-
-            self._wait_for_server(url=wait_url, timeout=wait_timeout)
-
-        def ready(self):
-            return True
-
-        def _wait_for_server(self, *, url: str, timeout: float):
-            # run health check
-            start = time.time()
-            while True:
-                try:
-                    if requests.get(url).status_code == 200:
-                        break
-                except Exception as err:
-                    if self.proc.poll() is not None:
-                        raise RuntimeError(
-                            "Server exited unexpectedly.") from err
-
-                    time.sleep(0.5)
-                    if time.time() - start > timeout:
-                        raise RuntimeError(
-                            "Server failed to start in time.") from err
-
-        def __del__(self):
-            if hasattr(self, "proc"):
-                self.proc.terminate()
-
-    def __init__(self,
-                 cli_args: List[str],
-                 *,
-                 auto_port: bool = True,
-                 num_gpus: int = 1) -> None:
+    def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None:
         if auto_port:
             if "-p" in cli_args or "--port" in cli_args:
                 raise ValueError("You have manually specified the port"
@@ -108,13 +62,41 @@ class RemoteOpenAIServer:
         self.host = str(args.host or 'localhost')
         self.port = int(args.port)
 
-        self._runner = ray.remote(num_gpus=num_gpus)(
-            self._RemoteRunner).remote(
-                cli_args,
-                wait_url=self.url_for("health"),
-                wait_timeout=self.MAX_SERVER_START_WAIT_S)
-
-        self._wait_until_ready()
+        env = os.environ.copy()
+        # the current process might initialize cuda,
+        # to be safe, we should use spawn method
+        env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
+        self.proc = subprocess.Popen(
+            [sys.executable, "-m", "vllm.entrypoints.openai.api_server"] +
+            cli_args,
+            env=env,
+            stdout=sys.stdout,
+            stderr=sys.stderr)
+        self._wait_for_server(url=self.url_for("health"),
+                              timeout=self.MAX_SERVER_START_WAIT_S)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.proc.terminate()
+
+    def _wait_for_server(self, *, url: str, timeout: float):
+        # run health check
+        start = time.time()
+        while True:
+            try:
+                if requests.get(url).status_code == 200:
+                    break
+            except Exception as err:
+                result = self.proc.poll()
+                if result is not None and result != 0:
+                    raise RuntimeError("Server exited unexpectedly.") from err
+
+                time.sleep(0.5)
+                if time.time() - start > timeout:
+                    raise RuntimeError(
+                        "Server failed to start in time.") from err
 
     @property
     def url_root(self) -> str:
@@ -123,9 +105,6 @@ class RemoteOpenAIServer:
     def url_for(self, *parts: str) -> str:
         return self.url_root + "/" + "/".join(parts)
 
-    def _wait_until_ready(self) -> None:
-        ray.get(self._runner.ready.remote())
-
     def get_client(self):
         return openai.OpenAI(
             base_url=self.url_for("v1"),
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 6e13264ab..388f934ef 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -224,16 +224,13 @@ class RayGPUExecutor(DistributedGPUExecutor):
         # broadcasted to.
         self.non_driver_workers: List[RayWorkerWrapper] = []
 
-        for pp_rank in range(self.parallel_config.pipeline_parallel_size):
-            for tp_rank in range(self.parallel_config.tensor_parallel_size):
-                rank = (pp_rank *
-                        self.parallel_config.tensor_parallel_size) + tp_rank
-                if rank == 0:
-                    pass
-                elif rank % self.parallel_config.tensor_parallel_size == 0:
-                    self.tp_driver_workers.append(self.workers[rank - 1])
-                else:
-                    self.non_driver_workers.append(self.workers[rank - 1])
+        for idx, rank in enumerate(worker_ranks[1:]):
+            # We need to skip the driver worker, which we
+            # do by skipping worker_ranks[0] which is always 0.
+            if rank % self.parallel_config.tensor_parallel_size == 0:
+                self.tp_driver_workers.append(self.workers[idx])
+            else:
+                self.non_driver_workers.append(self.workers[idx])
 
     def _driver_execute_model(
         self, execute_model_req: Optional[ExecuteModelRequest]
-- 
GitLab


From 9da4aad44b7878032ef2bb32eb1b4e1ab86f8351 Mon Sep 17 00:00:00 2001
From: Noam Gat <noamgat@gmail.com>
Date: Sat, 13 Jul 2024 13:09:12 +0300
Subject: [PATCH 349/376] Updating LM Format Enforcer version to v10.3 (#6411)

---
 requirements-common.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index 829e9a285..29643cfce 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -17,7 +17,7 @@ pillow  # Required for image processing
 prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
-lm-format-enforcer == 0.10.1
+lm-format-enforcer == 0.10.3
 outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
 typing_extensions
 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
-- 
GitLab


From babf52dade78ff3b1bea6cb6e9f4151dfd630251 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sat, 13 Jul 2024 06:21:37 -0400
Subject: [PATCH 350/376] [ Misc ] More Cleanup of Marlin (#6359)

Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
---
 .../run-lm-eval-gsm-vllm-baseline.sh          |  2 +-
 .../layers/quantization/gptq_marlin.py        | 78 ++++++++-----------
 .../layers/quantization/utils/marlin_utils.py | 12 +++
 3 files changed, 44 insertions(+), 48 deletions(-)

diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
index d68c6993e..1bddbd89e 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install lm-eval==0.4.2
+#   pip install lm-eval==0.4.3
 
 usage() {
     echo``
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 7b808f521..07a73d06e 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -10,8 +10,9 @@ from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    check_marlin_supported, marlin_make_empty_g_idx, marlin_make_workspace,
-    marlin_permute_scales, marlin_sort_g_idx, replace_tensor,
+    apply_marlin_linear, check_marlin_supported, marlin_is_k_full,
+    marlin_make_empty_g_idx, marlin_make_workspace, marlin_permute_scales,
+    marlin_repeat_scales_on_all_ranks, marlin_sort_g_idx, replace_tensor,
     verify_marlin_supported, verify_marlin_supports_shape)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 
@@ -145,6 +146,7 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
     ) -> None:
         del output_size
         output_size_per_partition = sum(output_partition_sizes)
+        is_row_parallel = input_size != input_size_per_partition
 
         # Normalize group_size
         if self.quant_config.group_size != -1:
@@ -158,32 +160,19 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
             input_size=input_size,
             group_size=group_size)
 
-        # Detect sharding of scales/zp
-
-        # By default, no sharding over "input dim"
-        scales_and_zp_size = input_size // group_size
-        scales_and_zp_input_dim = None
-
-        if self.quant_config.desc_act:
-            # Act-order case
-            assert self.quant_config.group_size != -1
-
-            is_k_full = input_size_per_partition == input_size
-
+        # Determine sharding
+        if marlin_repeat_scales_on_all_ranks(self.quant_config.desc_act,
+                                             self.quant_config.group_size,
+                                             is_row_parallel):
+            # By setting scale_dim == None, weight_loader will
+            # repeat the scales on each GPU in TP>1 case.
+            scales_and_zp_input_dim = None
+            scales_and_zp_size = input_size // group_size
         else:
-            # No act-order case
-
-            # K is always full due to full alignment with
-            # group-size and shard of scales/zp
-            is_k_full = True
-
-            # If this is a row-parallel case, then shard scales/zp
-            if (input_size != input_size_per_partition
-                    and self.quant_config.group_size != -1):
-                scales_and_zp_size = input_size_per_partition // group_size
-                scales_and_zp_input_dim = 0
-
-        # Init buffers
+            # By setting scale_dim == 0, weight_loader will
+            # shard the scales in TP>1 case.
+            scales_and_zp_input_dim = 0
+            scales_and_zp_size = input_size_per_partition // group_size
 
         # Quantized weights
         qweight = Parameter(
@@ -268,13 +257,15 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
         layer.input_size_per_partition = input_size_per_partition
         layer.output_size_per_partition = output_size_per_partition
         layer.input_size = input_size
-        layer.is_k_full = is_k_full
+        layer.is_k_full = marlin_is_k_full(self.quant_config.desc_act,
+                                           is_row_parallel)
 
     # Checkpoints are serialized in AutoGPTQ format, which is different from the
     # marlin format. This function is called after the weights are loaded.
     # Here, we handle the repacking, including the activation reordering case.
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         device = layer.qweight.device
+
         # Allocate marlin workspace
         layer.workspace = marlin_make_workspace(
             layer.output_size_per_partition, device)
@@ -312,22 +303,15 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        reshaped_x = x.reshape(-1, x.shape[-1])
-        out_shape = x.shape[:-1] + (layer.output_size_per_partition, )
-
-        output = ops.gptq_marlin_gemm(reshaped_x,
-                                      layer.qweight,
-                                      layer.scales,
-                                      g_idx=layer.g_idx,
-                                      perm=layer.g_idx_sort_indices,
-                                      workspace=layer.workspace,
-                                      num_bits=self.quant_config.weight_bits,
-                                      size_m=reshaped_x.shape[0],
-                                      size_n=layer.output_size_per_partition,
-                                      size_k=layer.input_size_per_partition,
-                                      is_k_full=layer.is_k_full)
-
-        if bias is not None:
-            output.add_(bias)  # In-place add
-
-        return output.reshape(out_shape)
+        return apply_marlin_linear(
+            input=x,
+            weight=layer.qweight,
+            weight_scale=layer.scales,
+            g_idx=layer.g_idx,
+            g_idx_sort_indices=layer.g_idx_sort_indices,
+            workspace=layer.workspace,
+            num_bits=self.quant_config.weight_bits,
+            output_size_per_partition=layer.output_size_per_partition,
+            input_size_per_partition=layer.input_size_per_partition,
+            is_k_full=layer.is_k_full,
+            bias=bias)
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index 612c5fd20..764f0a6f3 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -91,6 +91,18 @@ def marlin_make_workspace(output_size_per_partition: int,
                        requires_grad=False)
 
 
+def marlin_is_k_full(act_order: bool, is_row_parallel: bool) -> bool:
+    return (not act_order) or (act_order and not is_row_parallel)
+
+
+def marlin_repeat_scales_on_all_ranks(act_order: bool, group_size: int,
+                                      is_row_parallel: bool) -> bool:
+    # Need to repeat scales on every rank if act_ordering or
+    # channelwise and RowParallelLinear
+    is_channelwise = group_size == -1
+    return act_order or (is_channelwise and is_row_parallel)
+
+
 def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
     return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
                               requires_grad=False)
-- 
GitLab


From eeceadaecc80cc51c4e9ddae0cb99a33d379452d Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Sat, 13 Jul 2024 11:52:22 -0700
Subject: [PATCH 351/376] [Misc] Add deprecation warning for beam search
 (#6402)

---
 vllm/envs.py            |  5 +++++
 vllm/sampling_params.py | 12 ++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/vllm/envs.py b/vllm/envs.py
index c624510c7..5b4a2010d 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -41,6 +41,7 @@ if TYPE_CHECKING:
     NVCC_THREADS: Optional[str] = None
     VLLM_USE_PRECOMPILED: bool = False
     VLLM_INSTALL_PUNICA_KERNELS: bool = False
+    VLLM_NO_DEPRECATION_WARNING: bool = False
     CMAKE_BUILD_TYPE: Optional[str] = None
     VERBOSE: bool = False
 
@@ -251,6 +252,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     lambda: os.getenv("VLLM_XLA_CACHE_PATH", "~/.vllm/xla_cache/"),
     "VLLM_FUSED_MOE_CHUNK_SIZE":
     lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "65536")),
+
+    # If set, vllm will skip the deprecation warnings.
+    "VLLM_NO_DEPRECATION_WARNING":
+    lambda: bool(int(os.getenv("VLLM_NO_DEPRECATION_WARNING", "0"))),
 }
 
 # end-env-vars-definition
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index a2caae21a..90f0944a7 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -8,6 +8,11 @@ import torch
 from pydantic import Field
 from typing_extensions import Annotated
 
+import vllm.envs as envs
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
 _SAMPLING_EPS = 1e-5
 
 
@@ -184,6 +189,13 @@ class SamplingParams:
 
         self._verify_args()
         if self.use_beam_search:
+            if not envs.VLLM_NO_DEPRECATION_WARNING:
+                logger.warning(
+                    "[IMPORTANT] We plan to discontinue the support for beam "
+                    "search in the next major release. Please refer to "
+                    "https://github.com/vllm-project/vllm/issues/6226 for "
+                    "more information. Set VLLM_NO_DEPRECATION_WARNING=1 to "
+                    "suppress this warning.")
             self._verify_beam_search()
         else:
             self._verify_non_beam_search()
-- 
GitLab


From fb6af8bc086328ca6659e72d11ffd4309ce4de22 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sat, 13 Jul 2024 23:03:58 -0400
Subject: [PATCH 352/376] [ Misc ] Apply MoE Refactor to Deepseekv2 To Support
 Fp8 (#6417)

---
 .../configs/DeepSeek-V2-Lite-Chat.yaml        |  11 ++
 .../lm-eval-harness/configs/models-large.txt  |   1 +
 .../run-lm-eval-gsm-vllm-baseline.sh          |   2 +-
 .../layers/fused_moe/fused_moe.py             |  36 +++--
 vllm/model_executor/layers/fused_moe/layer.py |  93 ++++++++++--
 .../model_executor/layers/quantization/fp8.py |  10 +-
 vllm/model_executor/models/deepseek_v2.py     | 142 +++++++++---------
 vllm/model_executor/models/mixtral.py         |  32 +---
 vllm/model_executor/models/qwen2_moe.py       |  33 ++--
 9 files changed, 223 insertions(+), 137 deletions(-)
 create mode 100644 .buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml

diff --git a/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
new file mode 100644
index 000000000..15268395e
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
@@ -0,0 +1,11 @@
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
+model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.671
+  - name: "exact_match,flexible-extract"
+    value: 0.664
+limit: 1000
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt
index 2007dd2e1..94b15a872 100644
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@@ -1,3 +1,4 @@
 Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
+DeepSeek-V2-Lite-Chat.yaml
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
index 1bddbd89e..dbb21be4f 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
 done
 
 lm_eval --model vllm \
-  --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,distributed_executor_backend="ray" \
+  --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,distributed_executor_backend="ray",trust_remote_code=true \
   --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
   --batch_size $BATCH_SIZE
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 3c62008fb..413c0b6d0 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -394,14 +394,16 @@ def fused_topk(
 
 
 # This is used by the Deepseek-V2 model
-def grouped_topk(
-    hidden_states: torch.Tensor,
-    gating_output: torch.Tensor,
-    topk: int,
-    renormalize: bool,
-    num_expert_group: int = 0,
-    topk_group: int = 0,
-):
+def grouped_topk(hidden_states: torch.Tensor,
+                 gating_output: torch.Tensor,
+                 topk: int,
+                 renormalize: bool,
+                 num_expert_group: int = 0,
+                 topk_group: int = 0):
+
+    assert hidden_states.shape[0] == gating_output.shape[0], (
+        "Number of tokens mismatch")
+
     scores = torch.softmax(gating_output, dim=-1)
     num_token = scores.shape[0]
     group_scores = scores.view(num_token, num_expert_group,
@@ -557,6 +559,9 @@ def fused_moe(
     renormalize: bool,
     inplace: bool = False,
     override_config: Optional[Dict[str, Any]] = None,
+    use_grouped_topk: bool = False,
+    num_expert_group: Optional[int] = None,
+    topk_group: Optional[int] = None,
     use_fp8: bool = False,
     w1_scale: Optional[torch.Tensor] = None,
     w2_scale: Optional[torch.Tensor] = None,
@@ -579,6 +584,10 @@ def fused_moe(
         Defaults to False.
     - override_config (Optional[Dict[str, Any]]): Optional override
         for the kernel configuration.
+    - num_expert_group: Optional[int]: additional parameter for grouped_topk
+    - topk_group: Optional[int]: additional parameter for grouped_topk
+    - use_grouped_topk: If True, use grouped_topk instead of fused_topk
+        note: Deepseekv2 model uses grouped_topk
     - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner
         products for w1 and w2. Defaults to False.
     - w1_scale (Optional[torch.Tensor]): Optional scale to be used for
@@ -592,8 +601,15 @@ def fused_moe(
     # Check constraints.
     assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch"
 
-    topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
-                                        renormalize)
+    if use_grouped_topk:
+        assert num_expert_group is not None and topk_group is not None
+        topk_weights, topk_ids = grouped_topk(hidden_states, gating_output,
+                                              topk, renormalize,
+                                              num_expert_group, topk_group)
+    else:
+        topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
+                                            renormalize)
+
     return fused_experts(hidden_states,
                          w1,
                          w2,
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 73cfcd7fc..3904f3e3d 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1,5 +1,5 @@
 from abc import abstractmethod
-from typing import Optional
+from typing import List, Optional, Tuple
 
 import torch
 
@@ -29,7 +29,10 @@ class FusedMoEMethodBase(QuantizeMethodBase):
               x: torch.Tensor,
               router_logits: torch.Tensor,
               top_k: int,
-              renormalize: bool = True) -> torch.Tensor:
+              renormalize: bool = True,
+              use_grouped_topk: bool = False,
+              num_expert_group: Optional[int] = None,
+              topk_group: Optional[int] = None) -> torch.Tensor:
         raise NotImplementedError
 
 
@@ -63,7 +66,10 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase):
               x: torch.Tensor,
               router_logits: torch.Tensor,
               top_k: int,
-              renormalize: bool = True) -> torch.Tensor:
+              renormalize: bool = True,
+              use_grouped_topk: bool = False,
+              num_expert_group: Optional[int] = None,
+              topk_group: Optional[int] = None) -> torch.Tensor:
 
         return fused_moe(x,
                          layer.w13_weight,
@@ -71,7 +77,10 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase):
                          router_logits,
                          top_k,
                          renormalize=renormalize,
-                         inplace=True)
+                         inplace=True,
+                         use_grouped_topk=use_grouped_topk,
+                         num_expert_group=num_expert_group,
+                         topk_group=topk_group)
 
 
 class FusedMoE(torch.nn.Module):
@@ -104,6 +113,9 @@ class FusedMoE(torch.nn.Module):
         params_dtype: Optional[torch.dtype] = None,
         reduce_results: bool = False,
         renormalize: bool = True,
+        use_grouped_topk: bool = False,
+        num_expert_group: Optional[int] = None,
+        topk_group: Optional[int] = None,
         quant_config: Optional[QuantizationConfig] = None,
         tp_size: Optional[int] = None,
     ):
@@ -119,6 +131,11 @@ class FusedMoE(torch.nn.Module):
         self.intermediate_size_per_partition = intermediate_size // self.tp_size
         self.reduce_results = reduce_results
         self.renormalize = renormalize
+        self.use_grouped_topk = use_grouped_topk
+        if self.use_grouped_topk:
+            assert num_expert_group is not None and topk_group is not None
+        self.num_expert_group = num_expert_group
+        self.topk_group = topk_group
 
         if quant_config is None:
             self.quant_method: Optional[QuantizeMethodBase] = (
@@ -140,9 +157,8 @@ class FusedMoE(torch.nn.Module):
                       shard_id: int, expert_id: int):
         param_data = param.data
 
-        # FIXME(robertgshaw2-neuralmagic): Overfit to Mixtral.
-        # Follow up PR to enable fp8 for other MoE models.
-        if "input_scale" in weight_name or "w2.weight_scale" in weight_name:
+        # Input scales can be loaded directly and should be equal.
+        if "input_scale" in weight_name:
             if param_data[expert_id] != 1 and (param_data[expert_id] -
                                                loaded_weight).abs() > 1e-5:
                 raise ValueError(
@@ -150,14 +166,21 @@ class FusedMoE(torch.nn.Module):
                     f"must be equal. But got {param_data[expert_id]} "
                     f"vs. {loaded_weight}")
             param_data[expert_id] = loaded_weight
-        # FIXME(robertgshaw2-neuralmagic): Overfit to Mixtral.
-        # Follow up PR to enable fp8 for other MoE models.
+        # Weight scales
         elif "weight_scale" in weight_name:
-            # We have to keep the weight scales of w1 and w3 because
-            # we need to re-quantize w1/w3 weights after weight loading.
-            assert "w1" in weight_name or "w3" in weight_name
-            shard_id = 0 if "w1" in weight_name else 1
-            param_data[expert_id][shard_id] = loaded_weight
+            # If we are in merged column case (gate_up_proj)
+            #   shard_id 0 == gate_proj / w1
+            #   shard_id 2 == up_proj / w3
+            if shard_id == 0 or shard_id == 2:
+                # We have to keep the weight scales of w1 and w3 because
+                # we need to re-quantize w1/w3 weights after weight loading.
+                idx = 0 if shard_id == 0 else 1
+                param_data[expert_id][idx] = loaded_weight
+            # If we are in the row parallel case (down_proj)
+            #   shard_id 1 == down_proj / w2
+            else:
+                param_data[expert_id] = loaded_weight
+        # Weights
         else:
             tp_rank = get_tensor_model_parallel_rank()
             shard_size = self.intermediate_size_per_partition
@@ -188,10 +211,50 @@ class FusedMoE(torch.nn.Module):
             x=hidden_states,
             router_logits=router_logits,
             top_k=self.top_k,
-            renormalize=self.renormalize)
+            renormalize=self.renormalize,
+            use_grouped_topk=self.use_grouped_topk,
+            num_expert_group=self.num_expert_group,
+            topk_group=self.topk_group)
 
         if self.reduce_results and self.tp_size > 1:
             final_hidden_states = tensor_model_parallel_all_reduce(
                 final_hidden_states)
 
         return final_hidden_states
+
+    @classmethod
+    def make_expert_params_mapping(
+            cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str,
+            ckpt_up_proj_name: str,
+            num_experts: int) -> List[Tuple[str, str, int, int]]:
+
+        gate_up = [ckpt_gate_proj_name, ckpt_up_proj_name]
+        gate_down_up = [
+            ckpt_gate_proj_name, ckpt_down_proj_name, ckpt_up_proj_name
+        ]
+
+        return [
+            # These are the weight scales for the experts
+            # (param_name, weight_name, expert_id, shard_id)
+            ("experts.w13_scale"
+             if weight_name in gate_up else "experts.w2_scale",
+             f"experts.{expert_id}.{weight_name}.weight_scale", expert_id,
+             shard_id) for expert_id in range(num_experts)
+            for shard_id, weight_name in enumerate(gate_down_up)
+        ] + [
+            # These are the weights for the experts
+            # (param_name, weight_name, expert_id, shard_id)
+            ("experts.w13_weight"
+             if weight_name in gate_up else "experts.w2_weight",
+             f"experts.{expert_id}.{weight_name}.weight", expert_id, shard_id)
+            for expert_id in range(num_experts)
+            for shard_id, weight_name in enumerate(gate_down_up)
+        ] + [
+            # These are the weight scales for the experts
+            # (param_name, weight_name, expert_id, shard_id)
+            ("experts.a13_scale"
+             if weight_name in gate_up else "experts.a2_scale",
+             f"experts.{expert_id}.{weight_name}.input_scale", expert_id,
+             shard_id) for expert_id in range(num_experts)
+            for shard_id, weight_name in enumerate(gate_down_up)
+        ]
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 0c2d2bd3f..5c916c9b4 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -377,7 +377,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
               x: torch.Tensor,
               router_logits: torch.Tensor,
               top_k: int,
-              renormalize: bool = True) -> torch.Tensor:
+              renormalize: bool = True,
+              use_grouped_topk: bool = False,
+              num_expert_group: Optional[int] = None,
+              topk_group: Optional[int] = None) -> torch.Tensor:
 
         return fused_moe(x,
                          layer.w13_weight,
@@ -390,7 +393,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                          w1_scale=layer.w13_scale,
                          w2_scale=layer.w2_scale,
                          a1_scale=layer.a13_scale,
-                         a2_scale=layer.a2_scale)
+                         a2_scale=layer.a2_scale,
+                         use_grouped_topk=use_grouped_topk,
+                         num_expert_group=num_expert_group,
+                         topk_group=topk_group)
 
 
 class Fp8KVCacheMethod(QuantizeMethodBase):
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index fb4097fd1..2d12ceb7f 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -29,11 +29,10 @@ from transformers import PretrainedConfig
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig
-from vllm.distributed import (get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
+from vllm.distributed import (get_tensor_model_parallel_world_size,
                               tensor_model_parallel_all_reduce)
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import fused_experts, grouped_topk
+from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                MergedColumnParallelLinear,
@@ -91,32 +90,34 @@ class DeepseekV2MoE(nn.Module):
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
-        self.config = config
-        self.rank = get_tensor_model_parallel_rank()
         self.tp_size = get_tensor_model_parallel_world_size()
-        self.n_routed_experts = config.n_routed_experts
-        self.top_k = config.num_experts_per_tok
         self.routed_scaling_factor = config.routed_scaling_factor
-        if self.tp_size > self.n_routed_experts:
+        self.n_shared_experts = config.n_shared_experts
+        self.routed_scaling_factor = config.routed_scaling_factor
+        if self.tp_size > config.n_routed_experts:
             raise ValueError(
                 f"Tensor parallel size {self.tp_size} is greater than "
-                f"the number of experts {self.n_routed_experts}.")
-
-        self.experts = nn.ModuleList([
-            DeepseekV2MLP(hidden_size=config.hidden_size,
-                          intermediate_size=config.moe_intermediate_size,
-                          hidden_act=config.hidden_act,
-                          quant_config=quant_config,
-                          reduce_results=False)
-            for idx in range(self.n_routed_experts)
-        ])
-        self.pack_params()
+                f"the number of experts {config.n_routed_experts}.")
+
+        if config.hidden_act != "silu":
+            raise ValueError(f"Unsupported activation: {config.hidden_act}. "
+                             "Only silu is supported for now.")
+
+        self.experts = FusedMoE(num_experts=config.n_routed_experts,
+                                top_k=config.num_experts_per_tok,
+                                hidden_size=config.hidden_size,
+                                intermediate_size=config.moe_intermediate_size,
+                                reduce_results=False,
+                                renormalize=config.norm_topk_prob,
+                                quant_config=quant_config,
+                                use_grouped_topk=True,
+                                num_expert_group=config.n_group,
+                                topk_group=config.topk_group)
 
         self.gate = ReplicatedLinear(config.hidden_size,
-                                     self.n_routed_experts,
+                                     config.n_routed_experts,
                                      bias=False,
                                      quant_config=None)
-
         if config.n_shared_experts is not None:
             intermediate_size = (config.moe_intermediate_size *
                                  config.n_shared_experts)
@@ -128,50 +129,21 @@ class DeepseekV2MoE(nn.Module):
                 reduce_results=False,
             )
 
-    def pack_params(self):
-        w1 = []
-        w2 = []
-        for expert in self.experts:
-            w1.append(expert.gate_up_proj.weight)
-            w2.append(expert.down_proj.weight)
-        self.w1 = torch._utils._flatten_dense_tensors(w1)
-        w1s = torch._utils._unflatten_dense_tensors(self.w1, w1)
-        for data, param in zip(w1s, w1):
-            param.data = data
-        self.w1 = self.w1.view(len(w1), *w1s[0].shape)
-
-        self.w2 = torch._utils._flatten_dense_tensors(w2)
-        w2s = torch._utils._unflatten_dense_tensors(self.w2, w2)
-        for data, param in zip(w2s, w2):
-            param.data = data
-
-        self.w2 = self.w2.view(len(w2), *w2s[0].shape)
-
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
-        if self.config.n_shared_experts is not None:
+        if self.n_shared_experts is not None:
             shared_output = self.shared_experts(hidden_states)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        topk_weights, topk_ids = grouped_topk(
-            hidden_states,
-            router_logits,
-            self.top_k,
-            renormalize=self.config.norm_topk_prob,
-            num_expert_group=self.config.n_group,
-            topk_group=self.config.topk_group)
-        final_hidden_states = fused_experts(
-            hidden_states,
-            self.w1,
-            self.w2,
-            topk_weights,
-            topk_ids,
-            inplace=True) * self.routed_scaling_factor
-        if self.config.n_shared_experts is not None:
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states,
+            router_logits=router_logits) * self.routed_scaling_factor
+        if shared_output is not None:
             final_hidden_states = final_hidden_states + shared_output
-        final_hidden_states = tensor_model_parallel_all_reduce(
-            final_hidden_states)
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(
+                final_hidden_states)
 
         return final_hidden_states.view(num_tokens, hidden_dim)
 
@@ -504,34 +476,58 @@ class DeepseekV2ForCausalLM(nn.Module):
             ("gate_up_proj", "up_proj", 1),
         ]
 
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts)
+
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if (("mlp.experts." in name) and name not in params_dict):
+                    continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
-                # Skip experts that are not assigned to this worker.
-                if (("mlp.experts." in name or "mlp.shared_experts." in name)
-                        and name not in params_dict):
-                    continue
                 param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Skip experts that are not assigned to this worker.
-                if (("mlp.experts." in name or "mlp.shared_experts." in name)
-                        and name not in params_dict):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param, loaded_weight)
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param,
+                                  loaded_weight,
+                                  weight_name,
+                                  shard_id=shard_id,
+                                  expert_id=expert_id)
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+                    weight_loader(param, loaded_weight)
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index e5bd58a9e..0c456ada6 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -372,31 +372,13 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA):
             ("qkv_proj", "v_proj", "v"),
         ]
 
-        expert_params_mapping = [
-            # These are the weight scales for the experts
-            # (param_name, weight_name, expert_id, shard_id)
-            ("experts.w13_scale"
-             if weight_name in ["w1", "w3"] else "experts.w2_scale",
-             f"experts.{expert_id}.{weight_name}.weight_scale", expert_id,
-             shard_id) for expert_id in range(self.config.num_local_experts)
-            for shard_id, weight_name in enumerate(["w1", "w2", "w3"])
-        ] + [
-            # These are the weights for the experts
-            # (param_name, weight_name, expert_id)
-            ("experts.w13_weight"
-             if weight_name in ["w1", "w3"] else "experts.w2_weight",
-             f"experts.{expert_id}.{weight_name}.weight", expert_id, shard_id)
-            for expert_id in range(self.config.num_local_experts)
-            for shard_id, weight_name in enumerate(["w1", "w2", "w3"])
-        ] + [
-            # These are the activation scales for the experts
-            # (param_name, weight_name, expert_id)
-            ("experts.a13_scale"
-             if weight_name in ["w1", "w3"] else "experts.a2_scale",
-             f"experts.{expert_id}.{weight_name}.input_scale", expert_id,
-             shard_id) for expert_id in range(self.config.num_local_experts)
-            for shard_id, weight_name in enumerate(["w1", "w2", "w3"])
-        ]
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="w1",
+            ckpt_down_proj_name="w2",
+            ckpt_up_proj_name="w3",
+            num_experts=self.config.num_local_experts)
 
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index 7b18b5e04..2cc2f1440 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -50,6 +50,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors, SamplerOutput
+from vllm.utils import print_warning_once
 
 
 class Qwen2MoeMLP(nn.Module):
@@ -406,15 +407,13 @@ class Qwen2MoeForCausalLM(nn.Module):
             ("gate_up_proj", "up_proj", 1),
         ]
 
-        expert_params_mapping = [
-            # These are the weights for the experts
-            # (param_name, weight_name, expert_id, shard_id)
-            ("experts.w13_weight" if weight_name in ["gate_proj", "up_proj"]
-             else "experts.w2_weight",
-             f"experts.{expert_id}.{weight_name}.weight", expert_id, shard_id)
-            for expert_id in range(self.config.num_experts) for shard_id,
-            weight_name in enumerate(["gate_proj", "down_proj", "up_proj"])
-        ]
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts)
 
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
@@ -461,8 +460,20 @@ class Qwen2MoeForCausalLM(nn.Module):
                     # Skip loading extra bias for GPTQ models.
                     if name.endswith(".bias") and name not in params_dict:
                         continue
-                    if name not in params_dict:
-                        continue
+                    # Remapping the name of FP8 kv-scale.
+                    if name.endswith("kv_scale"):
+                        remapped_kv_scale_name = name.replace(
+                            ".kv_scale", ".attn.kv_scale")
+                        if remapped_kv_scale_name not in params_dict:
+                            print_warning_once(
+                                "Found kv scale in the checkpoint "
+                                f"(e.g. {name}), but not found the expected "
+                                f"name in the model "
+                                f"(e.g. {remapped_kv_scale_name}). "
+                                "kv-scale is not loaded.")
+                            continue
+                        else:
+                            name = remapped_kv_scale_name
 
                     param = params_dict[name]
                     weight_loader = getattr(param, "weight_loader",
-- 
GitLab


From 540c0368b14ddd8d3efac0b182761bf6600f104f Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 14 Jul 2024 13:27:14 +0800
Subject: [PATCH 353/376] [Model] Initialize Fuyu-8B support (#3924)

Co-authored-by: Roger Wang <ywang@roblox.com>
---
 docs/source/models/supported_models.rst |   8 +
 examples/fuyu_example.py                |  31 +++
 tests/models/test_fuyu.py               | 142 ++++++++++
 vllm/model_executor/models/__init__.py  |   2 +
 vllm/model_executor/models/fuyu.py      | 328 +++++++++++++++++++++++
 vllm/model_executor/models/persimmon.py | 333 ++++++++++++++++++++++++
 6 files changed, 844 insertions(+)
 create mode 100644 examples/fuyu_example.py
 create mode 100644 tests/models/test_fuyu.py
 create mode 100644 vllm/model_executor/models/fuyu.py
 create mode 100644 vllm/model_executor/models/persimmon.py

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index f56679c3c..50cae1041 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -137,6 +137,10 @@ Decoder-only Language Models
     - Phi-3-Small
     - :code:`microsoft/Phi-3-small-8k-instruct`, :code:`microsoft/Phi-3-small-128k-instruct`, etc.
     -
+  * - :code:`PersimmonForCausalLM`
+    - Persimmon
+    - :code:`adept/persimmon-8b-base`, :code:`adept/persimmon-8b-chat`, etc.
+    - 
   * - :code:`QWenLMHeadModel`
     - Qwen
     - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
@@ -178,6 +182,10 @@ Vision Language Models
     - Models
     - Example HuggingFace Models
     - :ref:`LoRA <lora>`
+  * - :code:`FuyuForCausalLM`
+    - Fuyu
+    - :code:`adept/fuyu-8b` etc.
+    - 
   * - :code:`LlavaForConditionalGeneration`
     - LLaVA-1.5
     - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc.
diff --git a/examples/fuyu_example.py b/examples/fuyu_example.py
new file mode 100644
index 000000000..c92b8fb4b
--- /dev/null
+++ b/examples/fuyu_example.py
@@ -0,0 +1,31 @@
+import requests
+from PIL import Image
+
+from vllm import LLM, SamplingParams
+
+
+def run_fuyu():
+    llm = LLM(model="adept/fuyu-8b", max_model_len=4096)
+
+    # single-image prompt
+    prompt = "What is the highest life expectancy at of male?\n"
+    url = "https://huggingface.co/adept/fuyu-8b/resolve/main/chart.png"
+    image = Image.open(requests.get(url, stream=True).raw)
+    sampling_params = SamplingParams(temperature=0, max_tokens=64)
+
+    outputs = llm.generate(
+        {
+            "prompt": prompt,
+            "multi_modal_data": {
+                "image": image
+            },
+        },
+        sampling_params=sampling_params)
+
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+
+
+if __name__ == "__main__":
+    run_fuyu()
diff --git a/tests/models/test_fuyu.py b/tests/models/test_fuyu.py
new file mode 100644
index 000000000..672470acb
--- /dev/null
+++ b/tests/models/test_fuyu.py
@@ -0,0 +1,142 @@
+from typing import List, Optional, Tuple, Type
+
+import pytest
+
+from vllm.multimodal.utils import rescale_image_size
+from vllm.sequence import SampleLogprobs
+from vllm.utils import is_cpu
+
+from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from .utils import check_logprobs_close
+
+pytestmark = pytest.mark.vlm
+
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+    "stop_sign": "What's the content of the image?\n",  # noqa: E501
+    "cherry_blossom": "What is the season?\n",
+    "boardwalk": "What's in this image?\n",
+})
+
+models = ["adept/fuyu-8b"]
+
+
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
+                                         Optional[SampleLogprobs]]):
+    """Sanitize vllm output to be comparable with hf output."""
+    output_ids, output_str, out_logprobs = vllm_output
+
+    hf_output_str = output_str.lstrip() + "|ENDOFTEXT|"
+
+    return output_ids, hf_output_str, out_logprobs
+
+
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    image_assets: _ImageAssets,
+    model: str,
+    *,
+    size_factors: List[float],
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    """Inference result should be the same between hf and vllm.
+
+    All the image fixtures for the test is under tests/images.
+    For huggingface runner, we provide the PIL images as input.
+    For vllm runner, we provide MultiModalDataDict objects 
+    and corresponding vision language config as input.
+    Note, the text input is also adjusted to abide by vllm contract.
+    The text output is sanitized to be able to compare with hf.
+    """
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_image = [(
+        [prompt for _ in size_factors],
+        [rescale_image_size(image, factor) for factor in size_factors],
+    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+
+    # max_model_len should be greater than image_feature_size
+    with vllm_runner(model,
+                     max_model_len=2560,
+                     max_num_seqs=1,
+                     dtype=dtype,
+                     tensor_parallel_size=tensor_parallel_size,
+                     distributed_executor_backend=distributed_executor_backend,
+                     enforce_eager=True) as vllm_model:
+        vllm_outputs_per_image = [
+            vllm_model.generate_greedy_logprobs(prompts,
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                images=vllm_images)
+            for prompts, vllm_images in inputs_per_image
+        ]
+
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_model.model.get_output_embeddings = lambda: \
+            hf_model.model.language_model.get_output_embeddings()
+        eos_token_id = hf_model.processor.tokenizer.eos_token_id
+        hf_outputs_per_image = [
+            hf_model.generate_greedy_logprobs_limit(prompts,
+                                                    max_tokens,
+                                                    num_logprobs=num_logprobs,
+                                                    images=hf_images,
+                                                    eos_token_id=eos_token_id)
+            for prompts, hf_images in inputs_per_image
+        ]
+
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
+                                        vllm_outputs_per_image):
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=[
+                vllm_to_hf_output(vllm_output) for vllm_output in vllm_outputs
+            ],
+            name_0="hf",
+            name_1="vllm",
+        )
+
+
+target_dtype = "half"
+if is_cpu():
+    target_dtype = "bfloat16"
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No image
+        [],
+        # Single-scale
+        [0.25],
+        # Single-scale, batched
+        [0.25, 0.25, 0.25],
+        # Multi-scale
+        [0.25, 0.2, 0.15],
+    ],
+)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [10])
+def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
+                dtype: str, max_tokens: int, num_logprobs: int) -> None:
+    run_test(
+        hf_runner,
+        vllm_runner,
+        image_assets,
+        model,
+        size_factors=size_factors,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=1,
+    )
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 096e3f472..87508a116 100644
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -23,6 +23,7 @@ _GENERATION_MODELS = {
     "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
     "DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"),
     "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
+    "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
     "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
     "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"),
     "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
@@ -49,6 +50,7 @@ _GENERATION_MODELS = {
     "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
     "OPTForCausalLM": ("opt", "OPTForCausalLM"),
     "OrionForCausalLM": ("orion", "OrionForCausalLM"),
+    "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"),
     "PaliGemmaForConditionalGeneration":
     ("paligemma", "PaliGemmaForConditionalGeneration"),
     "PhiForCausalLM": ("phi", "PhiForCausalLM"),
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
new file mode 100644
index 000000000..fdea8ee30
--- /dev/null
+++ b/vllm/model_executor/models/fuyu.py
@@ -0,0 +1,328 @@
+# coding=utf-8
+# adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/fuyu/modeling_fuyu.py
+# Copyright 2023 The vLLM team.
+# Copyright 2023 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Fuyu model."""
+import math
+from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from PIL import Image
+from transformers import FuyuConfig, FuyuImageProcessor
+
+from vllm.attention import AttentionMetadata
+from vllm.config import CacheConfig, MultiModalConfig
+from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
+from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import ColumnParallelLinear
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.persimmon import PersimmonForCausalLM
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.image import (cached_get_image_processor,
+                                   cached_get_tokenizer)
+from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData
+
+from .interfaces import SupportsVision
+from .utils import merge_vision_embeddings
+
+logger = init_logger(__name__)
+
+# Cannot find the following 2 numbers from hf config.
+_IMAGE_TOKEN_ID = 71011
+_NEWLINE_TOKEN_ID = 71019
+
+MAX_IMAGE_FEATURE_SIZE_HEIGHT = 1080
+MAX_IMAGE_FEATURE_SIZE_WIDTH = 1920
+
+
+class FuyuImagePixelInputs(TypedDict):
+    type: Literal["pixel_values"]
+    data: torch.Tensor
+    """
+    Shape: 
+    (batch_size, num_patches, patch_size_x * patch_size_y * num_channels)
+    """
+
+
+def _calculate_num_image_tokens(
+    height: int,
+    width: int,
+) -> Tuple[int, int]:
+    """
+    calculate number of image tokens needed for a given image size
+    The expected Fuyu image prompts is in format:
+        (image_token * ncols + newline_token) * nrows
+    args:
+        image_size: Tuple[int, int] - (width, height) of the image
+    returns:
+        ncols: int - number of image tokens in x direction
+        nrows: int - number of image tokens in y direction
+    """
+    ncol = math.ceil(width / 30)
+    nrow = math.ceil(height / 30)
+    return ncol, nrow
+
+
+def get_max_fuyu_image_feature_size():
+
+    return _calculate_num_image_tokens(
+        height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
+        width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
+    )
+
+
+def get_max_fuyu_image_tokens(ctx: InputContext):
+    ncol, nrow = get_max_fuyu_image_feature_size()
+    return (ncol + 1) * nrow
+
+
+def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int):
+    ncol, nrow = get_max_fuyu_image_feature_size()
+    image_feature_size = get_max_fuyu_image_tokens(ctx)
+
+    token_ids = ([_IMAGE_TOKEN_ID] * ncol + [_NEWLINE_TOKEN_ID]) * nrow
+    token_ids += [0] * (seq_len - image_feature_size)
+    return SequenceData(token_ids)
+
+
+def dummy_image_for_fuyu(
+    image_width: int,
+    image_height: int,
+):
+    image = Image.new("RGB", (image_width, image_height), color=0)
+    return {"image": image}
+
+
+def dummy_data_for_fuyu(ctx: InputContext, seq_len: int):
+    seq_data = dummy_seq_data_for_fuyu(ctx, seq_len)
+    mm_data = dummy_image_for_fuyu(MAX_IMAGE_FEATURE_SIZE_WIDTH,
+                                   MAX_IMAGE_FEATURE_SIZE_HEIGHT)
+    return seq_data, mm_data
+
+
+def _fuyu_image_preprocess(image_processor: FuyuImageProcessor,
+                           data: Image.Image):
+    image_encoding = image_processor.preprocess(data, return_tensors="pt")
+    batch_images = torch.stack([img[0] for img in image_encoding["images"]
+                                ]).unsqueeze(1)
+    image_unpadded_heights = torch.tensor(
+        image_encoding["image_unpadded_heights"])
+    image_unpadded_widths = torch.tensor(
+        image_encoding["image_unpadded_widths"])
+
+    batch_size = len(image_encoding["images"])
+    image_present = torch.ones(batch_size, 1, 1)
+    model_image_input = image_processor.preprocess_with_tokenizer_info(
+        image_input=batch_images,
+        image_present=image_present,
+        image_unpadded_h=image_unpadded_heights,
+        image_unpadded_w=image_unpadded_widths,
+        image_placeholder_id=_IMAGE_TOKEN_ID,
+        image_newline_id=_NEWLINE_TOKEN_ID,
+        variable_sized=True,
+    )
+    return model_image_input
+
+
+def input_processor_for_fuyu(ctx: InputContext, llm_inputs: LLMInputs):
+    multi_modal_data = llm_inputs.get("multi_modal_data")
+    if multi_modal_data is None or "image" not in multi_modal_data:
+        return llm_inputs
+
+    model_config = ctx.model_config
+    image_data = multi_modal_data["image"]
+    new_multi_modal_data = {}
+    # process image data
+    if isinstance(image_data, Image.Image):
+        # Fuyu's image_processor can also finish token padding
+        image_processor: FuyuImageProcessor = cached_get_image_processor(
+            model_config.model)
+
+        model_image_input = _fuyu_image_preprocess(image_processor, image_data)
+        image_patches = torch.stack([
+            image_patch[0]
+            for image_patch in model_image_input["image_patches"]
+        ])
+        new_multi_modal_data["image"] = image_patches
+
+    elif isinstance(image_data, torch.Tensor):
+        raise NotImplementedError("Embeddings input is not supported yet")
+    else:
+        raise TypeError(f"Invalid image type: {type(image_data)}")
+
+    # process prompts
+    prompt = llm_inputs["prompt"]
+    prompt_token_ids = llm_inputs["prompt_token_ids"]
+    tokenizer = cached_get_tokenizer(model_config.model)
+    # dim0 is batch_size, dim1 is subseq_size which will always be 1
+    image_input_ids: List[List[
+        torch.Tensor]] = model_image_input["image_input_ids"]
+    image_input_ids = image_input_ids[0][0].tolist()
+    bos_token = tokenizer.encode("<s>", add_special_tokens=False)[1:]
+    boa_token = tokenizer.encode("\x04", add_special_tokens=False)[1:]
+
+    new_prompt = prompt + "\x04"
+    new_prompt_token_ids = image_input_ids + bos_token + prompt_token_ids[
+        1:] + boa_token
+
+    return LLMInputs(prompt=new_prompt,
+                     prompt_token_ids=new_prompt_token_ids,
+                     multi_modal_data=new_multi_modal_data)
+
+
+def input_mapper_for_fuyu(ctx: InputContext, data: object):
+    model_config = ctx.model_config
+    if isinstance(data, Image.Image):
+        # Fuyu's image_processor can also finish token padding
+        image_processor: FuyuImageProcessor = cached_get_image_processor(
+            model_config.model)
+
+        model_image_input = _fuyu_image_preprocess(image_processor, data)
+        data = torch.stack([
+            image_patch[0]
+            for image_patch in model_image_input["image_patches"]
+        ])
+
+    # image has been processed with prompt in input processor
+    return MultiModalInputs({"image_patches": data})
+
+
+@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu)
+@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_fuyu_image_tokens)
+@INPUT_REGISTRY.register_dummy_data(dummy_data_for_fuyu)
+@INPUT_REGISTRY.register_input_processor(input_processor_for_fuyu)
+class FuyuForCausalLM(nn.Module, SupportsVision):
+
+    def __init__(self,
+                 config: FuyuConfig,
+                 multimodal_config: MultiModalConfig,
+                 cache_config: Optional[CacheConfig] = None,
+                 quant_config: Optional[QuantizationConfig] = None) -> None:
+        super().__init__()
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.image_token_id = _IMAGE_TOKEN_ID
+        self.image_feature_size = config.patch_size**2 * config.num_channels
+
+        self.vision_embed_tokens = ColumnParallelLinear(
+            self.image_feature_size,
+            config.hidden_size,
+            quant_config=quant_config,
+        )
+        self.language_model = PersimmonForCausalLM(config,
+                                                   cache_config=cache_config,
+                                                   quant_config=quant_config)
+
+    def _parse_and_validate_image_input(self, **kwargs: object):
+        image_patches = kwargs.pop("image_patches", None)
+
+        if isinstance(image_patches, torch.Tensor):
+            expected_feature_size = self.image_feature_size
+            if image_patches.size(-1) != expected_feature_size:
+                raise ValueError(
+                    f"Expected image patches to have the last dimension of "
+                    f"{expected_feature_size}, got {image_patches.size(-1)}")
+            image_patches = image_patches.to(
+                self.vision_embed_tokens.weight.dtype)
+            return FuyuImagePixelInputs(type="pixel_values",
+                                        data=image_patches)
+        return None
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        **kwargs: object,
+    ):
+        image_input = self._parse_and_validate_image_input(**kwargs)
+
+        if image_input is not None:
+            vision_embeddings, _ = self.vision_embed_tokens(
+                image_input["data"])
+            inputs_embeds = self.language_model.model.embed_tokens(input_ids)
+            inputs_embeds = merge_vision_embeddings(input_ids, inputs_embeds,
+                                                    vision_embeddings,
+                                                    self.image_token_id)
+
+        else:
+            inputs_embeds = None
+
+        hidden_states = self.language_model(
+            input_ids=input_ids,
+            positions=positions,
+            kv_caches=kv_caches,
+            attn_metadata=attn_metadata,
+            inputs_embeds=inputs_embeds,
+        )
+        return hidden_states
+
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        logits = self.language_model.logits_processor(
+            self.language_model.lm_head, hidden_states, sampling_metadata)
+        return logits
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.language_model.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if ("rotary_emb.cos_cached" in name
+                    or "rotary_emb.sin_cached" in name):
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            param = params_dict[name]
+
+            if "query_key_value" in name:
+                # copy from vllm/model_executor/models/bloom.py
+                # NOTE: Fuyu's fused QKV's output_dim has the shape of
+                # (num_heads * 3 * head_size), while the
+                # required shape is (3 * num_heads * head_size).
+                # Thus, we need weight conversion.
+                output_dim = getattr(param, "output_dim", None)
+                num_heads = self.config.num_attention_heads
+                if output_dim is not None:
+                    loaded_weight_shape = loaded_weight.shape
+                    loaded_weight = loaded_weight.view(
+                        loaded_weight_shape[:output_dim] + (num_heads, 3, -1) +
+                        loaded_weight_shape[output_dim + 1:])
+                    loaded_weight = loaded_weight.transpose(
+                        output_dim, output_dim + 1)
+                    loaded_weight = loaded_weight.reshape(loaded_weight_shape)
+
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)
diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py
new file mode 100644
index 000000000..bc38d4421
--- /dev/null
+++ b/vllm/model_executor/models/persimmon.py
@@ -0,0 +1,333 @@
+# coding=utf-8
+# adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/persimmon/modeling_persimmon.py
+# Copyright 2023 The vLLM team.
+# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only persimmon model compatible with HuggingFace weights."""
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PersimmonConfig
+from transformers.activations import ReLUSquaredActivation
+
+from vllm.attention import Attention, AttentionMetadata
+from vllm.config import CacheConfig
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import IntermediateTensors, SamplerOutput
+
+
+class PersimmonMLP(nn.Module):
+
+    def __init__(self,
+                 config: PersimmonConfig,
+                 quant_config: Optional[QuantizationConfig] = None):
+        super().__init__()
+        self.dense_h_to_4h = ColumnParallelLinear(config.hidden_size,
+                                                  config.intermediate_size,
+                                                  quant_config=quant_config)
+        self.dense_4h_to_h = RowParallelLinear(config.intermediate_size,
+                                               config.hidden_size,
+                                               quant_config=quant_config)
+        self.act = ReLUSquaredActivation()
+
+    def forward(self, hidden_states) -> torch.Tensor:
+        hidden_states, _ = self.dense_h_to_4h(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.dense_4h_to_h(hidden_states)
+        return hidden_states
+
+
+class PersimmonAttention(nn.Module):
+
+    def __init__(self,
+                 config: PersimmonConfig,
+                 cache_config: Optional[CacheConfig] = None,
+                 quant_config: Optional[QuantizationConfig] = None):
+        super().__init__()
+        self.config = config
+        tensor_parallel_world_size = get_tensor_model_parallel_world_size()
+
+        self.hidden_size = config.hidden_size
+        self.total_num_heads = config.num_attention_heads
+        self.num_heads = self.total_num_heads // tensor_parallel_world_size
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.partial_rotary_factor = config.partial_rotary_factor
+        self.is_causal = True
+
+        assert (self.head_dim * self.total_num_heads) == self.hidden_size
+        assert self.total_num_heads % tensor_parallel_world_size == 0
+
+        self.query_key_value = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            bias=True,
+            quant_config=quant_config,
+        )
+        self.dense = RowParallelLinear(
+            self.num_heads * self.head_dim,
+            self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+        )
+        self.is_qk_layernorm = config.qk_layernorm
+
+        if self.is_qk_layernorm:
+            self.q_layernorm = nn.LayerNorm(self.head_dim)
+            self.k_layernorm = nn.LayerNorm(self.head_dim)
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=int(self.partial_rotary_factor * self.head_dim),
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+        )
+        self.scaling = self.head_dim**-0.5
+        self.attn = Attention(self.num_heads,
+                              self.head_dim,
+                              scale=self.scaling,
+                              cache_config=cache_config,
+                              quant_config=quant_config)
+
+    def _split_heads(self, x: torch.Tensor) -> torch.Tensor:
+        # [seq_length, hidden_size] -> [seq_length, num_heads, head_dim]
+        seq_length = x.shape[0]
+        return x.view(seq_length, self.num_heads, self.head_dim)
+
+    def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
+        # [seq_length, num_heads, head_dim] -> [seq_length, hidden_size]
+        seq_length = x.shape[0]
+        return x.view(seq_length, self.num_heads * self.head_dim)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        # [seq_length, 3 x hidden_size]
+        qkv, _ = self.query_key_value(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+
+        if self.is_qk_layernorm:
+            # [seq_length, num_heads, head_dim]
+            q = self._split_heads(q)
+            k = self._split_heads(k)
+
+            q = self.q_layernorm(q)
+            k = self.k_layernorm(k)
+
+            q = self._merge_heads(q)
+            k = self._merge_heads(k)
+
+        q, k = self.rotary_emb(position_ids, q, k)
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
+        output, _ = self.dense(attn_output)
+        return output
+
+
+class PersimmonDecoderLayer(nn.Module):
+
+    def __init__(self,
+                 config: PersimmonConfig,
+                 cache_config: Optional[CacheConfig] = None,
+                 quant_config: Optional[QuantizationConfig] = None):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = PersimmonAttention(config=config,
+                                            cache_config=cache_config,
+                                            quant_config=quant_config)
+        self.mlp = PersimmonMLP(config, quant_config=quant_config)
+        self.input_layernorm = nn.LayerNorm(config.hidden_size,
+                                            eps=config.layer_norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
+                                                     eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states = self.self_attn(
+            position_ids=position_ids,
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            attn_metadata=attn_metadata,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+
+        hidden_states = hidden_states + residual
+
+        outputs = hidden_states
+        return outputs
+
+
+class PersimmonModel(nn.Module):
+
+    def __init__(self,
+                 config: PersimmonConfig,
+                 cache_config: Optional[CacheConfig] = None,
+                 quant_config: Optional[QuantizationConfig] = None):
+        super().__init__()
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
+                                                   config.hidden_size)
+        self.layers = nn.ModuleList([
+            PersimmonDecoderLayer(config,
+                                  cache_config=cache_config,
+                                  quant_config=quant_config)
+            for _ in range(config.num_hidden_layers)
+        ])
+        self.final_layernorm = nn.LayerNorm(config.hidden_size,
+                                            eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.embed_tokens(input_ids)
+        for i in range(len(self.layers)):
+            hidden_states = self.layers[i](
+                positions,
+                hidden_states,
+                kv_caches[i],
+                attn_metadata,
+            )
+        hidden_states = self.final_layernorm(hidden_states)
+        return hidden_states
+
+
+class PersimmonForCausalLM(nn.Module):
+
+    def __init__(self,
+                 config,
+                 cache_config: Optional[CacheConfig] = None,
+                 quant_config: Optional[QuantizationConfig] = None):
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.model = PersimmonModel(config,
+                                    cache_config=cache_config,
+                                    quant_config=quant_config)
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      bias=False)
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.sampler = Sampler()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ):
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            kv_caches=kv_caches,
+            attn_metadata=attn_metadata,
+            inputs_embeds=inputs_embeds,
+        )
+        return hidden_states
+
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if ("rotary_emb.cos_cached" in name
+                    or "rotary_emb.sin_cached" in name):
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            param = params_dict[name]
+
+            if "query_key_value" in name:
+                # copy from vllm/model_executor/models/bloom.py
+                # NOTE: Persimmon's fused QKV's output_dim has the shape of
+                # (num_heads * 3 * head_size), while the
+                # required shape is (3 * num_heads * head_size).
+                # Thus, we need weight conversion.
+                output_dim = getattr(param, "output_dim", None)
+                num_heads = self.config.num_attention_heads
+                if output_dim is not None:
+                    loaded_weight_shape = loaded_weight.shape
+                    loaded_weight = loaded_weight.view(
+                        loaded_weight_shape[:output_dim] + (num_heads, 3, -1) +
+                        loaded_weight_shape[output_dim + 1:])
+                    loaded_weight = loaded_weight.transpose(
+                        output_dim, output_dim + 1)
+                    loaded_weight = loaded_weight.reshape(loaded_weight_shape)
+
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)
-- 
GitLab


From 6ef3bf912cfb878ff57cf395c4e5908fd9b2a42b Mon Sep 17 00:00:00 2001
From: Yuan Tang <terrytangyuan@gmail.com>
Date: Sun, 14 Jul 2024 02:58:09 -0500
Subject: [PATCH 354/376] Remove unnecessary trailing period in spec_decode.rst
 (#6405)

---
 docs/source/models/spec_decode.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/models/spec_decode.rst b/docs/source/models/spec_decode.rst
index 9fb62397b..87a52360c 100644
--- a/docs/source/models/spec_decode.rst
+++ b/docs/source/models/spec_decode.rst
@@ -73,5 +73,5 @@ Resources for vLLM contributors
 -------------------------------
 * `A Hacker's Guide to Speculative Decoding in vLLM <https://www.youtube.com/watch?v=9wNAgpX6z_4>`_
 * `What is Lookahead Scheduling in vLLM? <https://docs.google.com/document/d/1Z9TvqzzBPnh5WHcRwjvK2UEeFeq5zMZb5mFE8jR0HCs/edit#heading=h.1fjfb0donq5a>`_
-* `Information on batch expansion. <https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit#heading=h.kk7dq05lc6q8>`_
+* `Information on batch expansion <https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit#heading=h.kk7dq05lc6q8>`_
 * `Dynamic speculative decoding <https://github.com/vllm-project/vllm/issues/4565>`_
-- 
GitLab


From 9dad5cc85902f419aea5320dd49a827621ed5668 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Sun, 14 Jul 2024 09:37:19 -0400
Subject: [PATCH 355/376] [Kernel] Turn off CUTLASS scaled_mm for Ada Lovelace
 (#6384)

---
 ...eta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml |  8 ++++----
 .../configs/Meta-Llama-3-8B-Instruct-FP8.yaml          |  6 +++---
 csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu      | 10 ++++++++--
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
index e40f42a17..374171f1f 100644
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
@@ -1,11 +1,11 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 250 -f 5 -t 1
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
 tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.752
+    value: 0.755
   - name: "exact_match,flexible-extract"
-    value: 0.752
-limit: 250
+    value: 0.755
+limit: 1000
 num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
index 7a89e8e0c..dc36b7056 100644
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
@@ -4,8 +4,8 @@ tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.756
+    value: 0.753
   - name: "exact_match,flexible-extract"
-    value: 0.752
-limit: 250
+    value: 0.753
+limit: 1000
 num_fewshot: 5
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
index 81bf2d62d..605166930 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -38,7 +38,13 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
   if (cuda_device_capability >= 90) {
     return CUDA_VERSION >= 12000;
   } else if (cuda_device_capability >= 89) {
-    return CUDA_VERSION >= 12040;
+    // CUTLASS Kernels have not been tuned for Ada Lovelace systems
+    // and are slower than torch.mm. Return false unconditionally in this case.
+    return false;
+
+    // Once the CUTLASS kernels have been optimized for Lovelace systems,
+    // use the following check:
+    // return CUDA_VERSION >= 12040;
   }
 #endif
 
@@ -98,4 +104,4 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
     TORCH_CHECK(version_num >= 75);
     cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales, bias);
   }
-}
\ No newline at end of file
+}
-- 
GitLab


From ccd3c045710323102ebd9eab6d2e192c0fd6e509 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 14 Jul 2024 07:16:21 -0700
Subject: [PATCH 356/376] [ci][build] fix commit id (#6420)

Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 .buildkite/test-pipeline.yaml | 4 +++-
 Dockerfile                    | 3 +++
 setup.py                      | 9 +++++++--
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index c8f53224b..2d5bbbf07 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -194,7 +194,9 @@ steps:
   command: pytest -v -s test_logits_processor.py
 
 - label: Utils Test
-  command: pytest -v -s test_utils.py
+  commands:
+    - pytest -v -s test_utils.py
+    - pytest -v -s test_embedded_commit.py
 
 - label: Worker Test
   mirror_hardwares: [amd]
diff --git a/Dockerfile b/Dockerfile
index 67198e8fd..9ed065deb 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -88,6 +88,9 @@ ENV NVCC_THREADS=$nvcc_threads
 # make sure punica kernels are built (for LoRA)
 ENV VLLM_INSTALL_PUNICA_KERNELS=1
 
+ARG buildkite_commit
+ENV BUILDKITE_COMMIT=${buildkite_commit}
+
 ARG USE_SCCACHE
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/pip \
diff --git a/setup.py b/setup.py
index 485cfe36b..f1769a8f1 100644
--- a/setup.py
+++ b/setup.py
@@ -30,8 +30,13 @@ logger = logging.getLogger(__name__)
 
 def embed_commit_hash():
     try:
-        commit_id = subprocess.check_output(["git", "rev-parse", "HEAD"],
-                                            encoding="utf-8").strip()
+        if "BUILDKITE_COMMIT" in os.environ:
+            # ci build
+            commit_id = os.environ["BUILDKITE_COMMIT"]
+        else:
+            commit_id = subprocess.check_output(["git", "rev-parse", "HEAD"],
+                                                encoding="utf-8").strip()
+
         commit_contents = f'__commit__ = "{commit_id}"\n'
 
         version_file = os.path.join(ROOT_DIR, "vllm", "commit_id.py")
-- 
GitLab


From 73030b7dae676f730ea43652af056501584ecbfe Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sun, 14 Jul 2024 17:38:42 -0400
Subject: [PATCH 357/376] [ Misc ] Enable Quantizing All Layers of DeekSeekv2
 (#6423)

---
 .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh | 2 +-
 vllm/model_executor/model_loader/weight_utils.py            | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
index dbb21be4f..2f04cc128 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
 done
 
 lm_eval --model vllm \
-  --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,distributed_executor_backend="ray",trust_remote_code=true \
+  --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
   --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
   --batch_size $BATCH_SIZE
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 943022a3f..c8568b3dc 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -431,6 +431,11 @@ def convert_pyslice_to_tensor(x: Any) -> torch.Tensor:
 def default_weight_loader(param: torch.Tensor,
                           loaded_weight: torch.Tensor) -> None:
     """Default weight loader."""
+    # If the weight on disk does not have a shape, give it one
+    # (such scales for AutoFp8).
+    if len(loaded_weight.shape) == 0:
+        loaded_weight = loaded_weight.reshape(1)
+
     assert param.size() == loaded_weight.size()
     param.data.copy_(loaded_weight)
 
-- 
GitLab


From dbfe254eda918bc3cc52cf448d518824ad6593b9 Mon Sep 17 00:00:00 2001
From: Ethan Xu <70482605+EthanqX@users.noreply.github.com>
Date: Sun, 14 Jul 2024 15:36:43 -0700
Subject: [PATCH 358/376] [Feature] vLLM CLI (#5090)

Co-authored-by: simon-mo <simon.mo@hey.com>
---
 benchmarks/benchmark_serving.py               |   4 +-
 .../serving/openai_compatible_server.md       |   2 +-
 setup.py                                      |   5 +
 tests/utils.py                                |   6 +-
 vllm/entrypoints/openai/api_server.py         |  78 +++++----
 vllm/entrypoints/openai/cli_args.py           |  10 +-
 vllm/scripts.py                               | 154 ++++++++++++++++++
 7 files changed, 223 insertions(+), 36 deletions(-)
 create mode 100644 vllm/scripts.py

diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index b2924b9e8..b625f92d7 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -2,8 +2,8 @@
 
 On the server side, run one of the following commands:
     vLLM OpenAI API server
-    python -m vllm.entrypoints.openai.api_server \
-        --model <your_model> --swap-space 16 \
+    vllm serve <your_model> \
+        --swap-space 16 \
         --disable-log-requests
 
     (TGI backend)
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index 6248d8468..092c3c6cb 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -109,7 +109,7 @@ directory [here](https://github.com/vllm-project/vllm/tree/main/examples/)
 
 ```{argparse}
 :module: vllm.entrypoints.openai.cli_args
-:func: make_arg_parser
+:func: create_parser_for_docs
 :prog: -m vllm.entrypoints.openai.api_server
 ```
 
diff --git a/setup.py b/setup.py
index f1769a8f1..72ef26f15 100644
--- a/setup.py
+++ b/setup.py
@@ -488,4 +488,9 @@ setup(
     },
     cmdclass={"build_ext": cmake_build_ext} if _build_custom_ops() else {},
     package_data=package_data,
+    entry_points={
+        "console_scripts": [
+            "vllm=vllm.scripts:main",
+        ],
+    },
 )
diff --git a/tests/utils.py b/tests/utils.py
index 50f723b0b..8780d45a3 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -14,7 +14,7 @@ import requests
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.entrypoints.openai.cli_args import make_arg_parser
-from vllm.utils import get_open_port, is_hip
+from vllm.utils import FlexibleArgumentParser, get_open_port, is_hip
 
 if is_hip():
     from amdsmi import (amdsmi_get_gpu_vram_usage,
@@ -57,7 +57,9 @@ class RemoteOpenAIServer:
 
             cli_args = cli_args + ["--port", str(get_open_port())]
 
-        parser = make_arg_parser()
+        parser = FlexibleArgumentParser(
+            description="vLLM's remote OpenAI server.")
+        parser = make_arg_parser(parser)
         args = parser.parse_args(cli_args)
         self.host = str(args.host or 'localhost')
         self.port = int(args.port)
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 6cba356c4..45c634b4a 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -8,7 +8,7 @@ from typing import Optional, Set
 
 import fastapi
 import uvicorn
-from fastapi import Request
+from fastapi import APIRouter, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, Response, StreamingResponse
@@ -35,10 +35,14 @@ from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
+from vllm.utils import FlexibleArgumentParser
 from vllm.version import __version__ as VLLM_VERSION
 
 TIMEOUT_KEEP_ALIVE = 5  # seconds
 
+logger = init_logger(__name__)
+engine: AsyncLLMEngine
+engine_args: AsyncEngineArgs
 openai_serving_chat: OpenAIServingChat
 openai_serving_completion: OpenAIServingCompletion
 openai_serving_embedding: OpenAIServingEmbedding
@@ -64,35 +68,23 @@ async def lifespan(app: fastapi.FastAPI):
     yield
 
 
-app = fastapi.FastAPI(lifespan=lifespan)
-
-
-def parse_args():
-    parser = make_arg_parser()
-    return parser.parse_args()
-
+router = APIRouter()
 
 # Add prometheus asgi middleware to route /metrics requests
 route = Mount("/metrics", make_asgi_app())
 # Workaround for 307 Redirect for /metrics
 route.path_regex = re.compile('^/metrics(?P<path>.*)$')
-app.routes.append(route)
-
-
-@app.exception_handler(RequestValidationError)
-async def validation_exception_handler(_, exc):
-    err = openai_serving_chat.create_error_response(message=str(exc))
-    return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST)
+router.routes.append(route)
 
 
-@app.get("/health")
+@router.get("/health")
 async def health() -> Response:
     """Health check."""
     await openai_serving_chat.engine.check_health()
     return Response(status_code=200)
 
 
-@app.post("/tokenize")
+@router.post("/tokenize")
 async def tokenize(request: TokenizeRequest):
     generator = await openai_serving_completion.create_tokenize(request)
     if isinstance(generator, ErrorResponse):
@@ -103,7 +95,7 @@ async def tokenize(request: TokenizeRequest):
         return JSONResponse(content=generator.model_dump())
 
 
-@app.post("/detokenize")
+@router.post("/detokenize")
 async def detokenize(request: DetokenizeRequest):
     generator = await openai_serving_completion.create_detokenize(request)
     if isinstance(generator, ErrorResponse):
@@ -114,19 +106,19 @@ async def detokenize(request: DetokenizeRequest):
         return JSONResponse(content=generator.model_dump())
 
 
-@app.get("/v1/models")
+@router.get("/v1/models")
 async def show_available_models():
     models = await openai_serving_completion.show_available_models()
     return JSONResponse(content=models.model_dump())
 
 
-@app.get("/version")
+@router.get("/version")
 async def show_version():
     ver = {"version": VLLM_VERSION}
     return JSONResponse(content=ver)
 
 
-@app.post("/v1/chat/completions")
+@router.post("/v1/chat/completions")
 async def create_chat_completion(request: ChatCompletionRequest,
                                  raw_request: Request):
     generator = await openai_serving_chat.create_chat_completion(
@@ -142,7 +134,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
         return JSONResponse(content=generator.model_dump())
 
 
-@app.post("/v1/completions")
+@router.post("/v1/completions")
 async def create_completion(request: CompletionRequest, raw_request: Request):
     generator = await openai_serving_completion.create_completion(
         request, raw_request)
@@ -156,7 +148,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
         return JSONResponse(content=generator.model_dump())
 
 
-@app.post("/v1/embeddings")
+@router.post("/v1/embeddings")
 async def create_embedding(request: EmbeddingRequest, raw_request: Request):
     generator = await openai_serving_embedding.create_embedding(
         request, raw_request)
@@ -167,8 +159,10 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
         return JSONResponse(content=generator.model_dump())
 
 
-if __name__ == "__main__":
-    args = parse_args()
+def build_app(args):
+    app = fastapi.FastAPI(lifespan=lifespan)
+    app.include_router(router)
+    app.root_path = args.root_path
 
     app.add_middleware(
         CORSMiddleware,
@@ -178,6 +172,12 @@ if __name__ == "__main__":
         allow_headers=args.allowed_headers,
     )
 
+    @app.exception_handler(RequestValidationError)
+    async def validation_exception_handler(_, exc):
+        err = openai_serving_chat.create_error_response(message=str(exc))
+        return JSONResponse(err.model_dump(),
+                            status_code=HTTPStatus.BAD_REQUEST)
+
     if token := envs.VLLM_API_KEY or args.api_key:
 
         @app.middleware("http")
@@ -203,6 +203,12 @@ if __name__ == "__main__":
             raise ValueError(f"Invalid middleware {middleware}. "
                              f"Must be a function or a class.")
 
+    return app
+
+
+def run_server(args, llm_engine=None):
+    app = build_app(args)
+
     logger.info("vLLM API server version %s", VLLM_VERSION)
     logger.info("args: %s", args)
 
@@ -211,10 +217,12 @@ if __name__ == "__main__":
     else:
         served_model_names = [args.model]
 
-    engine_args = AsyncEngineArgs.from_cli_args(args)
+    global engine, engine_args
 
-    engine = AsyncLLMEngine.from_engine_args(
-        engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
+    engine_args = AsyncEngineArgs.from_cli_args(args)
+    engine = (llm_engine
+              if llm_engine is not None else AsyncLLMEngine.from_engine_args(
+                  engine_args, usage_context=UsageContext.OPENAI_API_SERVER))
 
     event_loop: Optional[asyncio.AbstractEventLoop]
     try:
@@ -230,6 +238,10 @@ if __name__ == "__main__":
         # When using single vLLM without engine_use_ray
         model_config = asyncio.run(engine.get_model_config())
 
+    global openai_serving_chat
+    global openai_serving_completion
+    global openai_serving_embedding
+
     openai_serving_chat = OpenAIServingChat(engine, model_config,
                                             served_model_names,
                                             args.response_role,
@@ -258,3 +270,13 @@ if __name__ == "__main__":
                 ssl_certfile=args.ssl_certfile,
                 ssl_ca_certs=args.ssl_ca_certs,
                 ssl_cert_reqs=args.ssl_cert_reqs)
+
+
+if __name__ == "__main__":
+    # NOTE(simon):
+    # This section should be in sync with vllm/scripts.py for CLI entrypoints.
+    parser = FlexibleArgumentParser(
+        description="vLLM OpenAI-Compatible RESTful API server.")
+    parser = make_arg_parser(parser)
+    args = parser.parse_args()
+    run_server(args)
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 81c474ecc..f841633b5 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -34,9 +34,7 @@ class PromptAdapterParserAction(argparse.Action):
         setattr(namespace, self.dest, adapter_list)
 
 
-def make_arg_parser():
-    parser = FlexibleArgumentParser(
-        description="vLLM OpenAI-Compatible RESTful API server.")
+def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
     parser.add_argument("--host",
                         type=nullable_str,
                         default=None,
@@ -133,3 +131,9 @@ def make_arg_parser():
 
     parser = AsyncEngineArgs.add_cli_args(parser)
     return parser
+
+
+def create_parser_for_docs() -> FlexibleArgumentParser:
+    parser_for_docs = FlexibleArgumentParser(
+        prog="-m vllm.entrypoints.openai.api_server")
+    return make_arg_parser(parser_for_docs)
diff --git a/vllm/scripts.py b/vllm/scripts.py
new file mode 100644
index 000000000..3f334be92
--- /dev/null
+++ b/vllm/scripts.py
@@ -0,0 +1,154 @@
+# The CLI entrypoint to vLLM.
+import argparse
+import os
+import signal
+import sys
+from typing import Optional
+
+from openai import OpenAI
+
+from vllm.entrypoints.openai.api_server import run_server
+from vllm.entrypoints.openai.cli_args import make_arg_parser
+from vllm.utils import FlexibleArgumentParser
+
+
+def registrer_signal_handlers():
+
+    def signal_handler(sig, frame):
+        sys.exit(0)
+
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTSTP, signal_handler)
+
+
+def serve(args: argparse.Namespace) -> None:
+    # EngineArgs expects the model name to be passed as --model.
+    args.model = args.model_tag
+
+    run_server(args)
+
+
+def interactive_cli(args: argparse.Namespace) -> None:
+    registrer_signal_handlers()
+
+    base_url = args.url
+    api_key = args.api_key or os.environ.get("OPENAI_API_KEY", "EMPTY")
+    openai_client = OpenAI(api_key=api_key, base_url=base_url)
+
+    if args.model_name:
+        model_name = args.model_name
+    else:
+        available_models = openai_client.models.list()
+        model_name = available_models.data[0].id
+
+    print(f"Using model: {model_name}")
+
+    if args.command == "complete":
+        complete(model_name, openai_client)
+    elif args.command == "chat":
+        chat(args.system_prompt, model_name, openai_client)
+
+
+def complete(model_name: str, client: OpenAI) -> None:
+    print("Please enter prompt to complete:")
+    while True:
+        input_prompt = input("> ")
+
+        completion = client.completions.create(model=model_name,
+                                               prompt=input_prompt)
+        output = completion.choices[0].text
+        print(output)
+
+
+def chat(system_prompt: Optional[str], model_name: str,
+         client: OpenAI) -> None:
+    conversation = []
+    if system_prompt is not None:
+        conversation.append({"role": "system", "content": system_prompt})
+
+    print("Please enter a message for the chat model:")
+    while True:
+        input_message = input("> ")
+        message = {"role": "user", "content": input_message}
+        conversation.append(message)
+
+        chat_completion = client.chat.completions.create(model=model_name,
+                                                         messages=conversation)
+
+        response_message = chat_completion.choices[0].message
+        output = response_message.content
+
+        conversation.append(response_message)
+        print(output)
+
+
+def _add_query_options(
+        parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
+    parser.add_argument(
+        "--url",
+        type=str,
+        default="http://localhost:8000/v1",
+        help="url of the running OpenAI-Compatible RESTful API server")
+    parser.add_argument(
+        "--model-name",
+        type=str,
+        default=None,
+        help=("The model name used in prompt completion, default to "
+              "the first model in list models API call."))
+    parser.add_argument(
+        "--api-key",
+        type=str,
+        default=None,
+        help=(
+            "API key for OpenAI services. If provided, this api key "
+            "will overwrite the api key obtained through environment variables."
+        ))
+    return parser
+
+
+def main():
+    parser = FlexibleArgumentParser(description="vLLM CLI")
+    subparsers = parser.add_subparsers(required=True)
+
+    serve_parser = subparsers.add_parser(
+        "serve",
+        help="Start the vLLM OpenAI Compatible API server",
+        usage="vllm serve <model_tag> [options]")
+    serve_parser.add_argument("model_tag",
+                              type=str,
+                              help="The model tag to serve")
+    serve_parser = make_arg_parser(serve_parser)
+    serve_parser.set_defaults(dispatch_function=serve)
+
+    complete_parser = subparsers.add_parser(
+        "complete",
+        help=("Generate text completions based on the given prompt "
+              "via the running API server"),
+        usage="vllm complete [options]")
+    _add_query_options(complete_parser)
+    complete_parser.set_defaults(dispatch_function=interactive_cli,
+                                 command="complete")
+
+    chat_parser = subparsers.add_parser(
+        "chat",
+        help="Generate chat completions via the running API server",
+        usage="vllm chat [options]")
+    _add_query_options(chat_parser)
+    chat_parser.add_argument(
+        "--system-prompt",
+        type=str,
+        default=None,
+        help=("The system prompt to be added to the chat template, "
+              "used for models that support system prompts."))
+    chat_parser.set_defaults(dispatch_function=interactive_cli, command="chat")
+
+    args = parser.parse_args()
+    # One of the sub commands should be executed.
+    if hasattr(args, "dispatch_function"):
+        args.dispatch_function(args)
+    else:
+        parser.print_help()
+
+
+if __name__ == "__main__":
+    main()
-- 
GitLab


From 61e85dbad87cc1a0b2a838274a758012062747a6 Mon Sep 17 00:00:00 2001
From: Robert Cohn <rscohn2@gmail.com>
Date: Sun, 14 Jul 2024 20:10:11 -0400
Subject: [PATCH 359/376] [Doc] xpu backend requires running setvars.sh (#6393)

---
 docs/source/getting_started/xpu-installation.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/source/getting_started/xpu-installation.rst b/docs/source/getting_started/xpu-installation.rst
index 4f0d2da25..a0118e20c 100644
--- a/docs/source/getting_started/xpu-installation.rst
+++ b/docs/source/getting_started/xpu-installation.rst
@@ -40,12 +40,13 @@ Quick start using Dockerfile
 Build from source
 -----------------
 
-- First, install required driver and intel OneAPI 2024.1.
+- First, install required driver and intel OneAPI 2024.1 or later.
 
 - Second, install Python packages for vLLM XPU backend building:
 
 .. code-block:: console
 
+    $ source /opt/intel/oneapi/setvars.sh
     $ pip install --upgrade pip
     $ pip install -v -r requirements-xpu.txt 
 
-- 
GitLab


From a754dc2cb964c78fd9da514aa0b87a0241e97d1f Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sun, 14 Jul 2024 21:54:46 -0400
Subject: [PATCH 360/376] [CI/Build] Cross python wheel (#6394)

---
 .buildkite/release-pipeline.yaml | 38 ++------------------------------
 Dockerfile                       |  4 ++--
 2 files changed, 4 insertions(+), 38 deletions(-)

diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 4a35ce925..4fa195113 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,51 +1,17 @@
 steps:
-  - label: "Build wheel default - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}" 
+  - label: "Build wheel - CUDA {{matrix.cuda_version}}"
     agents:
       queue: cpu_queue
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       # rename the files to change linux -> manylinux1
       - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
       - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
       - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
-    matrix:
-      setup:
-        cuda_version:
-          - "12.1.0"
-        python_version:
-          - "3.10"
-          - "3.11"
-
-  - block: "Build wheels full"
-
-  - label: "Build wheel - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}"
-    agents:
-      queue: cpu_queue
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
-      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
-      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
     matrix:
       setup:
         cuda_version:
           - "11.8.0"
           - "12.1.0"
-        python_version:
-          - "3.8"
-          - "3.9"
-          - "3.10"
-          - "3.11"
-      adjustments:
-      - with:
-          cuda_version: "12.1.0"
-          python_version: "3.10"
-        skip: true
-      - with:
-          cuda_version: "12.1.0"
-          python_version: "3.11"
-        skip: true
diff --git a/Dockerfile b/Dockerfile
index 9ed065deb..7fbc168ac 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -104,7 +104,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
         && export SCCACHE_REGION=us-west-2 \
         && export CMAKE_BUILD_TYPE=Release \
         && sccache --show-stats \
-        && python3 setup.py bdist_wheel --dist-dir=dist \
+        && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
         && sccache --show-stats; \
     fi
 
@@ -112,7 +112,7 @@ ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
     if [ "$USE_SCCACHE" != "1" ]; then \
-        python3 setup.py bdist_wheel --dist-dir=dist; \
+        python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
 
 # check the size of the wheel, we cannot upload wheels larger than 100MB
-- 
GitLab


From ccb20db8bd2abb302a8dc0fbc0901f284df9b3f5 Mon Sep 17 00:00:00 2001
From: Fish <45708320+lxline@users.noreply.github.com>
Date: Mon, 15 Jul 2024 10:27:01 +0800
Subject: [PATCH 361/376] [Bugfix] Benchmark serving script used global
 parameter 'args' in function 'sample_random_requests' (#6428)

---
 benchmarks/benchmark_serving.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index b625f92d7..fc0dbf77f 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -203,7 +203,7 @@ def sample_random_requests(
     )
     offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
     input_requests = []
-    for i in range(args.num_prompts):
+    for i in range(num_prompts):
         prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
                                    for j in range(input_lens[i])])
         input_requests.append(
-- 
GitLab


From 32c9d7f7650842cc20b2e66a4125ffe126619c50 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Sun, 14 Jul 2024 19:37:35 -0700
Subject: [PATCH 362/376] Report usage for beam search (#6404)

---
 vllm/sampling_params.py |  5 +++++
 vllm/usage/usage_lib.py | 15 +++++++++++++--
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 90f0944a7..ebe5e0fd3 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -189,6 +189,10 @@ class SamplingParams:
 
         self._verify_args()
         if self.use_beam_search:
+            # Lazy import to avoid circular imports.
+            from vllm.usage.usage_lib import set_runtime_usage_data
+            set_runtime_usage_data("use_beam_search", True)
+
             if not envs.VLLM_NO_DEPRECATION_WARNING:
                 logger.warning(
                     "[IMPORTANT] We plan to discontinue the support for beam "
@@ -196,6 +200,7 @@ class SamplingParams:
                     "https://github.com/vllm-project/vllm/issues/6226 for "
                     "more information. Set VLLM_NO_DEPRECATION_WARNING=1 to "
                     "suppress this warning.")
+
             self._verify_beam_search()
         else:
             self._verify_non_beam_search()
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index afb3007a5..6907d8b9b 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -7,7 +7,7 @@ import time
 from enum import Enum
 from pathlib import Path
 from threading import Thread
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Union
 from uuid import uuid4
 
 import cpuinfo
@@ -25,6 +25,13 @@ _USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home,
 _USAGE_STATS_ENABLED = None
 _USAGE_STATS_SERVER = envs.VLLM_USAGE_STATS_SERVER
 
+_GLOBAL_RUNTIME_DATA: Dict[str, Union[str, int, bool]] = {}
+
+
+def set_runtime_usage_data(key: str, value: Union[str, int, bool]) -> None:
+    """Set global usage data that will be sent with every usage heartbeat."""
+    _GLOBAL_RUNTIME_DATA[key] = value
+
 
 def is_usage_stats_enabled():
     """Determine whether or not we can send usage stats to the server.
@@ -187,7 +194,11 @@ class UsageMessage:
         """
         while True:
             time.sleep(600)
-            data = {"uuid": self.uuid, "log_time": _get_current_timestamp_ns()}
+            data = {
+                "uuid": self.uuid,
+                "log_time": _get_current_timestamp_ns(),
+            }
+            data.update(_GLOBAL_RUNTIME_DATA)
 
             self._write_to_file(data)
             self._send_to_server(data)
-- 
GitLab


From 9bfece89fdbe745730c6bb510ba043c698f79870 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Sun, 14 Jul 2024 20:36:16 -0700
Subject: [PATCH 363/376] Add FUNDING.yml (#6435)

---
 .github/FUNDING.yml | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 .github/FUNDING.yml

diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644
index 000000000..71f4e5201
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1,2 @@
+github: [vllm-project]
+open_collective: [vllm]
-- 
GitLab


From b47008b4d29acb39503a65991c75a68361abc0b0 Mon Sep 17 00:00:00 2001
From: zifeitong <zifeitong@gmail.com>
Date: Sun, 14 Jul 2024 21:06:09 -0700
Subject: [PATCH 364/376] [BugFix] BatchResponseData body should be optional
 (#6345)

Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 tests/entrypoints/openai/test_run_batch.py | 3 ++-
 vllm/entrypoints/openai/protocol.py        | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index 5de28513c..b25e2a26e 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -6,7 +6,8 @@ from vllm.entrypoints.openai.protocol import BatchRequestOutput
 
 # ruff: noqa: E501
 INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
-{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
+{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
 
 INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 881e2675c..b3f0aae6d 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -716,7 +716,7 @@ class BatchResponseData(OpenAIBaseModel):
     request_id: str
 
     # The body of the response.
-    body: Union[ChatCompletionResponse, ]
+    body: Optional[ChatCompletionResponse] = None
 
 
 class BatchRequestOutput(OpenAIBaseModel):
-- 
GitLab


From 44874a0bf970ae55c487a1dc09b25bd308872f7c Mon Sep 17 00:00:00 2001
From: DefTruth <31974251+DefTruth@users.noreply.github.com>
Date: Mon, 15 Jul 2024 12:16:51 +0800
Subject: [PATCH 365/376] [Doc] add env docs for flashinfer backend (#6437)

---
 vllm/envs.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/envs.py b/vllm/envs.py
index 5b4a2010d..85d60f324 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -206,6 +206,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     # - "FLASH_ATTN": use FlashAttention
     # - "XFORMERS": use XFormers
     # - "ROCM_FLASH": use ROCmFlashAttention
+    # - "FLASHINFER": use flashinfer
     "VLLM_ATTENTION_BACKEND":
     lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),
 
-- 
GitLab


From 69672f116cf83dbcfd2d470a959dfe123df4d301 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 14 Jul 2024 21:20:51 -0700
Subject: [PATCH 366/376] [core][distributed] simplify code to support pipeline
 parallel (#6406)

---
 .buildkite/test-pipeline.yaml                 |  4 +-
 .../test_basic_correctness.py                 | 11 +++-
 vllm/model_executor/models/gpt2.py            | 47 +++++++---------
 vllm/model_executor/models/llama.py           | 50 ++++++++---------
 vllm/model_executor/models/utils.py           | 56 +++++++++++++++++++
 5 files changed, 107 insertions(+), 61 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 2d5bbbf07..4019cc00f 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -46,9 +46,7 @@ steps:
   fast_check: true
   commands:
   - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
-  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
-  - VLLM_ATTENTION_BACKEND=FLASHINFER pytest -v -s basic_correctness/test_basic_correctness.py
+  - pytest -v -s basic_correctness/test_basic_correctness.py
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index d3e74a4f8..ec7c2ba3e 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -28,10 +28,8 @@ def test_vllm_gc_ed():
     assert weak_llm() is None
 
 
-@pytest.mark.skipif(is_hip()
-                    and os.getenv("VLLM_ATTENTION_BACKEND") == "FLASHINFER",
-                    reason="Flashinfer does not support ROCm/HIP.")
 @pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False, True])
@@ -40,10 +38,17 @@ def test_models(
     vllm_runner,
     example_prompts,
     model: str,
+    backend: str,
     dtype: str,
     max_tokens: int,
     enforce_eager: bool,
 ) -> None:
+
+    if backend == "FLASHINFER" and is_hip():
+        pytest.skip("Flashinfer does not support ROCm/HIP.")
+
+    os.environ["VLLM_ATTENTION_BACKEND"] = backend
+
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index be19f4ba8..d309a2b27 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -27,7 +27,6 @@ from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig
 from vllm.distributed.parallel_state import (
     get_pp_group, get_tensor_model_parallel_world_size)
-from vllm.distributed.utils import get_pp_indices
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                QKVParallelLinear,
@@ -42,6 +41,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors, SamplerOutput
 
+from .utils import is_pp_missing_parameter, make_layers
+
 
 class GPT2Attention(nn.Module):
 
@@ -183,18 +184,9 @@ class GPT2Model(nn.Module):
         self.embed_dim = config.hidden_size
         self.wte = VocabParallelEmbedding(config.vocab_size, self.embed_dim)
         self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
-        self.start_layer, self.end_layer = get_pp_indices(
+        self.start_layer, self.end_layer, self.h = make_layers(
             config.num_hidden_layers,
-            get_pp_group().rank_in_group,
-            get_pp_group().world_size)
-        self.h = nn.ModuleList(
-            [nn.Identity() for _ in range(self.start_layer)] + [
-                GPT2Block(config, cache_config, quant_config)
-                for _ in range(self.start_layer, self.end_layer)
-            ] + [
-                nn.Identity()
-                for _ in range(self.end_layer, config.num_hidden_layers)
-            ])
+            lambda: GPT2Block(config, cache_config, quant_config))
         self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
 
     def forward(
@@ -291,19 +283,20 @@ class GPT2LMHeadModel(nn.Module):
                 continue
             if not name.startswith("transformer."):
                 name = "transformer." + name
-            try:
-                param = params_dict[name]
-                # The HF's GPT-2 implementation uses Conv1D instead of Linear.
-                # Because of this, we need to transpose the weights.
-                # Note(zhuohan): the logic below might break quantized models.
-                for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
-                    if conv1d_weight_name not in name:
-                        continue
-                    if not name.endswith(".weight"):
-                        continue
-                    loaded_weight = loaded_weight.t()
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param, loaded_weight)
-            except KeyError:
+
+            if is_pp_missing_parameter(name, self):
                 continue
+
+            param = params_dict[name]
+            # The HF's GPT-2 implementation uses Conv1D instead of Linear.
+            # Because of this, we need to transpose the weights.
+            # Note(zhuohan): the logic below might break quantized models.
+            for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
+                if conv1d_weight_name not in name:
+                    continue
+                if not name.endswith(".weight"):
+                    continue
+                loaded_weight = loaded_weight.t()
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 77edcd740..a777d1fbf 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -29,8 +29,7 @@ from transformers import LlamaConfig
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, LoRAConfig
-from vllm.distributed import (get_pp_group, get_pp_indices,
-                              get_tensor_model_parallel_rank,
+from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size)
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -51,6 +50,7 @@ from vllm.sequence import IntermediateTensors, SamplerOutput
 from vllm.utils import is_hip, print_warning_once
 
 from .interfaces import SupportsLoRA
+from .utils import is_pp_missing_parameter, make_layers
 
 
 class LlamaMLP(nn.Module):
@@ -262,20 +262,11 @@ class LlamaModel(nn.Module):
             config.hidden_size,
             org_num_embeddings=config.vocab_size,
         )
-        self.start_layer, self.end_layer = get_pp_indices(
+        self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            get_pp_group().rank_in_group,
-            get_pp_group().world_size)
-        self.layers = nn.ModuleList(
-            [nn.Identity() for _ in range(self.start_layer)] + [
-                LlamaDecoderLayer(config=config,
-                                  cache_config=cache_config,
-                                  quant_config=quant_config)
-                for _ in range(self.start_layer, self.end_layer)
-            ] + [
-                nn.Identity()
-                for _ in range(self.end_layer, config.num_hidden_layers)
-            ])
+            lambda: LlamaDecoderLayer(config=config,
+                                      cache_config=cache_config,
+                                      quant_config=quant_config))
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
@@ -455,12 +446,14 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
-                try:
-                    param = params_dict[name]
-                    weight_loader = param.weight_loader
-                    weight_loader(param, loaded_weight, shard_id)
-                except KeyError:
-                    pass
+
+                if is_pp_missing_parameter(name, self):
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+
                 break
             else:
                 # Skip loading extra bias for GPTQ models.
@@ -479,13 +472,14 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
                         continue
                     else:
                         name = remapped_kv_scale_name
-                try:
-                    param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
-                    weight_loader(param, loaded_weight)
-                except KeyError:
-                    pass
+
+                if is_pp_missing_parameter(name, self):
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
 
     # If this function is called, it should always initialize KV cache scale
     # factors (or else raise an exception). Thus, handled exceptions should
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index ef2562b07..a0d2a0286 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -1,3 +1,5 @@
+from typing import Callable, Dict, List, Tuple
+
 import torch
 
 from vllm.multimodal import BatchedTensors
@@ -39,3 +41,57 @@ def merge_vision_embeddings(input_ids: torch.Tensor,
         inputs_embeds[mask] = torch.cat(vision_embeddings)
 
     return inputs_embeds
+
+
+class PPMissingLayer(torch.nn.Identity):
+    """
+    A placeholder layer for missing layers in a pipeline parallel model.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+
+
+def make_layers(
+    num_hidden_layers: int, layer_fn: Callable[[], torch.nn.Module]
+) -> Tuple[int, int, torch.nn.ModuleList]:
+    """Make a list of layers with the given layer function, taking
+    pipeline parallelism into account.
+    """
+    from vllm.distributed.parallel_state import get_pp_group
+    from vllm.distributed.utils import get_pp_indices
+    start_layer, end_layer = get_pp_indices(num_hidden_layers,
+                                            get_pp_group().rank_in_group,
+                                            get_pp_group().world_size)
+    modules = torch.nn.ModuleList(
+        [PPMissingLayer() for _ in range(start_layer)] +
+        [layer_fn() for _ in range(start_layer, end_layer)] +
+        [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)])
+    return start_layer, end_layer, modules
+
+
+# NOTE: don't use lru_cache here because it can prevent garbage collection
+_model_to_pp_missing_layer_names: Dict[int, List[str]] = {}
+
+
+def get_pp_missing_layer_names(model: torch.nn.Module) -> List[str]:
+    """Get the names of the missing layers in a pipeline parallel model."""
+    model_id = id(model)
+    if model_id in _model_to_pp_missing_layer_names:
+        return _model_to_pp_missing_layer_names[model_id]
+
+    missing_layer_names = []
+    for name, module in model.named_modules():
+        if isinstance(module, PPMissingLayer):
+            missing_layer_names.append(name)
+    _model_to_pp_missing_layer_names[model_id] = missing_layer_names
+
+    return missing_layer_names
+
+
+def is_pp_missing_parameter(name: str, model: torch.nn.Module) -> bool:
+    """Check if a parameter is missing in a pipeline parallel model."""
+    for missing_layer_name in get_pp_missing_layer_names(model):
+        if name.startswith(missing_layer_name):
+            return True
+    return False
-- 
GitLab


From de199163140974876d4416e936934ad621b2eb0c Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 15 Jul 2024 13:39:15 +0800
Subject: [PATCH 367/376] [Bugfix] Convert image to RGB by default (#6430)

---
 vllm/multimodal/utils.py | 40 ++++++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index e55b8bbfd..8691a6134 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -35,8 +35,12 @@ def _load_image_from_data_url(image_url: str):
     return load_image_from_base64(image_base64)
 
 
-def fetch_image(image_url: str) -> Image.Image:
-    """Load PIL image from a url or base64 encoded openai GPT4V format"""
+def fetch_image(image_url: str, *, image_mode: str = "RGB") -> Image.Image:
+    """
+    Load a PIL image from a HTTP or base64 data URL.
+
+    By default, the image is converted into RGB format.
+    """
     if image_url.startswith('http'):
         _validate_remote_url(image_url, name="image_url")
 
@@ -53,7 +57,7 @@ def fetch_image(image_url: str) -> Image.Image:
         raise ValueError("Invalid 'image_url': A valid 'image_url' must start "
                          "with either 'data:image' or 'http'.")
 
-    return image
+    return image.convert(image_mode)
 
 
 class ImageFetchAiohttp:
@@ -70,8 +74,17 @@ class ImageFetchAiohttp:
         return cls.aiohttp_client
 
     @classmethod
-    async def fetch_image(cls, image_url: str) -> Image.Image:
-        """Load PIL image from a url or base64 encoded openai GPT4V format"""
+    async def fetch_image(
+        cls,
+        image_url: str,
+        *,
+        image_mode: str = "RGB",
+    ) -> Image.Image:
+        """
+        Asynchronously load a PIL image from a HTTP or base64 data URL.
+
+        By default, the image is converted into RGB format.
+        """
 
         if image_url.startswith('http'):
             _validate_remote_url(image_url, name="image_url")
@@ -91,7 +104,7 @@ class ImageFetchAiohttp:
                 "Invalid 'image_url': A valid 'image_url' must start "
                 "with either 'data:image' or 'http'.")
 
-        return image
+        return image.convert(image_mode)
 
 
 async def async_get_and_parse_image(image_url: str) -> MultiModalDataDict:
@@ -99,12 +112,19 @@ async def async_get_and_parse_image(image_url: str) -> MultiModalDataDict:
     return {"image": image}
 
 
-def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str:
-    """Encode a pillow image to base64 format."""
+def encode_image_base64(
+    image: Image.Image,
+    *,
+    image_mode: str = "RGB",
+    format: str = "JPEG",
+) -> str:
+    """
+    Encode a pillow image to base64 format.
 
+    By default, the image is converted into RGB format before being encoded.
+    """
     buffered = BytesIO()
-    if format == 'JPEG':
-        image = image.convert('RGB')
+    image = image.convert(image_mode)
     image.save(buffered, format)
     return base64.b64encode(buffered.getvalue()).decode('utf-8')
 
-- 
GitLab


From 22e79ee8f3930c39f40f6a1529e41594a607c6b4 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 14 Jul 2024 23:33:25 -0700
Subject: [PATCH 368/376] [doc][misc] doc update (#6439)

---
 docs/source/getting_started/debugging.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst
index 4cd34769e..0d03fe93a 100644
--- a/docs/source/getting_started/debugging.rst
+++ b/docs/source/getting_started/debugging.rst
@@ -50,6 +50,8 @@ Here are some common issues that can cause hangs:
     value = cpu_data.mean().item()
     assert value == world_size, f"Expected {world_size}, got {value}"
 
+    print("sanity check is successful!")
+
 .. tip::
 
     Save the script as ``test.py``.
@@ -62,4 +64,6 @@ Here are some common issues that can cause hangs:
     - is reachable from all nodes
     - is set before running the script.
 
+    If the script runs successfully, you should see the message ``sanity check is successful!``.
+
 If the problem persists, feel free to `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_, with a detailed description of the issue, your environment, and the logs.
-- 
GitLab


From 6ae1597ddf5ac51a6eae6b012c5399de22cb9d28 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Mon, 15 Jul 2024 02:29:51 -0700
Subject: [PATCH 369/376] [VLM] Minor space optimization for `ClipVisionModel`
 (#6436)

---
 vllm/model_executor/models/clip.py       | 46 +++++++++++++-----------
 vllm/model_executor/models/llava.py      | 16 ++++++---
 vllm/model_executor/models/llava_next.py | 16 ++++++---
 vllm/model_executor/models/phi3v.py      | 27 ++++++++------
 4 files changed, 66 insertions(+), 39 deletions(-)

diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index d8fbf796b..b4f628061 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -214,22 +214,24 @@ class CLIPEncoder(nn.Module):
 
     def __init__(self,
                  config: CLIPVisionConfig,
-                 quant_config: Optional[QuantizationConfig] = None):
+                 quant_config: Optional[QuantizationConfig] = None,
+                 num_hidden_layers_override: Optional[int] = None):
         super().__init__()
         self.config = config
+
+        if num_hidden_layers_override is None:
+            num_hidden_layers = config.num_hidden_layers
+        else:
+            num_hidden_layers = num_hidden_layers_override
         self.layers = nn.ModuleList([
             CLIPEncoderLayer(config=config, quant_config=quant_config)
-            for _ in range(config.num_hidden_layers)
+            for _ in range(num_hidden_layers)
         ])
 
-    def forward(self,
-                inputs_embeds: torch.Tensor,
-                vision_feature_layer: int = -1):
+    def forward(self, inputs_embeds: torch.Tensor):
 
-        # Encoder forward pass only up to the required layer
-        num_layer = len(self.layers) + vision_feature_layer + 1
         hidden_states = inputs_embeds
-        for encoder_layer in self.layers[:num_layer]:
+        for encoder_layer in self.layers:
             hidden_states = encoder_layer(hidden_states)
 
         return hidden_states
@@ -239,7 +241,8 @@ class CLIPVisionTransformer(nn.Module):
 
     def __init__(self,
                  config: CLIPVisionConfig,
-                 quant_config: Optional[QuantizationConfig] = None):
+                 quant_config: Optional[QuantizationConfig] = None,
+                 num_hidden_layers_override: Optional[int] = None):
         super().__init__()
         self.config = config
         embed_dim = config.hidden_size
@@ -249,18 +252,19 @@ class CLIPVisionTransformer(nn.Module):
         # NOTE: This typo of "layrnorm" is not fixed on purpose to match
         # the original transformers code and name of the model weights.
         self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
-        self.encoder = CLIPEncoder(config=config, quant_config=quant_config)
+        self.encoder = CLIPEncoder(
+            config=config,
+            quant_config=quant_config,
+            num_hidden_layers_override=num_hidden_layers_override)
 
     def forward(
         self,
         pixel_values: torch.Tensor,
-        vision_feature_layer: int = -1,
     ) -> torch.Tensor:
 
         hidden_states = self.embeddings(pixel_values)
         hidden_states = self.pre_layrnorm(hidden_states)
-        hidden_states = self.encoder(inputs_embeds=hidden_states,
-                                     vision_feature_layer=vision_feature_layer)
+        hidden_states = self.encoder(inputs_embeds=hidden_states)
 
         return hidden_states
 
@@ -272,17 +276,17 @@ class CLIPVisionModel(nn.Module):
 
     def __init__(self,
                  config: CLIPVisionConfig,
-                 quant_config: Optional[QuantizationConfig] = None):
+                 quant_config: Optional[QuantizationConfig] = None,
+                 num_hidden_layers_override: Optional[int] = None):
         super().__init__()
-        self.vision_model = CLIPVisionTransformer(config=config,
-                                                  quant_config=quant_config)
+        self.vision_model = CLIPVisionTransformer(
+            config=config,
+            quant_config=quant_config,
+            num_hidden_layers_override=num_hidden_layers_override)
 
-    def forward(self,
-                pixel_values: Optional[torch.Tensor] = None,
-                vision_feature_layer: int = -1):
+    def forward(self, pixel_values: Optional[torch.Tensor] = None):
 
-        return self.vision_model(pixel_values=pixel_values,
-                                 vision_feature_layer=vision_feature_layer)
+        return self.vision_model(pixel_values=pixel_values)
 
     @property
     def device(self):
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 250d39687..b5dddd519 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -128,8 +128,17 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
         self.config = config
         self.multimodal_config = multimodal_config
 
+        # Initialize the vision tower only up to the required feature layer
+        vision_feature_layer = config.vision_feature_layer
+        if vision_feature_layer < 0:
+            num_hidden_layers = config.vision_config.num_hidden_layers \
+                + vision_feature_layer + 1
+        else:
+            num_hidden_layers = vision_feature_layer + 1
+
         # TODO: Optionally initializes this for supporting embeddings.
-        self.vision_tower = CLIPVisionModel(config.vision_config)
+        self.vision_tower = CLIPVisionModel(
+            config.vision_config, num_hidden_layers_override=num_hidden_layers)
         self.multi_modal_projector = LlavaMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
             text_hidden_size=config.text_config.hidden_size,
@@ -193,8 +202,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
 
         # NOTE: we skip the step to select the vision feature layer since
         # this is already done inside the vision tower
-        image_features = vision_tower(pixel_values,
-                                      self.config.vision_feature_layer)
+        image_features = vision_tower(pixel_values)
 
         return self._select_image_features(
             image_features,
@@ -333,7 +341,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
                     break
                 else:
                     use_default_weight_loading = True
-            if use_default_weight_loading:
+            if use_default_weight_loading and name in params_dict:
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 9369ec89f..0c89eed88 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -222,8 +222,17 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
         self.config = config
         self.multimodal_config = multimodal_config
 
+        # Initialize the vision tower only up to the required feature layer
+        vision_feature_layer = config.vision_feature_layer
+        if vision_feature_layer < 0:
+            num_hidden_layers = config.vision_config.num_hidden_layers \
+                + vision_feature_layer + 1
+        else:
+            num_hidden_layers = vision_feature_layer + 1
+
         # TODO: Optionally initializes this for supporting embeddings.
-        self.vision_tower = CLIPVisionModel(config=config.vision_config)
+        self.vision_tower = CLIPVisionModel(
+            config.vision_config, num_hidden_layers_override=num_hidden_layers)
         self.multi_modal_projector = LlavaMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
             text_hidden_size=config.text_config.hidden_size,
@@ -312,8 +321,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
 
         # NOTE: we skip the step to select the vision feature layer since
         # this is already done inside the vision tower
-        image_features = vision_tower(pixel_values,
-                                      self.config.vision_feature_layer)
+        image_features = vision_tower(pixel_values)
 
         return self._select_image_features(
             image_features,
@@ -561,7 +569,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
                     break
                 else:
                     use_default_weight_loading = True
-            if use_default_weight_loading:
+            if use_default_weight_loading and name in params_dict:
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 1c6bd106b..8b2c42528 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -80,13 +80,11 @@ class Phi3ImageEmbeddingBase(nn.Module):
 
     def get_img_features(self,
                          img_embeds: torch.FloatTensor) -> torch.FloatTensor:
-        LAYER_IDX = self.layer_idx
         TYPE_FEATURE = self.type_feature
 
         # NOTE: we skip the step to select the vision feature layer since
         # this is already done inside the img_processor
-        img_feature = self.img_processor(img_embeds,
-                                         vision_feature_layer=LAYER_IDX)
+        img_feature = self.img_processor(img_embeds)
 
         if TYPE_FEATURE == "patch":
             patch_feature = img_feature[:, 1:]
@@ -111,7 +109,17 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
             config, 'n_embd') else config.hidden_size
 
         clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG
-        self.img_processor = CLIPVisionModel(clip_config)
+        self.layer_idx = config.img_processor.get('layer_idx', -2)
+
+        # Initialize the CLIP only up to the required feature layer
+        if self.layer_idx < 0:
+            num_hidden_layers = clip_config.num_hidden_layers + \
+                self.layer_idx + 1
+        else:
+            num_hidden_layers = self.layer_idx + 1
+
+        self.img_processor = CLIPVisionModel(
+            clip_config, num_hidden_layers_override=num_hidden_layers)
         image_dim_out = config.img_processor['image_dim_out']
         self.num_img_tokens = config.img_processor['num_img_tokens']
 
@@ -142,8 +150,6 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
         self.img_projection = nn.Sequential(*layers)
 
         self.vocab_size = config.vocab_size
-
-        self.layer_idx = config.img_processor.get('layer_idx', -2)
         self.type_feature = config.img_processor.get('type_feature', 'patch')
 
     def forward(self, input_ids: torch.LongTensor,
@@ -588,7 +594,8 @@ class Phi3VForCausalLM(nn.Module, SupportsVision):
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param, loaded_weight)
+                if name in params_dict:
+                    param = params_dict[name]
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+                    weight_loader(param, loaded_weight)
-- 
GitLab


From 94b82e8c18f0d38d85171cc8667f763c8078a835 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 15 Jul 2024 09:45:51 -0700
Subject: [PATCH 370/376] [doc][distributed] add suggestion for distributed
 inference (#6418)

---
 docs/source/serving/distributed_serving.rst | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst
index 3c58ed295..2dfb83f16 100644
--- a/docs/source/serving/distributed_serving.rst
+++ b/docs/source/serving/distributed_serving.rst
@@ -1,5 +1,21 @@
 .. _distributed_serving:
 
+How to decide the distributed inference strategy?
+=================================================
+
+Before going into the details of distributed inference and serving, let's first make it clear when to use distributed inference and what are the strategies available. The common practice is:
+
+- **Single GPU (no distributed inference)**: If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference.
+- **Single-Node Multi-GPU (tensor parallel inference)**: If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you have 4 GPUs in a single node, you can set the tensor parallel size to 4.
+- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2.
+
+In short, you should increase the number of GPUs and the number of nodes until you have enough GPU memory to hold the model. The tensor parallel size should be the number of GPUs in each node, and the pipeline parallel size should be the number of nodes.
+
+After adding enough GPUs and nodes to hold the model, you can run vLLM first, which will print some logs like ``# GPU blocks: 790``. Multiply the number by ``16`` (the block size), and you can get roughly the maximum number of tokens that can be served on the current configuration. If this number is not satisfying, e.g. you want higher throughput, you can further increase the number of GPUs or nodes, until the number of blocks is enough.
+
+.. note::
+    There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs.
+
 Distributed Inference and Serving
 =================================
 
-- 
GitLab


From c8fd97f26d05aff5a4603177c75aaccf4e6de11b Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Mon, 15 Jul 2024 13:05:52 -0400
Subject: [PATCH 371/376] [Kernel] Use CUTLASS kernels for the FP8 layers with
 Bias (#6270)

---
 vllm/model_executor/layers/quantization/utils/w8a8_utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index 30a82e1b5..f290a6830 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -112,7 +112,7 @@ def apply_fp8_linear(
     #   If dynamic, layer.input_scale is None and x_scale computed from x.
     #   If static, layer.input_scale is scalar and x_scale is input_scale.
 
-    if bias is None and cutlass_fp8_supported:
+    if cutlass_fp8_supported:
         qinput, x_scale = ops.scaled_fp8_quant(input, input_scale)
 
         # Fused GEMM_DQ
@@ -120,7 +120,8 @@ def apply_fp8_linear(
                                        weight,
                                        out_dtype=input.dtype,
                                        scale_a=x_scale,
-                                       scale_b=weight_scale)
+                                       scale_b=weight_scale,
+                                       bias=bias)
 
     else:
         qinput, x_scale = ops.scaled_fp8_quant(input,
-- 
GitLab


From a63a4c634174de7a6b966dd6996f9f8d2ae86827 Mon Sep 17 00:00:00 2001
From: Pernekhan Utemuratov <bestkhang@gmail.com>
Date: Mon, 15 Jul 2024 10:10:26 -0700
Subject: [PATCH 372/376] [Misc] Use 0.0.9 version for flashinfer (#6447)

Co-authored-by: Pernekhan Utemuratov <pernekhan@deepinfra.com>
---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 7fbc168ac..89d9be0e8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -172,7 +172,7 @@ RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamb
     python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir
 
 RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+    python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp310-cp310-linux_x86_64.whl
 #################### vLLM installation IMAGE ####################
 
 
-- 
GitLab


From eaec4b915347d839b0e99bcd57475730a715492c Mon Sep 17 00:00:00 2001
From: Thomas Parnell <tpa@zurich.ibm.com>
Date: Mon, 15 Jul 2024 19:12:47 +0200
Subject: [PATCH 373/376] [Bugfix] Add custom Triton cache manager to resolve
 MoE MP issue  (#6140)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Chih-Chieh-Yang <chih.chieh.yang@ibm.com>
---
 vllm/executor/multiproc_gpu_executor.py   |  5 +++
 vllm/triton_utils/__init__.py             |  6 +++
 vllm/triton_utils/custom_cache_manager.py | 53 +++++++++++++++++++++++
 3 files changed, 64 insertions(+)
 create mode 100644 vllm/triton_utils/__init__.py
 create mode 100644 vllm/triton_utils/custom_cache_manager.py

diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py
index dcde27973..a0e248b2e 100644
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/multiproc_gpu_executor.py
@@ -9,6 +9,7 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
                                                   ResultHandler, WorkerMonitor)
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.triton_utils import maybe_set_triton_cache_manager
 from vllm.utils import (cuda_device_count_stateless,
                         error_on_invalid_device_count_status,
                         get_distributed_init_method, get_open_port,
@@ -42,6 +43,10 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
         if "OMP_NUM_THREADS" not in os.environ:
             os.environ["OMP_NUM_THREADS"] = "1"
 
+        # workaround for https://github.com/vllm-project/vllm/issues/6103
+        if world_size > 1:
+            maybe_set_triton_cache_manager()
+
         assert world_size <= cuda_device_count_stateless(), (
             "please set tensor_parallel_size to less than max local gpu count")
 
diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py
new file mode 100644
index 000000000..09843e5d1
--- /dev/null
+++ b/vllm/triton_utils/__init__.py
@@ -0,0 +1,6 @@
+from vllm.triton_utils.custom_cache_manager import (
+    maybe_set_triton_cache_manager)
+
+__all__ = [
+    "maybe_set_triton_cache_manager",
+]
diff --git a/vllm/triton_utils/custom_cache_manager.py b/vllm/triton_utils/custom_cache_manager.py
new file mode 100644
index 000000000..17039d7ba
--- /dev/null
+++ b/vllm/triton_utils/custom_cache_manager.py
@@ -0,0 +1,53 @@
+import os
+
+from triton.runtime.cache import (FileCacheManager, default_cache_dir,
+                                  default_dump_dir, default_override_dir)
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def maybe_set_triton_cache_manager() -> None:
+    """Set environment variable to tell Triton to use a
+    custom cache manager"""
+    cache_manger = os.environ.get("TRITON_CACHE_MANAGER", None)
+    if cache_manger is None:
+        manager = "vllm.triton_utils.custom_cache_manager:CustomCacheManager"
+        logger.info("Setting Triton cache manager to: %s", manager)
+        os.environ["TRITON_CACHE_MANAGER"] = manager
+
+
+class CustomCacheManager(FileCacheManager):
+    """Re-implements Triton's cache manager, ensuring that a
+    unique cache directory is created for each process. This is
+    needed to avoid collisions when running with tp>1 and
+    using multi-processing as the distributed backend.
+
+    Note this issue was fixed by triton-lang/triton/pull/4295,
+    but the fix is not yet included in triton==v3.0.0. However,
+    it should be included in the subsequent version.
+    """
+
+    def __init__(self, key, override=False, dump=False):
+        self.key = key
+        self.lock_path = None
+        if dump:
+            self.cache_dir = default_dump_dir()
+            self.cache_dir = os.path.join(self.cache_dir, self.key)
+            self.lock_path = os.path.join(self.cache_dir, "lock")
+            os.makedirs(self.cache_dir, exist_ok=True)
+        elif override:
+            self.cache_dir = default_override_dir()
+            self.cache_dir = os.path.join(self.cache_dir, self.key)
+        else:
+            # create cache directory if it doesn't exist
+            self.cache_dir = os.getenv("TRITON_CACHE_DIR",
+                                       "").strip() or default_cache_dir()
+            if self.cache_dir:
+                self.cache_dir = f"{self.cache_dir}_{os.getpid()}"
+                self.cache_dir = os.path.join(self.cache_dir, self.key)
+                self.lock_path = os.path.join(self.cache_dir, "lock")
+                os.makedirs(self.cache_dir, exist_ok=True)
+            else:
+                raise RuntimeError("Could not create or locate cache dir")
-- 
GitLab


From 4ef95b0f0677f95d8181837bbeebca7fca5a2bb2 Mon Sep 17 00:00:00 2001
From: Thomas Parnell <tpa@zurich.ibm.com>
Date: Mon, 15 Jul 2024 19:14:49 +0200
Subject: [PATCH 374/376] [Bugfix] use float32 precision in
 samplers/test_logprobs.py for comparing with HF  (#6409)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
---
 tests/samplers/test_logprobs.py      | 3 ++-
 vllm/attention/ops/prefix_prefill.py | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py
index 02a953da0..f7bcd4c85 100644
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -11,7 +11,8 @@ MODELS = ["facebook/opt-125m"]
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype",
+                         ["float"])  # needed for comparing logprobs with HF
 @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1])
 @pytest.mark.parametrize("num_top_logprobs", [6])  # 32000 == vocab_size
 @pytest.mark.parametrize("detokenize", [True, False])
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index 70b544b60..4577d84db 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -687,6 +687,12 @@ if triton.__version__ >= "2.1.0":
 
         cap = current_platform.get_device_capability()
         BLOCK = 128 if cap[0] >= 8 else 64
+
+        # need to reduce num. blocks when using fp32
+        # due to increased use of GPU shared memory
+        if q.dtype is torch.float32:
+            BLOCK = BLOCK // 2
+
         # shape constraints
         Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
         assert Lq == Lk and Lk == Lv
-- 
GitLab


From 64fdc08c72f1ba923d7a4f76858fcad3551282a5 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Mon, 15 Jul 2024 10:27:40 -0700
Subject: [PATCH 375/376] bump version to v0.5.2 (#6433)

---
 vllm/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/version.py b/vllm/version.py
index 309f97954..94333a8fa 100644
--- a/vllm/version.py
+++ b/vllm/version.py
@@ -9,4 +9,4 @@ except Exception as e:
                   stacklevel=2)
     __commit__ = "COMMIT_HASH_PLACEHOLDER"
 
-__version__ = "0.5.1"
+__version__ = "0.5.2"
-- 
GitLab


From 4cf256ae7f8b0be8f06f6b85821e55d4f5bdaa13 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 15 Jul 2024 10:32:35 -0700
Subject: [PATCH 376/376] [misc][distributed] fix pp missing layer condition
 (#6446)

---
 vllm/model_executor/models/utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index a0d2a0286..c135b2035 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -83,7 +83,10 @@ def get_pp_missing_layer_names(model: torch.nn.Module) -> List[str]:
     missing_layer_names = []
     for name, module in model.named_modules():
         if isinstance(module, PPMissingLayer):
-            missing_layer_names.append(name)
+            # NOTE: the trailing dot is used to match the prefix of the layer.
+            # without the dot, we could match a layer that is not missing,
+            # e.g., 'encoder.layer.1' would match 'encoder.layer.11'
+            missing_layer_names.append(name + '.')
     _model_to_pp_missing_layer_names[model_id] = missing_layer_names
 
     return missing_layer_names
-- 
GitLab