[Bugfix] Enforce no chunked prefill for embedding models (#10470)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[Bugfix] Enforce no chunked prefill for embedding models (#10470)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
b4be5a8a · Cyrus Leung · GitHub · ad44437b · b4be5a8a · b4be5a8a
Unverified commit b4be5a8a, authored Nov 20, 2024 by Cyrus Leung; committed by GitHub on Nov 20, 2024.
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 60 additions and 15 deletions

docs/source/serving/compatibility_matrix.rst docs/source/serving/compatibility_matrix.rst +55 -14

vllm/engine/arg_utils.py vllm/engine/arg_utils.py +5 -1

No files found.
--- a/docs/source/serving/compatibility_matrix.rst
+++ b/docs/source/serving/compatibility_matrix.rst
@@ -39,12 +39,13 @@ Feature x Feature
     - :abbr:`prmpt adptr (Prompt Adapter)`
     - :ref:`SD <spec_decode>`
     - CUDA graph
+     - :abbr:`emd (Embedding Models)`
     - :abbr:`enc-dec (Encoder-Decoder Models)`
     - :abbr:`logP (Logprobs)`
     - :abbr:`prmpt logP (Prompt Logprobs)`
     - :abbr:`async output (Async Output Processing)`
     - multi-step
-     - :abbr:`MM (Multimodal)`
+     - :abbr:`mm (Multimodal)`
     - best-of
     - beam-search
     - :abbr:`guided dec (Guided Decoding)`
@@ -64,6 +65,7 @@ Feature x Feature
     - 
     - 
     - 
+     - 
   * - :ref:`APC <apc>`
     - ✅
     - 
@@ -80,6 +82,7 @@ Feature x Feature
     - 
     - 
     - 
+     - 
   * - :ref:`LoRA <lora>`
     - `✗ <https://github.com/vllm-project/vllm/pull/9057>`__ 
     - ✅
@@ -96,6 +99,7 @@ Feature x Feature
     - 
     - 
     - 
+     - 
   * - :abbr:`prmpt adptr (Prompt Adapter)`
     - ✅
     - ✅
@@ -112,6 +116,7 @@ Feature x Feature
     - 
     - 
     - 
+     - 
   * - :ref:`SD <spec_decode>`
     - ✗
     - ✅
@@ -128,6 +133,7 @@ Feature x Feature
     - 
     - 
     - 
+     - 
   * - CUDA graph
     - ✅
     - ✅
@@ -144,6 +150,24 @@ Feature x Feature
     - 
     - 
     - 
+     - 
+   * - :abbr:`emd (Embedding Models)`
+     - ✗
+     - ✗
+     - ✗ 
+     - ✗
+     - ✗
+     - ✗
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
   * - :abbr:`enc-dec (Encoder-Decoder Models)`
     - ✗
     - `✗ <https://github.com/vllm-project/vllm/issues/7366>`__ 
@@ -151,6 +175,7 @@ Feature x Feature
     - ✗
     - `✗ <https://github.com/vllm-project/vllm/issues/7366>`__ 
     - ✅
+     - ✅
     - 
     - 
     - 
@@ -167,6 +192,7 @@ Feature x Feature
     - ✅
     - ✅
     - ✅ 
+     - ✗
     - ✅
     - 
     - 
@@ -183,6 +209,7 @@ Feature x Feature
     - ✅
     - `✗ <https://github.com/vllm-project/vllm/pull/8199>`__ 
     - ✅
+     - ✗
     - ✅ 
     - ✅
     - 
@@ -200,6 +227,7 @@ Feature x Feature
     - ✗
     - ✅ 
     - ✗ 
+     - ✗
     - ✅
     - ✅
     - 
@@ -216,6 +244,7 @@ Feature x Feature
     - ✗
     - ✅
     - ✗ 
+     - ✗
     - ✅
     - `✗ <https://github.com/vllm-project/vllm/issues/8198>`__ 
     - ✅
@@ -224,14 +253,15 @@ Feature x Feature
     - 
     - 
     - 
-   * - :abbr:`MM (Multimodal)`
+   * - :abbr:`mm (Multimodal)`
-     -  `✗ <https://github.com/vllm-project/vllm/pull/8346>`__ 
+     - ✅
     -  `✗ <https://github.com/vllm-project/vllm/pull/8348>`__ 
     -  `✗ <https://github.com/vllm-project/vllm/pull/7199>`__ 
     - ?
     - ?
     - ✅
-     - ✗
+     - ✅
+     - ✅
     - ✅
     - ✅
     - ✅
@@ -247,6 +277,7 @@ Feature x Feature
     - ✅
     - `✗ <https://github.com/vllm-project/vllm/issues/6137>`__ 
     - ✅
+     - ✗
     - ✅
     - ✅
     - ✅
@@ -263,6 +294,7 @@ Feature x Feature
     - ✅
     - `✗ <https://github.com/vllm-project/vllm/issues/6137>`__ 
     - ✅
+     - ✗
     - ✅
     - ✅
     - ✅
@@ -279,6 +311,7 @@ Feature x Feature
     - ?
     - ✅
     - ✅
+     - ✗
     - ?
     - ✅
     - ✅
@@ -353,6 +386,14 @@ Feature x Hardware
     - ✅
     - ✗
     - ✅
+   * - :abbr:`emd (Embedding Models)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✗
   * - :abbr:`enc-dec (Encoder-Decoder Models)`
     - ✅
     - ✅
@@ -361,7 +402,7 @@ Feature x Hardware
     - ✅
     - ✅
     - ✗
-   * - :abbr:`logP (Logprobs)`
+   * - :abbr:`mm (Multimodal)`
     - ✅
     - ✅
     - ✅
@@ -369,7 +410,7 @@ Feature x Hardware
     - ✅
     - ✅
     - ✅
-   * - :abbr:`prmpt logP (Prompt Logprobs)`
+   * - :abbr:`logP (Logprobs)`
     - ✅
     - ✅
     - ✅
@@ -377,29 +418,29 @@ Feature x Hardware
     - ✅
     - ✅
     - ✅
-   * - :abbr:`async output (Async Output Processing)`
+   * - :abbr:`prmpt logP (Prompt Logprobs)`
     - ✅
     - ✅
     - ✅
     - ✅
     - ✅
-     - ✗
-     - ✗
-   * - multi-step
     - ✅
     - ✅
+   * - :abbr:`async output (Async Output Processing)`
     - ✅
     - ✅
     - ✅
-     - `✗ <https://github.com/vllm-project/vllm/issues/8477>`__ 
     - ✅
-   * - :abbr:`MM (Multimodal)`
     - ✅
+     - ✗
+     - ✗
+   * - multi-step
     - ✅
     - ✅
     - ✅
     - ✅
     - ✅
+     - `✗ <https://github.com/vllm-project/vllm/issues/8477>`__ 
     - ✅
   * - best-of
     - ✅

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1014,7 +1014,8 @@ class EngineArgs:
                use_spec_decode = self.speculative_model is not None
                if (is_gpu and not use_sliding_window and not use_spec_decode
                        and not self.enable_lora
-                        and not self.enable_prompt_adapter):
+                        and not self.enable_prompt_adapter
+                        and model_config.task != "embedding"):
                    self.enable_chunked_prefill = True
                    logger.warning(
                        "Chunked prefill is enabled by default for models with "
@@ -1031,6 +1032,9 @@ class EngineArgs:
                "errors during the initial memory profiling phase, or result "
                "in low performance due to small KV cache space. Consider "
                "setting --max-model-len to a smaller value.", max_model_len)
+        elif self.enable_chunked_prefill and model_config.task == "embedding":
+            msg = "Chunked prefill is not supported for embedding models"
+            raise ValueError(msg)
        speculative_config = SpeculativeConfig.maybe_create_spec_config(
            target_model_config=model_config,