[Bugfix] Fix OpenAI parallel sampling when using xgrammar (#11637)

Signed-off-by: mgoin <michael@neuralmagic.com>

[Bugfix] Fix OpenAI parallel sampling when using xgrammar (#11637)
Signed-off-by: mgoin <michael@neuralmagic.com>
74fa1d12 · Michael Goin · GitHub · a2a40bcd · 74fa1d12 · 74fa1d12
Unverified commit 74fa1d12, authored Dec 30, 2024 by Michael Goin; committed by GitHub Dec 31, 2024.
4 changed files
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -28,6 +28,8 @@ PA_NAME = "swapnilbp/llama_tweet_ptune"
 # need to change to match the prompt adapter
 PA_NUM_VIRTUAL_TOKENS = 8
+GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
 @pytest.fixture(scope="module")
 def zephyr_lora_files():
@@ -635,8 +637,7 @@ async def test_allowed_token_ids(client: openai.AsyncOpenAI):
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_json_completion(client: openai.AsyncOpenAI,
                                      guided_decoding_backend: str,
                                      sample_json_schema):
@@ -658,8 +659,7 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_regex_completion(client: openai.AsyncOpenAI,
                                       guided_decoding_backend: str,
                                       sample_regex):
@@ -680,8 +680,7 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_completion(client: openai.AsyncOpenAI,
                                        guided_decoding_backend: str,
                                        sample_guided_choice):
@@ -761,8 +760,7 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
                                          guided_decoding_backend: str,
                                          sample_json_schema, sample_regex):

--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
 # noqa: UP007
 from __future__ import annotations
+import copy
 import json
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any
@@ -309,3 +310,7 @@ class XGrammarLogitsProcessor:
            scores = scores.to(device_type).squeeze()
        return scores
+    def clone(self) -> XGrammarLogitsProcessor:
+        """Deepcopy due to per-sequence state in the matchers"""
+        return copy.deepcopy(self)
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -450,15 +450,16 @@ class SamplingParams(
        return self._all_stop_token_ids
    def clone(self) -> "SamplingParams":
-        """Deep copy excluding LogitsProcessor objects.
+        """Deep copy, but maybe not the LogitsProcessor objects.
-        LogitsProcessor objects are excluded because they may contain an
+        LogitsProcessor objects may contain an arbitrary, nontrivial amount of
-        arbitrary, nontrivial amount of data.
+        data that is expensive to copy. However, if not copied, the processor
+        needs to support parallel decoding for multiple sequences
        See https://github.com/vllm-project/vllm/issues/3087
        """
        logit_processor_refs = None if self.logits_processors is None else {
-            id(lp): lp
+            id(lp): lp.clone() if hasattr(lp, 'clone') else lp
            for lp in self.logits_processors
        }
        return copy.deepcopy(self, memo=logit_processor_refs)

--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -1372,7 +1372,7 @@ class ParallelSampleSequenceGroup(SequenceGroupBase):
    @staticmethod
    def add_request(request_id: str, engine, params, **kwargs):
        original_params = params
-        params = copy.deepcopy(original_params)
+        params = original_params.clone()
        params.n = 1
        group = ParallelSampleSequenceGroup(request_id)
        seqs = []