Unverified Commit 9935f97b authored by havetc, committed by GitHub

[FEAT] JSON constrained support (#1125)


Co-authored-by: Yineng Zhang <me@zhyncs.com>
parent c5fe11a8
@@ -60,6 +60,9 @@ spaces_between_special_tokens: bool = True,
 regex: Optional[str] = None,
 # Do parallel sampling and return `n` outputs.
 n: int = 1,
+# Constrains the output to follow a given JSON schema.
+# `regex` and `json_schema` cannot be set at the same time.
+json_schema: Optional[str] = None,
 ## Penalties. See [Performance Implications on Penalties] section below for more information.
...
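For illustration, a minimal client-side sketch of passing the new parameter to a running server through the native /generate endpoint (the URL and prompt are placeholders; the request shape mirrors the test added by this commit):

```python
import json

import requests

# Placeholder URL; point this at your running sglang server.
schema = json.dumps(
    {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "population": {"type": "integer"},
        },
        "required": ["name", "population"],
    }
)
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "Give me information about Paris.",
        "sampling_params": {
            "max_new_tokens": 128,
            # Mutually exclusive with `regex`.
            "json_schema": schema,
        },
    },
)
print(response.json()["text"])  # should parse under the schema above
```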
@@ -15,6 +15,8 @@ limitations under the License.
 """Cache for the compressed finite state machine."""

+from outlines.fsm.json_schema import build_regex_from_schema
+
 from sglang.srt.constrained import RegexGuide, TransformerTokenizer
 from sglang.srt.constrained.base_tool_cache import BaseToolCache

@@ -26,9 +28,12 @@ class FSMCache(BaseToolCache):
         tokenizer_args_dict,
         enable=True,
         skip_tokenizer_init=False,
+        json_schema_mode=False,
     ):
         super().__init__(enable=enable)
+        self.json_schema_mode = json_schema_mode
+
         if (
             skip_tokenizer_init
             or tokenizer_path.endswith(".json")

@@ -72,5 +77,9 @@ class FSMCache(BaseToolCache):
             tokenizer_path, **tokenizer_args_dict
         )

-    def init_value(self, regex):
-        return RegexGuide(regex, self.outlines_tokenizer)
+    def init_value(self, value):
+        if self.json_schema_mode:
+            regex = build_regex_from_schema(value)
+            return RegexGuide(regex, self.outlines_tokenizer), regex
+        else:
+            return RegexGuide(value, self.outlines_tokenizer)
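Note that json_schema_mode does not introduce a separate constraint engine: the schema is lowered to an equivalent regular expression by outlines and then drives the same RegexGuide as a plain regex constraint. A small sketch of that lowering step (schema contents are illustrative):

```python
import json

from outlines.fsm.json_schema import build_regex_from_schema

schema = json.dumps(
    {
        "type": "object",
        "properties": {"population": {"type": "integer"}},
        "required": ["population"],
    }
)
# Lower the JSON schema to a regular expression that accepts exactly
# the JSON documents satisfying it.
regex = build_regex_from_schema(schema)
print(regex)
```

Returning the computed regex alongside the guide lets the caller hand it to the jump-forward cache, which the scheduler change below relies on.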
@@ -23,6 +23,7 @@ from collections import defaultdict
 import interegular
 import outlines.caching
+from outlines.fsm.json_schema import build_regex_from_schema

 from sglang.srt.constrained import (
     FSMInfo,
...
@@ -268,7 +268,14 @@ class Req:
         all_text = self.origin_input_text + self.decoded_text + jump_forward_str
         all_ids = self.tokenizer.encode(all_text)
+        if not all_ids:
+            warnings.warn("Encoded all_text resulted in empty all_ids")
+            return False
+
         prompt_tokens = len(self.origin_input_ids_unpadded)
+        if prompt_tokens > len(all_ids):
+            warnings.warn("prompt_tokens is larger than encoded all_ids")
+            return False
+
         if all_ids[prompt_tokens - 1] != self.origin_input_ids_unpadded[-1]:
             # TODO(lsyin): fix token fusion
...
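The two new guards defend against degenerate re-tokenizations before the pre-existing token-fusion check runs. A standalone sketch of the combined predicate (hypothetical helper; names and messages mirror the diff):

```python
import warnings
from typing import List


def jump_forward_is_safe(all_ids: List[int], prompt_ids: List[int]) -> bool:
    """Hypothetical helper mirroring the new guards in Req: refuse a
    jump-forward when re-encoding produced no tokens, produced fewer
    tokens than the prompt, or fused a token across the prompt/output
    boundary (the TODO case already present)."""
    if not all_ids:
        warnings.warn("Encoded all_text resulted in empty all_ids")
        return False
    prompt_tokens = len(prompt_ids)
    if prompt_tokens > len(all_ids):
        warnings.warn("prompt_tokens is larger than encoded all_ids")
        return False
    # Token fusion: the last prompt token must survive re-encoding of
    # prompt + decoded text + jump_forward_str unchanged.
    return all_ids[prompt_tokens - 1] == prompt_ids[-1]
```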
@@ -197,6 +197,16 @@ class ModelTpServer:
                 "trust_remote_code": server_args.trust_remote_code,
             },
             skip_tokenizer_init=server_args.skip_tokenizer_init,
+            json_schema_mode=False,
+        )
+        self.json_fsm_cache = FSMCache(
+            server_args.tokenizer_path,
+            {
+                "tokenizer_mode": server_args.tokenizer_mode,
+                "trust_remote_code": server_args.trust_remote_code,
+            },
+            skip_tokenizer_init=server_args.skip_tokenizer_init,
+            json_schema_mode=True,
         )
         self.jump_forward_cache = JumpForwardCache()
@@ -349,8 +359,17 @@ class ModelTpServer:
         req.top_logprobs_num = recv_req.top_logprobs_num
         req.stream = recv_req.stream

+        # Init regex fsm from json
+        if req.sampling_params.json_schema is not None:
+            req.regex_fsm, computed_regex_string = self.json_fsm_cache.query(
+                req.sampling_params.json_schema
+            )
+            if not self.disable_regex_jump_forward:
+                req.jump_forward_map = self.jump_forward_cache.query(
+                    computed_regex_string
+                )
         # Init regex fsm
-        if req.sampling_params.regex is not None:
+        elif req.sampling_params.regex is not None:
             req.regex_fsm = self.regex_fsm_cache.query(req.sampling_params.regex)
             if not self.disable_regex_jump_forward:
                 req.jump_forward_map = self.jump_forward_cache.query(
...
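Both caches are instances of the same FSMCache class, differing only in json_schema_mode, so schema compilations and plain-regex compilations are memoized independently of each other. A toy stand-in for the assumed caching behavior behind query():

```python
from typing import Callable, Dict


class MiniToolCache:
    """Toy stand-in for BaseToolCache (behavior assumed, not the real
    implementation): memoize an expensive init_value() keyed by the raw
    query string, so repeated requests with the same schema or regex
    reuse the compiled guide instead of rebuilding the FSM."""

    def __init__(self, init_value: Callable[[str], object]):
        self._init_value = init_value
        self._cache: Dict[str, object] = {}

    def query(self, key: str) -> object:
        if key not in self._cache:
            self._cache[key] = self._init_value(key)
        return self._cache[key]
```

Since SamplingParams rejects requests that set both fields, the if/elif above is pure precedence bookkeeping rather than a silent override.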
@@ -434,6 +434,7 @@ def v1_generate_request(all_requests: List[CompletionRequest]):
             "frequency_penalty": request.frequency_penalty,
             "repetition_penalty": request.repetition_penalty,
             "regex": request.regex,
+            "json_schema": request.json_schema,
             "n": request.n,
             "ignore_eos": request.ignore_eos,
         }

@@ -802,6 +803,7 @@ def v1_chat_generate_request(
                 "frequency_penalty": request.frequency_penalty,
                 "repetition_penalty": request.repetition_penalty,
                 "regex": request.regex,
+                "json_schema": request.json_schema,
                 "n": request.n,
             }
         )
...
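Because the OpenAI-compatible adapter now forwards the field, the constraint can also be supplied through /v1/completions or /v1/chat/completions. A sketch against the completions endpoint (URL, model name, and API key are placeholders):

```python
import requests

response = requests.post(
    "http://localhost:30000/v1/completions",  # placeholder server URL
    headers={"Authorization": "Bearer sk-123456"},  # placeholder key
    json={
        "model": "default",
        "prompt": "The capital of France is",
        "max_tokens": 64,
        # SRT-only extra parameter; an OpenAI model would ignore it.
        "json_schema": '{"type": "object", "properties": {"name": {"type": "string"}}, "required": ["name"]}',
    },
)
print(response.json()["choices"][0]["text"])
```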
@@ -161,6 +161,7 @@ class CompletionRequest(BaseModel):
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     regex: Optional[str] = None
+    json_schema: Optional[str] = None
     ignore_eos: Optional[bool] = False
     min_tokens: Optional[int] = 0
     repetition_penalty: Optional[float] = 1.0

@@ -262,6 +263,7 @@ class ChatCompletionRequest(BaseModel):
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     regex: Optional[str] = None
+    json_schema: Optional[str] = None
     min_tokens: Optional[int] = 0
     repetition_penalty: Optional[float] = 1.0
     stop_token_ids: Optional[List[int]] = Field(default_factory=list)
...
@@ -39,6 +39,7 @@ class SamplingParams:
         spaces_between_special_tokens: bool = True,
         regex: Optional[str] = None,
         n: int = 1,
+        json_schema: Optional[str] = None,
     ) -> None:
         self.temperature = temperature
         self.top_p = top_p

@@ -56,6 +57,7 @@ class SamplingParams:
         self.spaces_between_special_tokens = spaces_between_special_tokens
         self.regex = regex
         self.n = n
+        self.json_schema = json_schema

         # Process some special cases
         if self.temperature < _SAMPLING_EPS:

@@ -106,6 +108,8 @@ class SamplingParams:
                 f"min_new_tokens must be in (0, max_new_tokens({self.max_new_tokens})], got "
                 f"{self.min_new_tokens}."
             )
+        if self.regex is not None and self.json_schema is not None:
+            raise ValueError("regex and json_schema cannot both be set.")

     def normalize(self, tokenizer):
         # Process stop strings
...
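The new mutual-exclusion check sits among the other request validations; judging by the surrounding hunk context, it lives in SamplingParams.verify(), which the server invokes per request. A hedged demo, assuming the import path shown:

```python
from sglang.srt.sampling_params import SamplingParams  # import path assumed

params = SamplingParams(regex=r"\d+", json_schema='{"type": "object"}')
try:
    params.verify()  # the validation hunk above appears to live here
except ValueError as err:
    print(err)  # regex and json_schema cannot both be set.
```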
@@ -13,6 +13,7 @@ suites = {
     "test_eval_accuracy_mini.py",
     "test_large_max_new_tokens.py",
     "test_openai_server.py",
+    "test_json_constrained.py",
     "test_skip_tokenizer_init.py",
     "test_torch_compile.py",
     "test_triton_attn_backend.py",
...
New file: test_json_constrained.py

import json
import unittest

import openai
import requests

from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestJSONConstrained(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.json_schema = json.dumps(
            {
                "type": "object",
                "properties": {
                    "name": {"type": "string", "pattern": "^[\\w]+$"},
                    "population": {"type": "integer"},
                },
                "required": ["name", "population"],
            }
        )
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=300, api_key=cls.api_key
        )

    @classmethod
    def tearDownClass(cls):
        kill_child_process(cls.process.pid)

    def run_decode(self, return_logprob=False, top_logprobs_num=0, n=1):
        headers = {"Authorization": f"Bearer {self.api_key}"}
        response = requests.post(
            self.base_url + "/generate",
            json={
                "text": "The capital of France is",
                "sampling_params": {
                    "temperature": 0 if n == 1 else 0.5,
                    "max_new_tokens": 128,
                    "n": n,
                    "stop_token_ids": [119690],
                    "json_schema": self.json_schema,
                },
                "stream": False,
                "return_logprob": return_logprob,
                "top_logprobs_num": top_logprobs_num,
                "logprob_start_len": 0,
            },
            headers=headers,
        )
        print(json.dumps(response.json()))
        print("=" * 100)
        try:
            js_obj = json.loads(response.json()["text"])
        except (TypeError, json.decoder.JSONDecodeError):
            print("JSONDecodeError", response.json()["text"])
            raise
        assert isinstance(js_obj["name"], str)
        assert isinstance(js_obj["population"], int)

    def test_json_generate(self):
        self.run_decode()

    def test_json_openai(self):
        client = openai.Client(api_key=self.api_key, base_url=f"{self.base_url}/v1")
        response = client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are a helpful AI assistant"},
                {"role": "user", "content": "Introduce the capital of France."},
            ],
            temperature=0,
            max_tokens=128,
            extra_body={"json_schema": self.json_schema},
        )
        text = response.choices[0].message.content
        try:
            js_obj = json.loads(text)
        except (TypeError, json.decoder.JSONDecodeError):
            print("JSONDecodeError", text)
            raise
        assert isinstance(js_obj["name"], str)
        assert isinstance(js_obj["population"], int)


if __name__ == "__main__":
    unittest.main()