Unverified commit a95d5589, authored by Gleb Drozdov, committed by GitHub

Add matched_stop (token id or str) to distinguish between EOS and stop-string finish_reason in generation (#1684)
parent d17d19e5
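
Previously, a finish_reason of "stop" did not say whether generation ended on the EOS token or on a user-supplied stop string. The diff below threads the scheduler's finish_reason metadata through every OpenAI-compatible response path and surfaces it as a new matched_stop field. As a rough sketch, the resulting choice payloads look like this (values mirror the tests added at the bottom of this commit, not captured from a live server):

# Illustrative choice payloads only; values follow the tests added below.
choice_stop_str = {"finish_reason": "stop", "matched_stop": "\n"}      # stopped on a stop string
choice_eos = {"finish_reason": "stop", "matched_stop": 128009}         # stopped on the EOS token id
choice_length = {"finish_reason": "length", "matched_stop": None}      # truncated by max_tokens
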
@@ -621,16 +621,19 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
         else:
             logprobs = None
 
+        finish_reason = ret_item["meta_info"]["finish_reason"]
+
         if to_file:
             # to make the choice data json serializable
             choice_data = {
                 "index": 0,
                 "text": text,
                 "logprobs": logprobs,
-                "finish_reason": (
-                    ret_item["meta_info"]["finish_reason"]["type"]
-                    if ret_item["meta_info"]["finish_reason"]
-                    else ""
+                "finish_reason": (finish_reason["type"] if finish_reason else ""),
+                "matched_stop": (
+                    finish_reason["matched"]
+                    if finish_reason and "matched" in finish_reason
+                    else None
                 ),
             }
         else:
@@ -638,10 +641,11 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
                 index=idx,
                 text=text,
                 logprobs=logprobs,
-                finish_reason=(
-                    ret_item["meta_info"]["finish_reason"]["type"]
-                    if ret_item["meta_info"]["finish_reason"]
-                    else ""
+                finish_reason=(finish_reason["type"] if finish_reason else ""),
+                matched_stop=(
+                    finish_reason["matched"]
+                    if finish_reason and "matched" in finish_reason
+                    else None
                 ),
             )
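
The adapter only ever reads two keys from the finish_reason metadata, so the dict shape it expects can be inferred from the accesses above. A hedged sketch of that shape (the scheduler populates it; only the keys the adapter touches are shown, and "matched" is absent for length-capped requests):

# Inferred shape of ret_item["meta_info"]["finish_reason"]:
finish_reason = {"type": "stop", "matched": "\n"}    # matched a user-supplied stop string
finish_reason = {"type": "stop", "matched": 128009}  # matched the EOS token id
finish_reason = {"type": "length"}                   # no "matched" key -> matched_stop is None
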
@@ -771,14 +775,16 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                 delta = text[len(stream_buffer) :]
                 stream_buffer = stream_buffer + delta
+                finish_reason = content["meta_info"]["finish_reason"]
                 choice_data = CompletionResponseStreamChoice(
                     index=index,
                     text=delta,
                     logprobs=logprobs,
-                    finish_reason=(
-                        content["meta_info"]["finish_reason"]["type"]
-                        if content["meta_info"]["finish_reason"]
-                        else ""
+                    finish_reason=(finish_reason["type"] if finish_reason else ""),
+                    matched_stop=(
+                        finish_reason["matched"]
+                        if finish_reason and "matched" in finish_reason
+                        else None
                     ),
                 )
                 chunk = CompletionStreamResponse(
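
On the streaming path, matched_stop rides along on whichever chunk carries the finish_reason. A minimal client sketch, assuming an SGLang server on the default http://localhost:30000 and standard SSE "data:" framing (the endpoint port and model name are placeholders):

import json

import requests

resp = requests.post(
    "http://localhost:30000/v1/completions",
    json={"model": "default", "prompt": "Count to ten:", "max_tokens": 64,
          "stop": "\n", "stream": True},
    stream=True,
)
for line in resp.iter_lines():
    if not line.startswith(b"data: "):
        continue
    payload = line[len(b"data: "):]
    if payload == b"[DONE]":
        break
    choice = json.loads(payload)["choices"][0]
    # matched_stop is populated on the chunk that carries the finish_reason.
    if choice.get("finish_reason"):
        print(choice["finish_reason"], repr(choice.get("matched_stop")))
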
@@ -1016,16 +1022,19 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):
         else:
             choice_logprobs = None
 
+        finish_reason = ret_item["meta_info"]["finish_reason"]
+
         if to_file:
             # to make the choice data json serializable
             choice_data = {
                 "index": 0,
                 "message": {"role": "assistant", "content": ret_item["text"]},
                 "logprobs": choice_logprobs,
-                "finish_reason": (
-                    ret_item["meta_info"]["finish_reason"]["type"]
-                    if ret_item["meta_info"]["finish_reason"]
-                    else ""
+                "finish_reason": (finish_reason["type"] if finish_reason else ""),
+                "matched_stop": (
+                    finish_reason["matched"]
+                    if finish_reason and "matched" in finish_reason
+                    else None
                 ),
             }
         else:
@@ -1033,10 +1042,11 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):
                 index=idx,
                 message=ChatMessage(role="assistant", content=ret_item["text"]),
                 logprobs=choice_logprobs,
-                finish_reason=(
-                    ret_item["meta_info"]["finish_reason"]["type"]
-                    if ret_item["meta_info"]["finish_reason"]
-                    else ""
+                finish_reason=(finish_reason["type"] if finish_reason else ""),
+                matched_stop=(
+                    finish_reason["matched"]
+                    if finish_reason and "matched" in finish_reason
+                    else None
                 ),
             )
@@ -1159,6 +1169,8 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                 else:
                     choice_logprobs = None
 
+                finish_reason = content["meta_info"]["finish_reason"]
+
                 if is_first:
                     # First chunk with role
                     is_first = False
@@ -1166,9 +1178,12 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                         index=index,
                         delta=DeltaMessage(role="assistant"),
                         finish_reason=(
-                            content["meta_info"]["finish_reason"]["type"]
-                            if content["meta_info"]["finish_reason"]
-                            else ""
+                            finish_reason["type"] if finish_reason else ""
                         ),
+                        matched_stop=(
+                            finish_reason["matched"]
+                            if finish_reason and "matched" in finish_reason
+                            else None
+                        ),
                         logprobs=choice_logprobs,
                     )
@@ -1185,10 +1200,11 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
                         delta=DeltaMessage(content=delta),
-                        finish_reason=(
-                            content["meta_info"]["finish_reason"]["type"]
-                            if content["meta_info"]["finish_reason"]
-                            else ""
+                        finish_reason=(finish_reason["type"] if finish_reason else ""),
+                        matched_stop=(
+                            finish_reason["matched"]
+                            if finish_reason and "matched" in finish_reason
+                            else None
                         ),
                         logprobs=choice_logprobs,
                     )
@@ -184,6 +184,7 @@ class CompletionResponseChoice(BaseModel):
     text: str
     logprobs: Optional[LogProbs] = None
     finish_reason: Optional[str] = None
+    matched_stop: Union[None, int, str] = None
 
 
 class CompletionResponse(BaseModel):
@@ -200,6 +201,7 @@ class CompletionResponseStreamChoice(BaseModel):
     text: str
     logprobs: Optional[LogProbs] = None
     finish_reason: Optional[str] = None
+    matched_stop: Union[None, int, str] = None
 
 
 class CompletionStreamResponse(BaseModel):
@@ -291,6 +293,7 @@ class ChatCompletionResponseChoice(BaseModel):
     message: ChatMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
     finish_reason: str
+    matched_stop: Union[None, int, str] = None
 
 
 class ChatCompletionResponse(BaseModel):
@@ -312,6 +315,7 @@ class ChatCompletionResponseStreamChoice(BaseModel):
     delta: DeltaMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
     finish_reason: Optional[str] = None
+    matched_stop: Union[None, int, str] = None
 
 
 class ChatCompletionStreamResponse(BaseModel):
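
Union[None, int, str] lets one field carry either an EOS token id or the literal stop string while defaulting to null in JSON. A standalone sketch of just the extended choice model, trimmed to the fields relevant here (Pydantic v2 assumed; on v1, use .json() instead of .model_dump_json()):

from typing import Optional, Union

from pydantic import BaseModel


class CompletionResponseChoice(BaseModel):
    # Trimmed sketch; the real model also carries logprobs.
    index: int
    text: str
    finish_reason: Optional[str] = None
    matched_stop: Union[None, int, str] = None  # token id, stop string, or null


choice = CompletionResponseChoice(index=0, text="4", finish_reason="stop", matched_stop=128009)
print(choice.model_dump_json())
# {"index":0,"text":"4","finish_reason":"stop","matched_stop":128009}
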
New test file added by this commit:

import json
import unittest

import requests

from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)

MANY_NEW_TOKENS_PROMPT = """
Please write an extremely detailed and vivid fantasy story, set in a world full of intricate magic systems, political intrigue, and complex characters.
Ensure that you thoroughly describe every scene, character's motivations, and the environment. Include long, engaging dialogues and elaborate on the inner thoughts of the characters.
Each section should be as comprehensive as possible to create a rich and immersive experience for the reader.
The story should span multiple events, challenges, and character developments over time. Aim to make the story at least 3,000 words long.
"""


class TestMatchedStop(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=300,
            other_args=["--max-running-requests", "10"],
        )

    @classmethod
    def tearDownClass(cls):
        kill_child_process(cls.process.pid)

    def run_completions_generation(
        self,
        prompt=MANY_NEW_TOKENS_PROMPT,
        max_tokens=1,
        stop=None,
        finish_reason=None,
        matched_stop=None,
    ):
        payload = {
            "prompt": prompt,
            "model": self.model,
            "temperature": 0,
            "top_p": 1,
            "max_tokens": max_tokens,
        }
        if stop is not None:
            payload["stop"] = stop

        response_completions = requests.post(
            self.base_url + "/v1/completions",
            json=payload,
        )
        print(json.dumps(response_completions.json()))
        print("=" * 100)

        assert (
            response_completions.json()["choices"][0]["finish_reason"] == finish_reason
        )
        assert response_completions.json()["choices"][0]["matched_stop"] == matched_stop

    def run_chat_completions_generation(
        self,
        prompt=MANY_NEW_TOKENS_PROMPT,
        max_tokens=1,
        stop=None,
        finish_reason=None,
        matched_stop=None,
    ):
        chat_payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": "You are a helpful AI assistant"},
                {"role": "user", "content": prompt},
            ],
            "temperature": 0,
            "top_p": 1,
            "max_tokens": max_tokens,
        }
        if stop is not None:
            chat_payload["stop"] = stop

        response_chat = requests.post(
            self.base_url + "/v1/chat/completions",
            json=chat_payload,
        )
        print(json.dumps(response_chat.json()))
        print("=" * 100)

        assert response_chat.json()["choices"][0]["finish_reason"] == finish_reason
        assert response_chat.json()["choices"][0]["matched_stop"] == matched_stop

    def test_finish_stop_str(self):
        self.run_completions_generation(
            max_tokens=1000, stop="\n", finish_reason="stop", matched_stop="\n"
        )
        self.run_chat_completions_generation(
            max_tokens=1000, stop="\n", finish_reason="stop", matched_stop="\n"
        )

    def test_finish_stop_eos(self):
        llama_format_prompt = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
What is 2 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
        eos_token_id = 128009
        self.run_completions_generation(
            prompt=llama_format_prompt,
            max_tokens=1000,
            finish_reason="stop",
            matched_stop=eos_token_id,
        )
        self.run_chat_completions_generation(
            prompt="What is 2 + 2?",
            max_tokens=1000,
            finish_reason="stop",
            matched_stop=eos_token_id,
        )

    def test_finish_length(self):
        self.run_completions_generation(
            max_tokens=5, finish_reason="length", matched_stop=None
        )
        self.run_chat_completions_generation(
            max_tokens=5, finish_reason="length", matched_stop=None
        )


if __name__ == "__main__":
    unittest.main()
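
Outside the unittest harness, the same assertion can be spot-checked against any running server; a minimal sketch (URL and model name are placeholders, and the expected token id assumes a Llama 3 style tokenizer where 128009 is <|eot_id|>):

import requests

resp = requests.post(
    "http://localhost:30000/v1/chat/completions",
    json={
        "model": "default",
        "messages": [{"role": "user", "content": "What is 2 + 2?"}],
        "max_tokens": 100,
    },
)
choice = resp.json()["choices"][0]
# A natural finish should report "stop" plus the EOS token id (e.g. 128009).
print(choice["finish_reason"], choice["matched_stop"])
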