Unverified commit f04d5226, authored by Flora Feng and committed by GitHub
Browse files

[CI] Fix flaky tool_use chat completion tests with deterministic seed (#37027)


Signed-off-by: sfeng33 <4florafeng@gmail.com>
parent 0a0a1a19
...@@ -6,6 +6,7 @@ import pytest ...@@ -6,6 +6,7 @@ import pytest
from .utils import ( from .utils import (
MESSAGES_WITHOUT_TOOLS, MESSAGES_WITHOUT_TOOLS,
SEED,
WEATHER_TOOL, WEATHER_TOOL,
ServerConfig, ServerConfig,
ensure_system_prompt, ensure_system_prompt,
...@@ -27,6 +28,7 @@ async def test_chat_completion_without_tools( ...@@ -27,6 +28,7 @@ async def test_chat_completion_without_tools(
max_completion_tokens=150, max_completion_tokens=150,
model=model_name, model=model_name,
logprobs=False, logprobs=False,
seed=SEED,
) )
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
stop_reason = chat_completion.choices[0].finish_reason stop_reason = chat_completion.choices[0].finish_reason
...@@ -47,6 +49,7 @@ async def test_chat_completion_without_tools( ...@@ -47,6 +49,7 @@ async def test_chat_completion_without_tools(
max_completion_tokens=150, max_completion_tokens=150,
model=model_name, model=model_name,
logprobs=False, logprobs=False,
seed=SEED,
stream=True, stream=True,
) )
chunks: list[str] = [] chunks: list[str] = []
...@@ -97,6 +100,7 @@ async def test_chat_completion_with_tools( ...@@ -97,6 +100,7 @@ async def test_chat_completion_with_tools(
model=model_name, model=model_name,
tools=[WEATHER_TOOL], tools=[WEATHER_TOOL],
logprobs=False, logprobs=False,
seed=SEED,
) )
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
stop_reason = chat_completion.choices[0].finish_reason stop_reason = chat_completion.choices[0].finish_reason
...@@ -118,6 +122,7 @@ async def test_chat_completion_with_tools( ...@@ -118,6 +122,7 @@ async def test_chat_completion_with_tools(
model=model_name, model=model_name,
logprobs=False, logprobs=False,
tools=[WEATHER_TOOL], tools=[WEATHER_TOOL],
seed=SEED,
stream=True, stream=True,
) )
......
...@@ -10,6 +10,7 @@ from .utils import ( ...@@ -10,6 +10,7 @@ from .utils import (
MESSAGES_ASKING_FOR_PARALLEL_TOOLS, MESSAGES_ASKING_FOR_PARALLEL_TOOLS,
MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, MESSAGES_WITH_PARALLEL_TOOL_RESPONSE,
SEARCH_TOOL, SEARCH_TOOL,
SEED,
WEATHER_TOOL, WEATHER_TOOL,
ServerConfig, ServerConfig,
) )
...@@ -39,6 +40,7 @@ async def test_parallel_tool_calls( ...@@ -39,6 +40,7 @@ async def test_parallel_tool_calls(
model=model_name, model=model_name,
tools=[WEATHER_TOOL, SEARCH_TOOL], tools=[WEATHER_TOOL, SEARCH_TOOL],
logprobs=False, logprobs=False,
seed=SEED,
) )
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
...@@ -76,6 +78,7 @@ async def test_parallel_tool_calls( ...@@ -76,6 +78,7 @@ async def test_parallel_tool_calls(
max_completion_tokens=200, max_completion_tokens=200,
tools=[WEATHER_TOOL, SEARCH_TOOL], tools=[WEATHER_TOOL, SEARCH_TOOL],
logprobs=False, logprobs=False,
seed=SEED,
stream=True, stream=True,
) )
...@@ -166,6 +169,7 @@ async def test_parallel_tool_calls_with_results( ...@@ -166,6 +169,7 @@ async def test_parallel_tool_calls_with_results(
model=model_name, model=model_name,
tools=[WEATHER_TOOL, SEARCH_TOOL], tools=[WEATHER_TOOL, SEARCH_TOOL],
logprobs=False, logprobs=False,
seed=SEED,
) )
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
...@@ -184,6 +188,7 @@ async def test_parallel_tool_calls_with_results( ...@@ -184,6 +188,7 @@ async def test_parallel_tool_calls_with_results(
model=model_name, model=model_name,
tools=[WEATHER_TOOL, SEARCH_TOOL], tools=[WEATHER_TOOL, SEARCH_TOOL],
logprobs=False, logprobs=False,
seed=SEED,
stream=True, stream=True,
) )
...@@ -229,6 +234,7 @@ async def test_parallel_tool_calls_false(client: openai.AsyncOpenAI): ...@@ -229,6 +234,7 @@ async def test_parallel_tool_calls_false(client: openai.AsyncOpenAI):
model=model_name, model=model_name,
tools=[WEATHER_TOOL, SEARCH_TOOL], tools=[WEATHER_TOOL, SEARCH_TOOL],
logprobs=False, logprobs=False,
seed=SEED,
parallel_tool_calls=False, parallel_tool_calls=False,
) )
...@@ -247,6 +253,7 @@ async def test_parallel_tool_calls_false(client: openai.AsyncOpenAI): ...@@ -247,6 +253,7 @@ async def test_parallel_tool_calls_false(client: openai.AsyncOpenAI):
max_completion_tokens=200, max_completion_tokens=200,
tools=[WEATHER_TOOL, SEARCH_TOOL], tools=[WEATHER_TOOL, SEARCH_TOOL],
logprobs=False, logprobs=False,
seed=SEED,
parallel_tool_calls=False, parallel_tool_calls=False,
stream=True, stream=True,
) )
......
...@@ -10,6 +10,7 @@ from .utils import ( ...@@ -10,6 +10,7 @@ from .utils import (
MESSAGES_ASKING_FOR_TOOLS, MESSAGES_ASKING_FOR_TOOLS,
MESSAGES_WITH_TOOL_RESPONSE, MESSAGES_WITH_TOOL_RESPONSE,
SEARCH_TOOL, SEARCH_TOOL,
SEED,
WEATHER_TOOL, WEATHER_TOOL,
) )
...@@ -27,6 +28,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI): ...@@ -27,6 +28,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI):
model=model_name, model=model_name,
tools=[WEATHER_TOOL, SEARCH_TOOL], tools=[WEATHER_TOOL, SEARCH_TOOL],
logprobs=False, logprobs=False,
seed=SEED,
) )
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
...@@ -71,6 +73,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI): ...@@ -71,6 +73,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI):
max_completion_tokens=100, max_completion_tokens=100,
tools=[WEATHER_TOOL, SEARCH_TOOL], tools=[WEATHER_TOOL, SEARCH_TOOL],
logprobs=False, logprobs=False,
seed=SEED,
stream=True, stream=True,
) )
...@@ -154,6 +157,7 @@ async def test_tool_call_with_results(client: openai.AsyncOpenAI): ...@@ -154,6 +157,7 @@ async def test_tool_call_with_results(client: openai.AsyncOpenAI):
model=model_name, model=model_name,
tools=[WEATHER_TOOL, SEARCH_TOOL], tools=[WEATHER_TOOL, SEARCH_TOOL],
logprobs=False, logprobs=False,
seed=SEED,
) )
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
...@@ -171,6 +175,7 @@ async def test_tool_call_with_results(client: openai.AsyncOpenAI): ...@@ -171,6 +175,7 @@ async def test_tool_call_with_results(client: openai.AsyncOpenAI):
model=model_name, model=model_name,
tools=[WEATHER_TOOL, SEARCH_TOOL], tools=[WEATHER_TOOL, SEARCH_TOOL],
logprobs=False, logprobs=False,
seed=SEED,
stream=True, stream=True,
) )
......
...@@ -42,6 +42,8 @@ def ensure_system_prompt( ...@@ -42,6 +42,8 @@ def ensure_system_prompt(
# universal args for all models go here. also good if you need to test locally # universal args for all models go here. also good if you need to test locally
# and change type or KV cache quantization or something. # and change type or KV cache quantization or something.
SEED = 42
ARGS: list[str] = [ ARGS: list[str] = [
"--enable-auto-tool-choice", "--enable-auto-tool-choice",
"--max-model-len", "--max-model-len",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment