# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Integration tests for the Harmony-based Responses API."""

from __future__ import annotations

import asyncio
import importlib.util
import json
import logging
import time
from typing import Any

import pytest
import pytest_asyncio
import requests
from openai import InternalServerError, NotFoundError, OpenAI
from openai_harmony import Message

from tests.utils import RemoteOpenAIServer

from .conftest import (
    BASE_TEST_ENV,
    events_contain_type,
    has_output_type,
    retry_for_tool_call,
    retry_streaming_for,
    validate_streaming_event_stack,
)

# Module-level logger shared by the tool helpers and tests in this file.
logger = logging.getLogger(__name__)
31
32
33

# Model identifier passed to the server fixture and to every parametrized test.
MODEL_NAME = "openai/gpt-oss-20b"

34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# Function-tool definition for ``get_weather`` that the function-calling tests
# pass in their ``tools`` list. Both coordinates are required JSON numbers.
GET_WEATHER_SCHEMA = {
    "type": "function",
    "name": "get_weather",
    "description": (
        "Get current temperature for provided coordinates in celsius."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            axis: {"type": "number"} for axis in ("latitude", "longitude")
        },
        "required": ["latitude", "longitude"],
        "additionalProperties": False,
    },
    "strict": True,
}

50

51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def get_weather(latitude, longitude):
    """Return the current temperature reported for the given coordinates.

    Queries the Open-Meteo forecast endpoint and returns the
    ``current.temperature_2m`` field of its JSON reply.

    The tests only exercise the tool-calling protocol, so any failure of the
    external call (network error, HTTP error status, non-JSON body, or a
    payload with an unexpected shape) is logged and replaced by a fixed fake
    value instead of failing the test.

    Args:
        latitude: Latitude interpolated into the query string.
        longitude: Longitude interpolated into the query string.

    Returns:
        The temperature from the API reply, or ``15.0`` when the call fails.
    """
    url = (
        "https://api.open-meteo.com/v1/forecast?"
        f"latitude={latitude}&longitude={longitude}"
        "&current=temperature_2m,wind_speed_10m"
        "&hourly=temperature_2m,relative_humidity_2m,"
        "wind_speed_10m"
    )
    try:
        response = requests.get(url, timeout=10)
        # Surface 4xx/5xx replies as RequestException subclasses so they take
        # the same fallback path as connection problems.
        response.raise_for_status()
        return response.json()["current"]["temperature_2m"]
    except (requests.RequestException, KeyError, TypeError, ValueError) as e:
        # ValueError: body is not JSON (older requests versions raise it
        # directly); TypeError: JSON is valid but not the expected mapping.
        logger.warning(
            "External weather API call failed (%s), "
            "returning fake value. This does not affect "
            "test correctness — only the tool-calling "
            "protocol is under test.",
            e,
        )
        return 15.0


def get_place_to_travel():
    """Return the fixed destination used by the multi-tool tests."""
    destination = "Paris"
    return destination


def get_horoscope(sign):
    """Return a canned horoscope sentence prefixed with ``sign``."""
    prediction = "Next Tuesday you will befriend a baby otter."
    return f"{sign}: {prediction}"


def call_function(name, args):
    """Run the local tool called ``name`` with ``args`` and return its result.

    ``args`` is expanded as keyword arguments for ``get_weather`` and
    ``get_horoscope``; ``get_place_to_travel`` takes none, so ``args`` is
    ignored for it. The call and its result are both logged at INFO level.

    Raises:
        ValueError: If ``name`` is not one of the known tools.
    """
    logger.info("Calling function %s with args %s", name, args)
    handlers = {
        "get_weather": lambda: get_weather(**args),
        "get_place_to_travel": lambda: get_place_to_travel(),
        "get_horoscope": lambda: get_horoscope(**args),
    }
    handler = handlers.get(name)
    if handler is None:
        raise ValueError(f"Unknown function: {name}")
    outcome = handler()
    logger.info("Function %s returned: %s", name, outcome)
    return outcome


96
@pytest.fixture(scope="module")
def server():
    """Yield a ``RemoteOpenAIServer`` for ``MODEL_NAME``, shared by the module.

    Asserts that the ``gpt_oss`` package is importable, then enters the server
    context manager with the demo tool server enabled and the Responses API
    store, system-tool MCP labels and Harmony system instructions switched on
    through environment variables.
    """
    gpt_oss_spec = importlib.util.find_spec("gpt_oss")
    assert gpt_oss_spec is not None, (
        "Harmony tests require gpt_oss package to be installed"
    )

    cli_args = [
        "--enforce-eager",
        "--tool-server",
        "demo",
        "--max_model_len",
        "5000",
    ]

    # Start from the shared base environment and layer the Harmony-specific
    # switches on top of it.
    server_env = dict(BASE_TEST_ENV)
    server_env.update(
        {
            "VLLM_ENABLE_RESPONSES_API_STORE": "1",
            "PYTHON_EXECUTION_BACKEND": "dangerously_use_uv",
            "VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS": (
                "code_interpreter,container,web_search_preview"
            ),
            "VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS": "1",
        }
    )

    with RemoteOpenAIServer(
        MODEL_NAME, cli_args, env_dict=server_env
    ) as remote_server:
        yield remote_server
119
120
121
122
123
124
125
126
127
128
129
130
131


@pytest_asyncio.fixture
async def client(server):
    """Yield the client from ``server.get_async_client()`` within its scope."""
    client_cm = server.get_async_client()
    async with client_cm as async_client:
        yield async_client


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic(client: OpenAI, model_name: str):
    """A plain string prompt yields a response with status ``completed``."""
    request = {"model": model_name, "input": "What is 123 * 456?"}
    response = await client.responses.create(**request)

    assert response is not None
    print("response: ", response)
    assert response.status == "completed"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic_with_instructions(client: OpenAI, model_name: str):
    """A prompt sent together with ``instructions`` completes."""
    request = {
        "model": model_name,
        "input": "What is 123 * 456?",
        "instructions": "Respond in Korean.",
    }
    reply = await client.responses.create(**request)

    assert reply is not None
    assert reply.status == "completed"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str):
    """A request with an explicit low reasoning effort completes."""
    request = {
        "model": model_name,
        "input": "What is the capital of South Korea?",
        "reasoning": {"effort": "low"},
    }
    reply = await client.responses.create(**request)

    assert reply is not None
    assert reply.status == "completed"


163
164
165
166
167
168
169
170
171
172
173
174
175
176
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens(client: OpenAI, model_name: str):
    """A 30-token output cap ends the response as ``incomplete``.

    The reported reason must be ``max_output_tokens``.
    """
    token_budget = 30
    reply = await client.responses.create(
        model=model_name,
        input="What is the first paragraph of Moby Dick?",
        reasoning={"effort": "low"},
        max_output_tokens=token_budget,
    )

    assert reply is not None
    assert reply.status == "incomplete"
    details = reply.incomplete_details
    assert details.reason == "max_output_tokens"


177
178
179
180
181
182
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat(client: OpenAI, model_name: str):
    """A role-tagged message list (system/user/assistant/user) completes."""
    conversation = [
        {"role": "system", "content": "Respond in Korean."},
        {"role": "user", "content": "Hello!"},
        {"role": "assistant", "content": "Hello! How can I help you today?"},
        {"role": "user", "content": "What is 123 * 456? Explain your answer."},
    ]

    reply = await client.responses.create(model=model_name, input=conversation)

    assert reply is not None
    assert reply.status == "completed"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat_with_input_type(client: OpenAI, model_name: str):
    """A user message whose content is a list of ``input_text`` parts completes."""
    text_part = {"type": "input_text", "text": "What is 123 * 456?"}
    user_message = {"role": "user", "content": [text_part]}

    reply = await client.responses.create(
        model=model_name,
        input=[user_message],
    )

    assert reply is not None
    assert reply.status == "completed"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_structured_output(client: OpenAI, model_name: str):
    """A request carrying a strict ``json_schema`` text format completes."""
    event_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "date": {"type": "string"},
            "participants": {
                "type": "array",
                "items": {"type": "string"},
            },
        },
        "required": ["name", "date", "participants"],
        "additionalProperties": False,
    }
    text_format = {
        "type": "json_schema",
        "name": "calendar_event",
        "schema": event_schema,
        "description": "A calendar event.",
        "strict": True,
    }
    messages = [
        {"role": "system", "content": "Extract the event information."},
        {
            "role": "user",
            "content": "Alice and Bob are going to a science fair on Friday.",
        },
    ]

    reply = await client.responses.create(
        model=model_name,
        input=messages,
        text={"format": text_format},
    )

    assert reply is not None
    assert reply.status == "completed"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_structured_output_with_parse(client: OpenAI, model_name: str):
    """``responses.parse`` with a pydantic ``text_format`` model completes."""
    from pydantic import BaseModel

    class CalendarEvent(BaseModel):
        # Target structure handed to ``responses.parse`` as ``text_format``.
        name: str
        date: str
        participants: list[str]

    parse_kwargs = {
        "model": model_name,
        "input": "Alice and Bob are going to a science fair on Friday",
        "instructions": "Extract the event information",
        "text_format": CalendarEvent,
    }
    parsed = await client.responses.parse(**parse_kwargs)

    assert parsed is not None
    assert parsed.status == "completed"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_store(client: OpenAI, model_name: str):
    """A response is retrievable afterwards only when created with ``store=True``.

    With ``store=False`` the retrieve call is expected to raise
    ``NotFoundError``.
    """
    for store in (True, False):
        created = await client.responses.create(
            model=model_name,
            input="What is 123 * 456?",
            store=store,
        )
        assert created is not None

        try:
            await client.responses.retrieve(created.id)
        except NotFoundError:
            is_not_found = True
        else:
            is_not_found = False

        assert is_not_found == (not store), (
            f"store={store}: expected not_found={not store}, got {is_not_found}"
        )
287
288
289
290
291
292
293


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_background(client: OpenAI, model_name: str):
    """A background response reaches ``completed`` within the polling budget.

    The response is created with ``background=True`` and then retrieved up to
    ``max_retries`` times, one second apart, until its status is
    ``completed``.
    """
    response = await client.responses.create(
        model=model_name,
        input="What is 123 * 456?",
        background=True,
    )
    assert response is not None

    max_retries = 30
    for _ in range(max_retries):
        response = await client.responses.retrieve(response.id)
        if response.status == "completed":
            break
        # Yield to the event loop while waiting; a blocking time.sleep()
        # would stall every other task scheduled on it.
        await asyncio.sleep(1)

    assert response.status == "completed"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_background_cancel(client: OpenAI, model_name: str):
    """A background response can be cancelled shortly after it is created."""
    response = await client.responses.create(
        model=model_name,
        input="Write a long story about a cat.",
        background=True,
    )
    assert response is not None

    # Wait a second before cancelling. Use the non-blocking sleep so the
    # event loop is not stalled inside this coroutine.
    await asyncio.sleep(1)

    cancelled_response = await client.responses.cancel(response.id)
    assert cancelled_response is not None


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_stateful_multi_turn(client: OpenAI, model_name: str):
    """Three chained turns linked by ``previous_response_id`` all complete."""
    prompts = (
        "What is 123 * 456?",
        "What if I increase both numbers by 1?",
        "Divide the result by 2.",
    )

    previous_id = None
    for prompt in prompts:
        request = {"model": model_name, "input": prompt}
        # The first turn has no predecessor, so the key is omitted entirely.
        if previous_id is not None:
            request["previous_response_id"] = previous_id

        turn = await client.responses.create(**request)
        assert turn.status == "completed"
        previous_id = turn.id


349
350
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming_types(
    pairs_of_event_types: dict[str, str], client: OpenAI, model_name: str
):
    """Events of a plain streamed response pass the event-stack validation."""
    stream = await client.responses.create(
        model=model_name,
        input="tell me a story about a cat in 20 words",
        reasoning={"effort": "low"},
        tools=[],
        stream=True,
        background=False,
    )
    # Drain the whole stream before validating.
    events = [event async for event in stream]

    validate_streaming_event_stack(events, pairs_of_event_types)
367
368


369
370
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_with_streaming_types(
    pairs_of_event_types: dict[str, str], client: OpenAI, model_name: str
):
    """Streaming event nesting for function-calling responses."""
    user_turn = {"role": "user", "content": "What's the weather like in Paris today?"}

    events = await retry_streaming_for(
        client,
        model=model_name,
        # Accept only an attempt whose events include function-call arguments.
        validate_events=lambda evts: events_contain_type(
            evts, "function_call_arguments"
        ),
        input=[user_turn],
        tools=[GET_WEATHER_SCHEMA],
        temperature=0.0,
    )

    validate_streaming_event_stack(events, pairs_of_event_types)
389
390


391
392
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("background", [True, False])
async def test_streaming(client: OpenAI, model_name: str, background: bool):
    """Check a streamed response event by event, then its replay.

    For each prompt the stream is consumed while checking that:

    * response-level events carry ``input_messages`` and ``output_messages``
      in ``model_extra``, and on ``response.completed`` every such message
      converts back through ``Message.from_dict``;
    * text delta events refer to the most recently added output item and
      content index.

    In background mode the response is additionally re-streamed with
    ``starting_after=5`` and each replayed event must equal the recorded one.
    """
    # TODO: Add back when web search and code interpreter are available in CI
    prompts = (
        "tell me a story about a cat in 20 words",
        "What is 123 * 456? Use python to calculate the result.",
        # "When did Jensen found NVIDIA? Search it and answer the year only.",
    )
    response_level_events = (
        "response.completed",
        "response.in_progress",
        "response.created",
    )
    text_delta_events = (
        "response.output_text.delta",
        "response.reasoning_text.delta",
    )
    part_added_events = (
        "response.content_part.added",
        "response.reasoning_part.added",
    )

    for prompt in prompts:
        stream = await client.responses.create(
            model=model_name,
            input=prompt,
            reasoning={"effort": "low"},
            tools=[
                # {
                #     "type": "web_search_preview"
                # },
                {"type": "code_interpreter", "container": {"type": "auto"}},
            ],
            stream=True,
            background=background,
            extra_body={"enable_response_messages": True},
        )

        events = []
        resp_id = None
        last_item_id = ""
        last_content_index = -1
        last_logged_type = None
        checked_response_completed = False

        async for event in stream:
            kind = event.type
            if kind == "response.created":
                resp_id = event.response.id

            # Validate custom fields on response-level events
            if kind in response_level_events:
                extra = event.response.model_extra
                assert "input_messages" in extra
                assert "output_messages" in extra
                if kind == "response.completed":
                    # Round-trip both message lists through harmony to make
                    # sure the serialized content is valid.
                    for msg in extra["output_messages"]:
                        Message.from_dict(msg)
                    for msg in extra["input_messages"]:
                        Message.from_dict(msg)
                    checked_response_completed = True

            # Log only when the event type changes.
            if last_logged_type != kind:
                last_logged_type = kind
                logger.debug("[%s] ", kind)

            is_text_delta = kind in text_delta_events

            # Verify item IDs
            if kind == "response.output_item.added":
                assert event.item.id != last_item_id
                last_item_id = event.item.id
            elif is_text_delta:
                assert event.item_id == last_item_id

            # Verify content indices
            if kind in part_added_events:
                assert event.content_index != last_content_index
                last_content_index = event.content_index
            elif is_text_delta:
                assert event.content_index == last_content_index

            events.append(event)

        assert len(events) > 0
        assert events[-1].response.output, "Final response should have output"
        assert checked_response_completed

        if background:
            starting_after = 5
            async with await client.responses.retrieve(
                response_id=resp_id, stream=True, starting_after=starting_after
            ) as replay_stream:
                replay_index = starting_after
                async for replayed in replay_stream:
                    replay_index += 1
                    assert replayed == events[replay_index]
            # The replay must run through to the last recorded event.
            assert replay_index == len(events) - 1
492

493
494
495

@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.skip(reason="Web search tool is not available in CI yet.")
async def test_web_search(client: OpenAI, model_name: str):
    """A request offering the ``web_search_preview`` tool completes."""
    search_tool = {"type": "web_search_preview"}

    reply = await client.responses.create(
        model=model_name,
        input="Who is the president of South Korea as of now?",
        tools=[search_tool],
    )

    assert reply is not None
    assert reply.status == "completed"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_code_interpreter(client: OpenAI, model_name: str):
    """The code interpreter tool is used and every message contains ``5846``.

    The request runs on a client whose timeout is three times the default.
    """
    patient_client = client.with_options(timeout=client.timeout * 3)
    prompt = (
        "What's the first 4 digits after the decimal point of "
        "cube root of `19910212 * 20250910`? "
        "Show only the digits. The python interpreter is not stateful "
        "and you must print to see the output."
    )

    response = await patient_client.responses.create(
        model=model_name,
        input=prompt,
        tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
        temperature=0.0,
    )

    assert response is not None
    assert response.status == "completed"
    assert response.usage.output_tokens_details.tool_output_tokens > 0

    message_items = (item for item in response.output if item.type == "message")
    for message in message_items:
        output_string = message.content[0].text
        assert "5846" in output_string, (
            f"Expected '5846' in output, got: {output_string}"
        )
534
535


536
537
538
539
540
541
542
543
544
545
546
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_reasoning_item(client: OpenAI, model_name: str):
    """Input containing a caller-supplied ``reasoning`` item completes."""
    user_message = {"type": "message", "content": "Hello.", "role": "user"}
    reasoning_item = {
        "type": "reasoning",
        "id": "lol",
        "content": [
            {"type": "reasoning_text", "text": "We need to respond: greeting."}
        ],
        "summary": [],
    }

    reply = await client.responses.create(
        model=model_name,
        input=[user_message, reasoning_item],
        temperature=0.0,
    )

    assert reply is not None
    assert reply.status == "completed"


558
559
560
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling(client: OpenAI, model_name: str):
    """Run one function call, return its output, then ask the question again.

    Turn 1 must contain a ``function_call``; turn 2 sends the tool output
    back via ``previous_response_id``; turn 3 repeats the question on top of
    turn 2. Turns 2 and 3 must complete with non-``None`` output text.
    """
    tools = [GET_WEATHER_SCHEMA]
    question = "What's the weather like in Paris today?"

    first = await retry_for_tool_call(
        client,
        model=model_name,
        expected_tool_type="function_call",
        input=question,
        tools=tools,
        temperature=0.0,
        extra_body={"request_id": "test_function_calling_non_resp"},
    )
    assert first.status == "completed"
    assert has_output_type(first, "function_call"), (
        f"Expected function_call in output, got: "
        f"{[getattr(o, 'type', None) for o in first.output]}"
    )

    # Execute the requested tool locally.
    tool_call = next(o for o in first.output if o.type == "function_call")
    result = call_function(tool_call.name, json.loads(tool_call.arguments))

    tool_output = {
        "type": "function_call_output",
        "call_id": tool_call.call_id,
        "output": str(result),
    }
    second = await client.responses.create(
        model=model_name,
        input=[tool_output],
        tools=tools,
        previous_response_id=first.id,
        temperature=0.0,
    )
    assert second.status == "completed"
    assert second.output_text is not None

    # NOTE: chain-of-thought should be removed.
    third = await client.responses.create(
        model=model_name,
        input=question,
        tools=tools,
        previous_response_id=second.id,
        temperature=0.0,
    )
    assert third.status == "completed"
    assert third.output_text is not None


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
    """Multi-tool, multi-turn function calling with retry at API level.

    Turn 1 must produce a function call. Its output is sent back in turn 2,
    where the model is expected to call a second tool. If it does, that tool
    is executed and turn 3 must complete with output text. If the model
    answers directly in turn 2 instead, the test is reported as xfail.
    """
    tools = [
        {
            "type": "function",
            "name": "get_place_to_travel",
            "description": "Get a random place to travel",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
                "additionalProperties": False,
            },
            "strict": True,
        },
        GET_WEATHER_SCHEMA,
    ]

    # Turn 1: model should call one of the tools
    response = await retry_for_tool_call(
        client,
        model=model_name,
        expected_tool_type="function_call",
        input="Help me plan a trip to a random place. And tell me the weather there.",
        tools=tools,
        temperature=0.0,
    )
    assert response.status == "completed"
    assert has_output_type(response, "function_call"), (
        f"Turn 1: expected function_call, got: "
        f"{[getattr(o, 'type', None) for o in response.output]}"
    )

    tool_call = next(o for o in response.output if o.type == "function_call")
    result = call_function(tool_call.name, json.loads(tool_call.arguments))

    # Turn 2: feed the first tool's output back to the model
    response_2 = await retry_for_tool_call(
        client,
        model=model_name,
        expected_tool_type="function_call",
        input=[
            {
                "type": "function_call_output",
                "call_id": tool_call.call_id,
                "output": str(result),
            }
        ],
        tools=tools,
        previous_response_id=response.id,
        temperature=0.0,
    )
    assert response_2.status == "completed"

    # If model produced another tool call, execute it
    if has_output_type(response_2, "function_call"):
        tool_call_2 = next(o for o in response_2.output if o.type == "function_call")
        result_2 = call_function(tool_call_2.name, json.loads(tool_call_2.arguments))
        response_3 = await client.responses.create(
            model=model_name,
            input=[
                {
                    "type": "function_call_output",
                    "call_id": tool_call_2.call_id,
                    "output": str(result_2),
                }
            ],
            tools=tools,
            previous_response_id=response_2.id,
            temperature=0.0,
        )
        assert response_3.status == "completed"
        assert response_3.output_text is not None
    else:
        # Model went straight to answering - acceptable but unexpected.
        # Report it as xfail so it is visible in CI without counting as a
        # test failure.
        assert response_2.output_text is not None
        pytest.xfail(
            "Model went straight to answering instead of calling a "
            "second tool. Valid behaviour but not the expected path. "
            "If this happens consistently, the prompt or model may have "
            "changed behaviour."
        )
695
696
697
698
699


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_required(client: OpenAI, model_name: str):
    """``tool_choice="required"`` is expected to raise ``InternalServerError``."""
    request = {
        "model": model_name,
        "input": "What's the weather like in Paris today?",
        "tools": [GET_WEATHER_SCHEMA],
        "tool_choice": "required",
    }

    with pytest.raises(InternalServerError):
        await client.responses.create(**request)


711
712
713
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_system_message_with_tools(client: OpenAI, model_name: str):
    """The system message lists the ``commentary`` channel with or without tools."""
    from vllm.entrypoints.openai.parser.harmony_utils import get_system_message

    # Commentary channel should always be present (needed for preambles)
    # regardless of whether custom tools are enabled
    for with_tools in (True, False):
        system_content = get_system_message(with_custom_tools=with_tools).content[0]
        channels = system_content.channel_config.valid_channels
        assert "commentary" in channels, (
            f"commentary channel missing when with_custom_tools={with_tools}"
        )
724
725


726
727
728
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_full_history(client: OpenAI, model_name: str):
    """Function calling where the caller resends the whole conversation.

    Instead of ``previous_response_id``, the second request carries the
    original user message, every output item of the first response, and the
    tool output.
    """
    tools = [GET_WEATHER_SCHEMA]
    input_messages = [
        {"role": "user", "content": "What's the weather like in Paris today?"}
    ]

    first = await retry_for_tool_call(
        client,
        model=model_name,
        expected_tool_type="function_call",
        input=input_messages,
        tools=tools,
        temperature=0.0,
    )
    assert first.status == "completed"

    function_calls = (o for o in first.output if o.type == "function_call")
    tool_call = next(function_calls, None)
    assert tool_call is not None, (
        f"Expected function_call in output, got: "
        f"{[getattr(o, 'type', None) for o in first.output]}"
    )

    result = call_function(tool_call.name, json.loads(tool_call.arguments))

    # Grow the history in place: model output first, then the tool result.
    input_messages += first.output
    tool_output = {
        "type": "function_call_output",
        "call_id": tool_call.call_id,
        "output": str(result),
    }
    input_messages.append(tool_output)

    second = await client.responses.create(
        model=model_name,
        input=input_messages,
        tools=tools,
        temperature=0.0,
    )
    assert second.status == "completed"
    assert second.output_text is not None
770
771


772
773
774
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_with_stream(client: OpenAI, model_name: str):
    """Function calling via streaming, with retry for non-determinism.

    The first streamed turn is reassembled into tool calls from its events;
    the ``get_weather`` call is executed and its output sent back in a second
    streamed turn, which must not emit function-call argument events.
    """
    tools = [GET_WEATHER_SCHEMA]
    input_list = [
        {"role": "user", "content": "What's the weather like in Paris today?"},
    ]

    def _has_function_call(evts: list) -> bool:
        # True when some output item added to the stream is a function call.
        for e in evts:
            if getattr(e, "type", "") != "response.output_item.added":
                continue
            if getattr(getattr(e, "item", None), "type", None) == "function_call":
                return True
        return False

    events = await retry_streaming_for(
        client,
        model=model_name,
        validate_events=_has_function_call,
        input=input_list,
        tools=tools,
        temperature=0.0,
    )

    # Parse tool calls from events, keyed by output index.
    final_tool_calls: dict[int, Any] = {}
    for event in events:
        kind = event.type
        if kind == "response.output_item.added":
            if getattr(event.item, "type", None) == "function_call":
                final_tool_calls[event.output_index] = event.item
            continue
        if kind == "response.function_call_arguments.delta":
            pending = final_tool_calls.get(event.output_index)
            if pending:
                pending.arguments += event.delta
            continue
        if kind == "response.function_call_arguments.done":
            pending = final_tool_calls.get(event.output_index)
            if pending:
                # The accumulated deltas must match the final arguments.
                assert event.arguments == pending.arguments

    # Find get_weather call
    tool_call = next(
        (
            tc
            for tc in final_tool_calls.values()
            if getattr(tc, "type", None) == "function_call"
            and tc.name == "get_weather"
        ),
        None,
    )
    assert tool_call is not None, (
        "Expected model to call 'get_weather', "
        f"but got: {[getattr(tc, 'name', None) for tc in final_tool_calls.values()]}"
    )

    result = call_function(tool_call.name, json.loads(tool_call.arguments))
    input_list.append(tool_call)

    # Second turn with the tool result
    tool_output = {
        "type": "function_call_output",
        "call_id": tool_call.call_id,
        "output": str(result),
    }
    response = await client.responses.create(
        model=model_name,
        input=input_list + [tool_output],
        tools=tools,
        stream=True,
        temperature=0.0,
    )
    async for event in response:
        # check that no function call events in the stream
        assert event.type != "response.function_call_arguments.delta"
        assert event.type != "response.function_call_arguments.done"
        # check that the response contains output text
        if event.type == "response.completed":
            assert len(event.response.output) > 0
            assert event.response.output_text is not None


853
854
855
856
857
858
859
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_no_code_interpreter_events(
    client: OpenAI, model_name: str
):
    """Verify that function calls don't trigger code_interpreter events.

    Uses retry_streaming_for to handle non-determinism: the model might not
    always produce a function_call, but if it does, code_interpreter events
    should NEVER appear.
    """
    tools = [GET_WEATHER_SCHEMA]
    input_list = [
        {"role": "user", "content": "What's the weather like in Paris today?"},
    ]

    def _has_function_call(evts: list) -> bool:
        # True when some output item added to the stream is a function call.
        for e in evts:
            if getattr(e, "type", "") != "response.output_item.added":
                continue
            if getattr(getattr(e, "item", None), "type", None) == "function_call":
                return True
        return False

    events = await retry_streaming_for(
        client,
        model=model_name,
        validate_events=_has_function_call,
        input=input_list,
        tools=tools,
        temperature=0.0,
    )

    event_types_seen = {e.type for e in events}
    assert _has_function_call(events), (
        f"Expected to see a function_call after retries. "
        f"Event types: {sorted(event_types_seen)}"
    )

    # The actual invariant under test
    for event in events:
        assert "code_interpreter" not in event.type, (
            f"Found code_interpreter event '{event.type}' during function call. "
            "Function calls should only emit function_call events."
        )

    # Verify we saw the correct function call event types
    argument_event_types = {
        "response.function_call_arguments.delta",
        "response.function_call_arguments.done",
    }
    assert not event_types_seen.isdisjoint(argument_event_types), (
        "Expected to see function_call_arguments events"
    )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.skip(
    reason="This test is flaky in CI, needs investigation and "
    "potential fixes in the code interpreter MCP implementation."
)
async def test_code_interpreter_streaming(
    client: OpenAI,
    model_name: str,
    pairs_of_event_types: dict[str, str],
):
    """Stream a code_interpreter request and validate the emitted events.

    Currently skipped (see the ``skip`` marker above). When enabled it:

    1. streams the request via ``retry_streaming_for`` until the collected
       events satisfy ``events_contain_type(..., "code_interpreter")``;
    2. logs the ordered and unique event types for diagnostics;
    3. hands the events to ``validate_streaming_event_stack`` together with
       the ``pairs_of_event_types`` fixture;
    4. checks status/code fields on code_interpreter_call items.
    """
    tools = [{"type": "code_interpreter", "container": {"type": "auto"}}]
    input_text = (
        "Calculate 123 * 456 using python. "
        "The python interpreter is not stateful and you must "
        "print to see the output."
    )

    def _has_code_interpreter(evts: list) -> bool:
        return events_contain_type(evts, "code_interpreter")

    events = await retry_streaming_for(
        client,
        model=model_name,
        validate_events=_has_code_interpreter,
        input=input_text,
        tools=tools,
        temperature=0.0,
        instructions=(
            "You must use the Python tool to execute code. Never simulate execution."
        ),
    )

    event_types = [e.type for e in events]
    event_types_set = set(event_types)
    logger.info(
        "\n====== Code Interpreter Streaming Diagnostics ======\n"
        "Event count: %d\n"
        "Event types (in order): %s\n"
        "Unique event types: %s\n"
        "====================================================",
        len(events),
        event_types,
        sorted(event_types_set),
    )

    # Structural validation (pairing, ordering, field consistency)
    validate_streaming_event_stack(events, pairs_of_event_types)

    # Validate code interpreter item fields
    for event in events:
        if event.type == "response.code_interpreter_call_code.done":
            assert event.code is not None
            continue

        if event.type not in (
            "response.output_item.added",
            "response.output_item.done",
        ):
            continue

        # getattr with a default tolerates items that carry no ``type``
        # attribute, same as the sibling function-calling test does.
        if getattr(event.item, "type", None) != "code_interpreter_call":
            continue

        if event.type == "response.output_item.added":
            assert event.item.status == "in_progress"
        else:
            assert event.item.status == "completed"
            assert event.item.code is not None
973
974
975
976
977


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_mcp_tool_multi_turn(client: OpenAI, model_name: str, server):
    """MCP tools work across multiple turns via previous_response_id."""
    tools = [{"type": "mcp", "server_label": "code_interpreter"}]
    instructions = (
        "You must use the Python tool to execute code. Never simulate execution."
    )

    # First turn
    response1 = await retry_for_tool_call(
        client,
        model=model_name,
        expected_tool_type="mcp_call",
        input="Calculate 1234 * 4567 using python tool and print the result.",
        tools=tools,
        temperature=0.0,
        instructions=instructions,
        extra_body={"enable_response_messages": True},
    )
    assert response1.status == "completed"

    # Verify MCP call in output_messages: the call is detected on the raw
    # dicts (recipient starts with "python"), the tool reply on the parsed
    # Harmony messages (author role "tool", author name starts with "python").
    raw_output_messages = response1.output_messages
    tool_call_found = any(
        (msg.get("recipient") or "").startswith("python")
        for msg in raw_output_messages
    )
    parsed_output_messages = [Message.from_dict(msg) for msg in raw_output_messages]
    tool_response_found = any(
        msg.author.role == "tool" and (msg.author.name or "").startswith("python")
        for msg in parsed_output_messages
    )
    assert tool_call_found, "MCP tool call not found in output_messages"
    assert tool_response_found, "MCP tool response not found in output_messages"

    # No developer messages expected for elevated tools
    developer_msgs = [
        msg
        for msg in (Message.from_dict(raw) for raw in response1.input_messages)
        if msg.author.role == "developer"
    ]
    assert not developer_msgs, "No developer message expected for elevated tools"

    # Second turn
    response2 = await client.responses.create(
        model=model_name,
        input="Now divide that result by 2.",
        tools=tools,
        temperature=0.0,
        instructions=instructions,
        previous_response_id=response1.id,
        extra_body={"enable_response_messages": True},
    )
    assert response2.status == "completed"


1033
1034
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_output_messages_enabled(client: OpenAI, model_name: str, server):
    """Response carries input/output messages when enable_response_messages is set.

    Sends a plain-text question with ``enable_response_messages`` in
    ``extra_body`` and checks that the completed response exposes non-empty
    ``input_messages`` and ``output_messages``.
    """
    response = await client.responses.create(
        model=model_name,
        input="What is the capital of South Korea?",
        extra_body={"enable_response_messages": True},
    )

    assert response is not None
    assert response.status == "completed"
    assert len(response.input_messages) > 0
    assert len(response.output_messages) > 0
1046
1047
1048
1049
1050
1051
1052


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_call_with_previous_input_messages(
    client: OpenAI, model_name: str
):
    """Multi-turn function calling using previous_input_messages."""
    tools = [
        {
            "type": "function",
            "name": "get_horoscope",
            "description": "Get today's horoscope for an astrological sign.",
            "parameters": {
                "type": "object",
                "properties": {"sign": {"type": "string"}},
                "required": ["sign"],
                "additionalProperties": False,
            },
            "strict": True,
        }
    ]

    # Step 1: Get a function call from the model
    response = await retry_for_tool_call(
        client,
        model=model_name,
        expected_tool_type="function_call",
        input="What is the horoscope for Aquarius today?",
        tools=tools,
        temperature=0.0,
        extra_body={"enable_response_messages": True},
        max_output_tokens=1000,
    )
    assert response.status == "completed"

    function_call = next(
        (item for item in response.output if item.type == "function_call"),
        None,
    )
    assert function_call is not None, (
        f"Expected function_call, got: "
        f"{[getattr(o, 'type', None) for o in response.output]}"
    )
    assert function_call.name == "get_horoscope"

    # Execute the requested tool locally with the model-provided arguments.
    args = json.loads(function_call.arguments)
    result = call_function(function_call.name, args)

    # Step 2: Build full conversation history
    # (first-turn input + first-turn output + the tool's reply message)
    tool_reply = {
        "role": "tool",
        "name": "functions.get_horoscope",
        "content": [{"type": "text", "text": str(result)}],
    }
    previous_messages = (
        response.input_messages + response.output_messages + [tool_reply]
    )

    # Step 3: Second call with previous_input_messages
    response_2 = await client.responses.create(
        model=model_name,
        tools=tools,
        temperature=0.0,
        input="Now tell me the horoscope based on the tool result.",
        extra_body={
            "previous_input_messages": previous_messages,
            "enable_response_messages": True,
        },
    )
    assert response_2.status == "completed"
    assert response_2.output_text is not None

    # Verify exactly 1 system, 1 developer, 1 tool message
    # Roles are compared with ``==`` against plain strings on purpose; do not
    # replace this with a dict/Counter keyed by role unless the role type is
    # known to hash like its string value.
    num_system = 0
    num_developer = 0
    num_tool = 0
    for msg_dict in response_2.input_messages:
        role = Message.from_dict(msg_dict).author.role
        if role == "system":
            num_system += 1
        elif role == "developer":
            num_developer += 1
        elif role == "tool":
            num_tool += 1
    assert num_system == 1, f"Expected 1 system message, got {num_system}"
    assert num_developer == 1, f"Expected 1 developer message, got {num_developer}"
    assert num_tool == 1, f"Expected 1 tool message, got {num_tool}"

    output_text = response_2.output_text.lower()
    assert any(kw in output_text for kw in ["aquarius", "otter", "tuesday"]), (
        f"Expected horoscope-related content, got: {response_2.output_text}"
    )
1144
1145
1146
1147
1148
1149
1150


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat_truncation_content_not_null(client: OpenAI, model_name: str):
    """A chat completion truncated by max_tokens still returns content.

    The prompt asks for more than 350 words while ``max_tokens`` is 350, so
    the test expects ``finish_reason == "length"`` and requires the message
    content to be present and non-empty.
    """
    response = await client.chat.completions.create(
        model=model_name,
        messages=[
            {
                "role": "user",
                "content": (
                    "What is the role of AI in medicine? "
                    "The response must exceed 350 words."
                ),
            }
        ],
        temperature=0.0,
        max_tokens=350,
    )
    choice = response.choices[0]
    assert choice.finish_reason == "length", (
        f"Expected finish_reason='length', got {choice.finish_reason}"
    )
    assert choice.message.content is not None, "Content should not be None"
    assert len(choice.message.content) > 0, "Content should not be empty"
1169
1170
1171
1172


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_system_prompt_override_no_duplication(client: OpenAI, model_name: str):
    """Hard check: custom system message must not be duplicated."""
    response = await client.responses.create(
        model=model_name,
        input=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hello"},
        ],
        extra_body={"enable_response_messages": True},
        temperature=0.0,
    )
    assert response.status == "completed"
    assert response.output_text is not None

    # Count system-role messages among the parsed input messages echoed back
    # on the response.
    num_system = sum(
        1
        for message in (Message.from_dict(msg) for msg in response.input_messages)
        if message.author.role == "system"
    )
    assert num_system == 1, f"Expected 1 system message, got {num_system}"
1192

1193

1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.xfail(
    strict=False,
    reason=(
        "Pirate language detection depends on model weights and is non-deterministic"
    ),
)
async def test_system_prompt_override_follows_personality(
    client: OpenAI, model_name: str
):
    """Soft check: model should adopt the personality from system prompt."""
    response = await client.responses.create(
        model=model_name,
        input=[
            {
                "role": "system",
                "content": (
                    "You are a pirate. Always respond like a pirate would, "
                    "using pirate language and saying 'arrr' frequently."
                ),
            },
            {"role": "user", "content": "Hello, how are you?"},
        ],
        temperature=0.0,
    )
    assert response.status == "completed"

    # NOTE: this is a substring match on the lower-cased output, so short
    # indicators such as "ye" or "sea" also hit inside ordinary words; the
    # test is marked xfail(strict=False) and is only a soft signal.
    output_text = response.output_text.lower()
    pirate_indicators = ["arrr", "matey", "ahoy", "ye", "sea", "aye", "sail"]
    assert any(kw in output_text for kw in pirate_indicators), (
        f"Expected pirate language, got: {response.output_text}"
    )
1226

1227
1228
1229
1230
1231
1232

@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_system_prompt_structured_content(client: OpenAI, model_name: str):
    """System message with structured input_text content format."""
    # The system message content is a list of ``input_text`` parts rather
    # than a plain string.
    response = await client.responses.create(
        model=model_name,
        input=[
            {
                "role": "system",
                "content": [
                    {"type": "input_text", "text": "You are a helpful assistant."}
                ],
            },
            {"role": "user", "content": "What is 2 + 2?"},
        ],
        temperature=0.0,
    )
    assert response is not None
    assert response.status == "completed"
    assert response.output_text is not None