test_context.py 20.3 KB
Newer Older
1
2
3
4
5
6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from unittest.mock import MagicMock, patch

import pytest
7
from openai_harmony import Author, Message, Role, StreamState, TextContent
8

9
10
11
12
13
from vllm.entrypoints.context import (
    HarmonyContext,
    StreamingHarmonyContext,
    TurnMetrics,
)
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from vllm.outputs import CompletionOutput, RequestOutput


def create_mock_request_output(
    prompt_token_ids=None,
    output_token_ids=None,
    num_cached_tokens=0,
    finished=True,
):
    """Helper function to create a mock RequestOutput object for testing."""
    outputs = []
    token_ids = output_token_ids if output_token_ids is not None else []
    outputs = [
        CompletionOutput(
            index=0,
            text="Test output",
            token_ids=token_ids,
            cumulative_logprob=0.0,
            logprobs=None,
            finish_reason=None,
            stop_reason=None,
        )
    ]

    return RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=prompt_token_ids,
        prompt_logprobs=None,
        outputs=outputs,
        finished=finished,
        num_cached_tokens=num_cached_tokens,
    )


49
50
51
async def generate_mock_outputs(
    num_turns, prompt_token_counts, output_token_counts, cached_token_counts=None
):
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
    """Generate a sequence of mock RequestOutput objects to simulate multiple
    turns."""
    if cached_token_counts is None:
        cached_token_counts = [0] * num_turns

    for i in range(num_turns):
        # Create mock prompt token IDs and output token IDs
        prompt_token_ids = list(range(1, prompt_token_counts[i] + 1))
        output_token_ids = list(range(1, output_token_counts[i] + 1))

        # Create and yield the RequestOutput
        yield create_mock_request_output(
            prompt_token_ids=prompt_token_ids,
            output_token_ids=output_token_ids,
            num_cached_tokens=cached_token_counts[i],
        )


@pytest.fixture
def mock_parser():
    """Set up a mock parser for tests."""
73
74
75
    with patch(
        "vllm.entrypoints.context.get_streamable_parser_for_assistant"
    ) as mock_parser_factory:
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
        # Create a mock parser object
        parser = MagicMock()
        parser.messages = []
        parser.current_channel = None
        parser.state = StreamState.EXPECT_START
        mock_parser_factory.return_value = parser
        yield parser


def test_single_turn_token_counting():
    """Test token counting behavior for a single turn."""
    # Create a context
    context = HarmonyContext(messages=[], available_tools=[])

    # Create a mock RequestOutput with specific token counts
    mock_output = create_mock_request_output(
        prompt_token_ids=[1, 2, 3, 4, 5],  # 5 prompt tokens
        output_token_ids=[6, 7, 8],  # 3 output tokens
        num_cached_tokens=2,  # 2 cached tokens
    )

    # Append the output to the context
    context.append_output(mock_output)

    # Verify the token counts
    assert context.num_prompt_tokens == 5
    assert context.num_output_tokens == 3
    assert context.num_cached_tokens == 2
    assert context.num_tool_output_tokens == 0  # No tool tokens in first turn

    # Verify internal state tracking
    assert not context.is_first_turn
108
109
110
111
112
113
    assert len(context.all_turn_metrics) == 1
    previous_turn = context.all_turn_metrics[0]
    assert previous_turn.input_tokens == 5
    assert previous_turn.output_tokens == 3
    assert previous_turn.cached_input_tokens == 2
    assert previous_turn.tool_output_tokens == 0
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128


@pytest.mark.asyncio
async def test_multi_turn_token_counting():
    """Test token counting behavior across multiple turns with tool output."""
    # Create a context
    context = HarmonyContext(messages=[], available_tools=["browser"])

    # Simulate a conversation with 3 turns
    # Turn 1: prefill 5, decode 3, tool 7
    # Turn 2: prefill 15, cached 5, decode 4, tool 1
    # Turn 3: prefill 20, cached 15, decode 5
    prompt_token_counts = [5, 15, 20]
    output_token_counts = [3, 4, 5]
    cached_token_counts = [0, 5, 15]
129
130
131
    mock_generator = generate_mock_outputs(
        3, prompt_token_counts, output_token_counts, cached_token_counts
    )
132
133

    # First turn - initial prompt and response
134
    mock_output1 = await anext(mock_generator)
135
136
137
138
139
140
141
142
    context.append_output(mock_output1)

    # At this point, we should have 5 prompt tokens and 3 output tokens
    assert context.num_prompt_tokens == 5
    assert context.num_output_tokens == 3
    assert context.num_tool_output_tokens == 0

    # Second turn - after tool output
143
    mock_output2 = await anext(mock_generator)
144
145
146
147
148
149
150
151
152
153
154
    context.append_output(mock_output2)
    # Current prompt tokens (15) - last_turn_input_tokens (5) -
    # last_turn_output_tokens (3) = 7
    expected_tool_output = 7

    assert context.num_prompt_tokens == 5 + 15
    assert context.num_output_tokens == 3 + 4
    assert context.num_tool_output_tokens == expected_tool_output
    assert context.num_cached_tokens == 5

    # Third turn - final response
155
    mock_output3 = await anext(mock_generator)
156
157
158
159
160
161
162
163
164
165
166
    context.append_output(mock_output3)
    # Additional tool output tokens from third turn:
    # Current prompt (20) - last_turn_input_tokens (15) -
    # last_turn_output_tokens (4) = 1
    expected_tool_output = 7 + 1

    assert context.num_prompt_tokens == 5 + 15 + 20
    assert context.num_output_tokens == 3 + 4 + 5
    assert context.num_tool_output_tokens == expected_tool_output
    assert context.num_cached_tokens == 5 + 15

167
168
169
170
171
172
173
174
175
    # Validate all turn metrics
    assert len(context.all_turn_metrics) == 3
    for i, turn in enumerate(context.all_turn_metrics):
        assert turn.input_tokens == prompt_token_counts[i]
        assert turn.output_tokens == output_token_counts[i]
        assert turn.cached_input_tokens == cached_token_counts[i]
    assert context.all_turn_metrics[1].tool_output_tokens == 7
    assert context.all_turn_metrics[2].tool_output_tokens == 1

176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264

def test_empty_output_tokens():
    """Test behavior when RequestOutput has empty output tokens."""
    context = HarmonyContext(messages=[], available_tools=[])

    # Create a RequestOutput with empty output tokens
    mock_output = create_mock_request_output(
        prompt_token_ids=[1, 2, 3],  # 3 prompt tokens
        output_token_ids=[],  # Empty output tokens list
        num_cached_tokens=1,
    )

    context.append_output(mock_output)

    # Should handle empty outputs gracefully
    assert context.num_prompt_tokens == 3
    assert context.num_output_tokens == 0  # No output tokens
    assert context.num_cached_tokens == 1
    assert context.num_tool_output_tokens == 0


def test_missing_prompt_token_ids():
    """Test behavior when RequestOutput has None prompt_token_ids."""
    context = HarmonyContext(messages=[], available_tools=[])

    mock_output = create_mock_request_output(
        prompt_token_ids=None,  # No prompt token IDs
        output_token_ids=[1, 2],  # 2 output tokens
        num_cached_tokens=0,
    )

    # Logger.error will be called, but we don't need to check for warnings
    # here Just ensure it doesn't raise an exception
    context.append_output(mock_output)

    # Should handle missing prompt tokens gracefully
    assert context.num_prompt_tokens == 0
    assert context.num_output_tokens == 2
    assert context.num_cached_tokens == 0
    assert context.num_tool_output_tokens == 0


def test_reasoning_tokens_counting(mock_parser):
    """Test that reasoning tokens are counted correctly."""
    context = HarmonyContext(messages=[], available_tools=[])

    # Mock parser to simulate reasoning channel
    mock_parser.current_channel = "analysis"  # Reasoning channel

    mock_output = create_mock_request_output(
        prompt_token_ids=[1, 2, 3],
        output_token_ids=[4, 5, 6, 7],  # 4 tokens, all in reasoning
        num_cached_tokens=0,
    )

    context.append_output(mock_output)

    # All output tokens should be counted as reasoning
    assert context.num_reasoning_tokens == 4
    assert context.num_output_tokens == 4


def test_zero_tokens_edge_case():
    """Test behavior with all zero token counts."""
    context = HarmonyContext(messages=[], available_tools=[])

    # Create a request with empty lists (not None) for both prompt and
    # output tokens
    mock_output = create_mock_request_output(
        prompt_token_ids=[],  # Empty prompt tokens
        output_token_ids=[],  # Empty output tokens
        num_cached_tokens=0,
    )

    context.append_output(mock_output)

    # All counts should be zero
    assert context.num_prompt_tokens == 0
    assert context.num_output_tokens == 0
    assert context.num_cached_tokens == 0
    assert context.num_tool_output_tokens == 0
    assert context.num_reasoning_tokens == 0


@pytest.mark.asyncio
async def test_single_turn_no_tool_output():
    """Test that first turn never generates tool output tokens."""
    context = HarmonyContext(
        messages=[],
265
        available_tools=["browser"],  # Tools available
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
    )

    # Even with large prompt in first turn, no tool tokens should be counted
    mock_output = create_mock_request_output(
        prompt_token_ids=list(range(100)),  # 100 tokens
        output_token_ids=[1, 2, 3],
        num_cached_tokens=0,
    )

    context.append_output(mock_output)

    # First turn should never have tool output tokens
    assert context.num_tool_output_tokens == 0
    assert context.is_first_turn is False  # Should be updated after first turn


@pytest.mark.asyncio
async def test_negative_tool_tokens_edge_case():
    """Test edge case where calculation could result in negative tool
    tokens. We should log an error and clamp the value to 0."""
    # Use patch to check if logger.error was called
    with patch("vllm.entrypoints.context.logger.error") as mock_log:
        context = HarmonyContext(messages=[], available_tools=["browser"])

        # First turn
        mock_output1 = create_mock_request_output(
            prompt_token_ids=list(range(10)),  # 10 tokens
            output_token_ids=[1, 2, 3, 4, 5],  # 5 tokens
        )
        context.append_output(mock_output1)

        # Second turn with fewer new tokens than previous output
        # This could happen in edge cases with aggressive caching
        mock_output2 = create_mock_request_output(
            prompt_token_ids=list(range(12)),  # 12 tokens (only 2 new)
            output_token_ids=[6, 7],  # 2 tokens
        )
        context.append_output(mock_output2)

        # Calculated negative tool tokens (12 - 10 - 5 = -3) should be clamped
        # to 0 and an error should be logged
        assert context.num_tool_output_tokens == 0
        assert context.num_prompt_tokens == 10 + 12
        assert context.num_output_tokens == 5 + 2

        # Verify the error was logged properly
        mock_log.assert_called_once()

        # Extract the actual log message and arguments from the call
        args, _ = mock_log.call_args
        log_message = args[0]

        # Check for key parts of the message
        assert "Negative tool output tokens calculated" in log_message
        assert "-3" in str(args)  # Check that -3 is in the arguments


@pytest.mark.asyncio
async def test_streaming_multi_turn_token_counting(mock_parser):
    """Test token counting for streaming multi-turn conversations.
326
327
328

    This test focuses on how StreamingHarmonyContext counts tokens in a
    multi-turn conversation with streaming (token-by-token) outputs and
329
330
331
332
333
    message boundaries.
    """
    # Create a streaming context
    context = StreamingHarmonyContext(messages=[], available_tools=["browser"])

334
335
336
337
    num_prompt_tokens = [3, 8, 13]
    num_output_tokens = [3, 3, 2]
    num_cached_tokens = [0, 3, 8]

338
339
340
341
342
343
344
345
346
347
348
    # Simulate three turns of conversation:
    # Turn 1: stream tokens one by one, then finish the message
    # Turn 2: new prompt, stream more tokens with a reasoning segment
    # Turn 3: new prompt with tool output and cached tokens

    # First turn: 3 tokens streamed one by one
    # First token of first turn
    context.append_output(
        create_mock_request_output(
            prompt_token_ids=[1, 2, 3],  # 3 prompt tokens
            output_token_ids=[101],  # Single token
349
            num_cached_tokens=num_cached_tokens[0],
350
            finished=False,  # Not end of message yet
351
352
        )
    )
353
354
355
356
357
358

    # Second token of first turn
    context.append_output(
        create_mock_request_output(
            output_token_ids=[102],
            finished=False,
359
360
        )
    )
361
362
363
364
365
366

    # Last token of first turn (finished=True signals end of message)
    context.append_output(
        create_mock_request_output(
            output_token_ids=[103],
            finished=True,  # End of message
367
368
        )
    )
369
370
371
372
373
374
375
376
377
378
379
380
381
382

    # Check token counts after first turn
    assert context.num_prompt_tokens == 3  # Initial prompt tokens
    assert context.num_output_tokens == 3  # Three output tokens
    assert context.num_cached_tokens == 0
    assert context.num_tool_output_tokens == 0  # No tool output in first turn
    assert context.first_tok_of_message is True  # Ready for next message

    # Second turn: reasoning tokens in analysis channel
    mock_parser.current_channel = "analysis"  # Set to reasoning channel

    # First token of second turn
    context.append_output(
        create_mock_request_output(
383
384
385
386
387
388
389
390
391
392
            prompt_token_ids=[
                1,
                2,
                3,
                101,
                102,
                103,
                4,
                5,
            ],  # 8 tokens (includes previous)
393
            output_token_ids=[201],
394
            num_cached_tokens=num_cached_tokens[1],  # Some tokens cached
395
            finished=False,
396
397
        )
    )
398
399
400
401
402
403

    # More tokens in reasoning channel
    context.append_output(
        create_mock_request_output(
            output_token_ids=[202],
            finished=False,
404
405
        )
    )
406
407
408
409
410

    context.append_output(
        create_mock_request_output(
            output_token_ids=[203],
            finished=True,  # End of reasoning message
411
412
        )
    )
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430

    # Check counts after second turn (reasoning message)
    assert context.num_prompt_tokens == 3 + 8  # Initial + second prompt
    assert context.num_output_tokens == 3 + 3  # First turn + second turn
    assert context.num_reasoning_tokens == 3  # All tokens in analysis channel
    assert context.num_cached_tokens == 3  # Cached tokens from second turn

    # Formula: this turn prompt tokens - last turn prompt - last turn output
    expected_tool_tokens = 8 - 3 - 3  # = 2
    assert context.num_tool_output_tokens == expected_tool_tokens

    # Third turn: regular output channel
    mock_parser.current_channel = "final"  # Switch back to regular channel

    # Third turn (with more cached tokens)
    context.append_output(
        create_mock_request_output(
            prompt_token_ids=[
431
432
433
434
435
436
437
438
439
440
441
442
443
                1,
                2,
                3,
                101,
                102,
                103,
                4,
                5,
                201,
                202,
                203,
                6,
                7,
444
445
            ],  # 13 tokens
            output_token_ids=[301],
446
            num_cached_tokens=num_cached_tokens[2],  # More cached tokens
447
            finished=False,
448
449
        )
    )
450
451
452
453
454

    context.append_output(
        create_mock_request_output(
            output_token_ids=[302],
            finished=True,
455
456
        )
    )
457
458

    # Final token counts check
459
460
    assert context.num_prompt_tokens == sum(num_prompt_tokens)  # All prompts
    assert context.num_output_tokens == sum(num_output_tokens)  # All outputs
461
    assert context.num_reasoning_tokens == 3  # Unchanged from second turn
462
463
464
    assert context.num_cached_tokens == sum(
        num_cached_tokens
    )  # Accumulated cached tokens
465
466
467
468

    # Additional tool tokens from third turn
    # Formula: this turn prompt - last turn prompt - last turn output
    additional_tool_tokens = 13 - 8 - 3  # = 2
469
470
471
    assert (
        context.num_tool_output_tokens == expected_tool_tokens + additional_tool_tokens
    )
472

473
474
475
476
477
478
479
480
481
    # Validate all turn metrics
    assert len(context.all_turn_metrics) == 3
    for i, turn in enumerate(context.all_turn_metrics):
        assert turn.input_tokens == num_prompt_tokens[i]
        assert turn.output_tokens == num_output_tokens[i]
        assert turn.cached_input_tokens == num_cached_tokens[i]
    assert context.all_turn_metrics[1].tool_output_tokens == 2
    assert context.all_turn_metrics[2].tool_output_tokens == 2

482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499

@pytest.mark.asyncio
async def test_streaming_message_synchronization(mock_parser):
    """Test message synchronization logic from lines 413-417 in context.py.

    This test verifies that when parser.messages contains more messages than
    the context's _messages (minus initial messages), the context properly
    extends its message list with the new parser messages.
    """

    # Create a streaming context with some initial messages
    initial_messages = [
        Message(
            author=Author(role=Role.USER, name="user"),
            content=[TextContent(text="Hello")],
            recipient=Role.ASSISTANT,
        )
    ]
500
    context = StreamingHarmonyContext(messages=initial_messages, available_tools=[])
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517

    # Verify initial state
    assert len(context._messages) == 1
    assert context.num_init_messages == 1

    # Mock parser to have more messages than context
    # Simulate parser having processed 3 new messages
    mock_parser.messages = [
        Message(
            author=Author(role=Role.ASSISTANT, name="assistant"),
            content=[TextContent(text="Response 1")],
            recipient=Role.USER,
        ),
    ]

    # This should trigger the message synchronization logic
    context.append_output(
518
519
520
521
        create_mock_request_output(
            prompt_token_ids=[1, 2, 3], output_token_ids=[101], finished=False
        )
    )
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542

    # Verify that messages were synchronized
    assert len(context._messages) == 2

    # Verify the new messages were added correctly
    assert context._messages[1].content[0].text == "Response 1"

    # Test the specific condition from line 413-414:
    # len(self._messages) - self.num_init_messages < len(self.parser.messages)
    messages_minus_init = len(context._messages) - context.num_init_messages
    parser_messages_count = len(mock_parser.messages)

    # After synchronization, they should be equal (no longer less than)
    assert messages_minus_init == parser_messages_count

    # Test edge case: add one more parser message
    mock_parser.messages.append(
        Message(
            author=Author(role=Role.ASSISTANT, name="assistant"),
            content=[TextContent(text="Response 4")],
            recipient=Role.USER,
543
544
        )
    )
545
546

    # Create another output to trigger synchronization again
547
548
549
    mock_output2 = create_mock_request_output(
        prompt_token_ids=[1, 2, 3], output_token_ids=[102], finished=True
    )
550
551
552
553
554
555
556

    context.append_output(mock_output2)

    # Verify the fourth message was added, num_init_messages is still 1
    assert len(context._messages) == 3
    assert context.num_init_messages == 1
    assert context._messages[2].content[0].text == "Response 4"
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599


def test_turn_metrics_copy_and_reset():
    """Test TurnMetrics copy and reset methods work correctly."""
    # Create a TurnMetrics with specific values
    original_metrics = TurnMetrics(
        input_tokens=10,
        output_tokens=20,
        cached_input_tokens=5,
        tool_output_tokens=3,
    )

    # Test copy functionality
    copied_metrics = original_metrics.copy()

    # Verify copy has same values
    assert copied_metrics.input_tokens == 10
    assert copied_metrics.output_tokens == 20
    assert copied_metrics.cached_input_tokens == 5
    assert copied_metrics.tool_output_tokens == 3

    # Verify they are separate objects
    assert copied_metrics is not original_metrics

    # Modify copy to ensure independence
    copied_metrics.input_tokens = 999
    assert original_metrics.input_tokens == 10  # Original unchanged
    assert copied_metrics.input_tokens == 999

    # Test reset functionality
    original_metrics.reset()

    # Verify all fields are reset to zero
    assert original_metrics.input_tokens == 0
    assert original_metrics.output_tokens == 0
    assert original_metrics.cached_input_tokens == 0
    assert original_metrics.tool_output_tokens == 0

    # Verify copied metrics are unaffected by reset
    assert copied_metrics.input_tokens == 999
    assert copied_metrics.output_tokens == 20
    assert copied_metrics.cached_input_tokens == 5
    assert copied_metrics.tool_output_tokens == 3