# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from vllm import SamplingParams
from vllm.outputs import CompletionOutput
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.parallel_sampling import ParentRequest


def test_parent_request_to_output_stream() -> None:
    """Check ``ParentRequest.get_outputs`` for ``n=2`` with default output kind.

    The test walks two child requests through their lifecycle and asserts
    that:

    * while a child is unfinished, each call returns that child's output;
    * the call on which a child is first seen as finished still returns its
      output, but later calls for that child return an empty list;
    * the second element of the result becomes ``True`` only once both
      children have finished.
    """
    parent_request = ParentRequest(make_request(SamplingParams(n=2)))
    # Register the children directly instead of going through the engine.
    parent_request.child_requests = {"child_id_0", "child_id_1"}
    output_0 = CompletionOutput(
        index=0, text="child 0", token_ids=[], cumulative_logprob=None, logprobs=None
    )
    output_1 = CompletionOutput(
        index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
    )

    # Neither child has finished: every call returns the child's own output.
    assert parent_request.get_outputs("child_id_0", output_0) == ([output_0], False)
    assert parent_request.get_outputs("child_id_1", output_1) == ([output_1], False)
    assert parent_request.get_outputs("child_id_0", output_0) == ([output_0], False)
    assert parent_request.get_outputs("child_id_1", output_1) == ([output_1], False)

    # output_1 finishes: its final output is returned this one time.
    output_1.finish_reason = "ended"
    assert parent_request.get_outputs("child_id_0", output_0) == ([output_0], False)
    assert parent_request.get_outputs("child_id_1", output_1) == ([output_1], False)
    # The finished output_1 has already been returned; it must not be
    # returned again.
    assert parent_request.get_outputs("child_id_0", output_0) == ([output_0], False)
    assert parent_request.get_outputs("child_id_1", output_1) == ([], False)

    # output_0 finishes: now every child is done, so the flag flips to True.
    output_0.finish_reason = "ended"
    assert parent_request.get_outputs("child_id_0", output_0) == ([output_0], True)
    assert parent_request.get_outputs("child_id_1", output_1) == ([], True)
    # The finished output_0 has already been returned; it must not be
    # returned again.
    assert parent_request.get_outputs("child_id_0", output_0) == ([], True)
    assert parent_request.get_outputs("child_id_1", output_1) == ([], True)


def test_parent_request_to_output_final_only() -> None:
    """Check ``ParentRequest.get_outputs`` for ``n=2`` with ``FINAL_ONLY``.

    The test asserts that no outputs are returned while any child request is
    still unfinished, and that once both children have finished every call
    returns ``[output_0, output_1]`` together with ``True``.
    """
    parent_request = ParentRequest(
        make_request(SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY))
    )
    # Register the children directly instead of going through the engine.
    parent_request.child_requests = {"child_id_0", "child_id_1"}
    output_0 = CompletionOutput(
        index=0, text="child 0", token_ids=[], cumulative_logprob=None, logprobs=None
    )
    output_1 = CompletionOutput(
        index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
    )

    # Request not finished: nothing is returned.
    assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
    assert parent_request.get_outputs("child_id_1", output_1) == ([], False)

    # output_1 finished, but outputs are not returned until all child
    # requests have finished.
    output_1.finish_reason = "ended"
    assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
    assert parent_request.get_outputs("child_id_1", output_1) == ([], False)

    # output_0 finished: all child requests are done, so both outputs are
    # returned, and they keep being returned on subsequent calls.
    output_0.finish_reason = "ended"
    all_finished = ([output_0, output_1], True)
    assert parent_request.get_outputs("child_id_0", output_0) == all_finished
    assert parent_request.get_outputs("child_id_1", output_1) == all_finished


def make_request(sampling_params: SamplingParams) -> EngineCoreRequest:
    """Build the ``EngineCoreRequest`` used as the parent request in these tests.

    Args:
        sampling_params: Sampling parameters to attach to the request.

    Returns:
        A request with ``request_id="parent_id"``,
        ``external_req_id="ext_parent_id"``, ``arrival_time=0.0`` and every
        other optional field set to ``None``.
    """
    request = EngineCoreRequest(
        # Identity of the parent request.
        request_id="parent_id",
        external_req_id="ext_parent_id",
        # The only field that varies between tests.
        sampling_params=sampling_params,
        arrival_time=0.0,
        # Fields these tests leave unset.
        prompt_token_ids=None,
        mm_features=None,
        pooling_params=None,
        lora_request=None,
        cache_salt=None,
        data_parallel_rank=None,
    )
    return request