# batched_chat_completions.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Examples of batched chat completions via the vLLM OpenAI-compatible API.

The /v1/chat/completions/batch endpoint accepts ``messages`` as a list of
conversations.  Each conversation is processed independently and the response
contains one choice per conversation, indexed 0, 1, ..., N-1.

Start a server first, e.g.:
    vllm serve Qwen/Qwen2.5-1.5B-Instruct --port 8000

Current limitations compared to /v1/chat/completions:
    - Streaming is not supported.
    - Tool use is not supported.
    - Beam search is not supported.
"""

import json
import os

import httpx

# Root URL of the server; override with the VLLM_BASE_URL environment variable.
# Trailing slashes are stripped so that appending an endpoint path never
# produces a double slash (e.g. "http://host:8000//v1/...").
BASE_URL = os.environ.get("VLLM_BASE_URL", "http://localhost:8000").rstrip("/")
# Model name sent with every request; override with VLLM_MODEL.
MODEL = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-1.5B-Instruct")
# Full URL of the batched chat-completions endpoint.
BATCH_URL = f"{BASE_URL}/v1/chat/completions/batch"


def post_batch(payload: dict, *, timeout: float = 60) -> dict:
    """POST a request body to the batched chat-completions endpoint.

    Args:
        payload: JSON-serializable request body sent to ``BATCH_URL``.
        timeout: Request timeout in seconds, passed straight to ``httpx.post``.
            Defaults to 60, the value this helper previously hard-coded.

    Returns:
        The decoded JSON body of the response.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status
            (raised by ``response.raise_for_status()``).
    """
    response = httpx.post(BATCH_URL, json=payload, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error body.
    response.raise_for_status()
    return response.json()


def _print_choices(data: dict, *, parse_json: bool = False) -> None:
    """Print one indented line per entry of ``data["choices"]``.

    Args:
        data: Decoded chat-completions response containing a ``choices`` list.
        parse_json: When true, each message content is decoded with
            ``json.loads`` before printing, so the parsed object is shown
            (and invalid JSON raises ``json.JSONDecodeError``).
    """
    for choice in data["choices"]:
        content = choice["message"]["content"]
        if parse_json:
            content = json.loads(content)
        print(f"  [{choice['index']}] {content}")


def _json_schema_format(name: str, schema: dict) -> dict:
    """Build the ``response_format`` value for a strict JSON-schema request."""
    return {
        "type": "json_schema",
        "json_schema": {
            "name": name,
            "strict": True,
            "schema": schema,
        },
    }


def _example_single_conversation() -> None:
    """Example 1a: one conversation sent to the standard endpoint."""
    print("=== Example 1a: single conversation (standard endpoint) ===")
    # This example deliberately bypasses post_batch: it targets the regular
    # /v1/chat/completions endpoint, where ``messages`` is a single
    # conversation rather than a list of conversations.
    response = httpx.post(
        f"{BASE_URL}/v1/chat/completions",
        json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What is the capital of Japan?"}],
        },
        timeout=60,
    )
    response.raise_for_status()
    _print_choices(response.json())


def _example_plain_batch() -> None:
    """Example 1b: two plain-text conversations in one batched request."""
    print("\n=== Example 1b: batched plain text (2 conversations) ===")
    data = post_batch(
        {
            "model": MODEL,
            "messages": [
                [{"role": "user", "content": "What is the capital of France?"}],
                [{"role": "user", "content": "What is the capital of Japan?"}],
            ],
        }
    )
    _print_choices(data)


def _example_regex_constraint() -> None:
    """Example 2: batched request whose outputs are constrained by a regex."""
    print("\n=== Example 2: batch with regex constraint (yes|no) ===")
    data = post_batch(
        {
            "model": MODEL,
            "messages": [
                [{"role": "user", "content": "Is the sky blue? Answer yes or no."}],
                [{"role": "user", "content": "Is fire cold? Answer yes or no."}],
            ],
            "structured_outputs": {"regex": "(yes|no)"},
        }
    )
    _print_choices(data)


def _example_person_schema() -> None:
    """Example 3: batched request with a JSON-schema response format."""
    print("\n=== Example 3: batch with json_schema ===")
    person_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string", "description": "Full name of the person"},
            "age": {"type": "integer", "description": "Age in years"},
        },
        "required": ["name", "age"],
    }
    data = post_batch(
        {
            "model": MODEL,
            "messages": [
                [
                    {
                        "role": "user",
                        "content": "Describe the person: name Alice, age 30.",
                    }
                ],
                [{"role": "user", "content": "Describe the person: name Bob, age 25."}],
            ],
            "response_format": _json_schema_format("person", person_schema),
        }
    )
    _print_choices(data, parse_json=True)


def _example_book_summaries() -> None:
    """Example 4: batched structured extraction sharing one system message."""
    print("\n=== Example 4: batch book summaries ===")
    book_schema = {
        "type": "object",
        "properties": {
            "author": {
                "type": "string",
                "description": "Full name of the author",
            },
            "num_pages": {
                "type": "integer",
                "description": "Number of pages in the book",
            },
            "short_summary": {
                "type": "string",
                "description": "A one-sentence summary of the book",
            },
            "long_summary": {
                "type": "string",
                "description": (
                    "A detailed two to three sentence summary covering "
                    "the main themes and plot"
                ),
            },
        },
        "required": ["author", "num_pages", "short_summary", "long_summary"],
    }
    # The same system message is placed at the start of every conversation.
    system_msg = {
        "role": "system",
        "content": (
            "You are a literary analyst. Extract structured information "
            "from book descriptions."
        ),
    }
    book_prompts = [
        (
            "Extract information from this book: '1984' by George"
            " Orwell, published in 1949, 328 pages. A dystopian"
            " novel set in a totalitarian society ruled by Big"
            " Brother, following Winston Smith as he secretly"
            " rebels against the oppressive Party that surveils"
            " and controls every aspect of life."
        ),
        (
            "Extract information from this book: 'The Hitchhiker's"
            " Guide to the Galaxy' by Douglas Adams, published in"
            " 1979, 193 pages. A comedic science fiction novel"
            " following Arthur Dent, an ordinary Englishman who is"
            " whisked off Earth moments before it is demolished to"
            " make way for a hyperspace bypass, and his subsequent"
            " absurd adventures across the universe."
        ),
    ]
    data = post_batch(
        {
            "model": MODEL,
            "messages": [
                [system_msg, {"role": "user", "content": prompt}]
                for prompt in book_prompts
            ],
            "response_format": _json_schema_format("book_summary", book_schema),
        }
    )
    _print_choices(data, parse_json=True)


def main() -> None:
    """Run every example in order against the configured server.

    Each example prints a header followed by one line per returned choice.
    Any HTTP error status aborts the run via ``raise_for_status``.
    """
    _example_single_conversation()
    _example_plain_batch()
    _example_regex_constraint()
    _example_person_schema()
    _example_book_summaries()


# Run the examples only when this file is executed as a script, so importing
# the module (e.g. to reuse post_batch) does not trigger any requests.
if __name__ == "__main__":
    main()