run_batch.py 5.8 KB
Newer Older
1
2
import asyncio
from io import StringIO
3
from typing import Awaitable, List
4
5
6
7
8

import aiohttp

from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
from vllm.engine.async_llm_engine import AsyncLLMEngine
9
from vllm.entrypoints.logger import RequestLogger
10
11
from vllm.entrypoints.openai.protocol import (BatchRequestInput,
                                              BatchRequestOutput,
12
13
14
                                              BatchResponseData,
                                              ChatCompletionResponse,
                                              ErrorResponse)
15
16
17
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext
18
from vllm.utils import FlexibleArgumentParser, random_uuid
19
from vllm.version import __version__ as VLLM_VERSION
20
21
22
23
24

# Module-level logger, created through vLLM's init_logger with this module's
# name; used by the script entry point to log the version and parsed args.
logger = init_logger(__name__)


def parse_args():
    """Build the batch runner's argument parser and parse the command line.

    The runner's own options (``--input-file``, ``--output-file``,
    ``--response-role``) are registered first, then the engine options
    contributed by ``AsyncEngineArgs.add_cli_args``, and finally
    ``--max-log-len``.

    Returns:
        The namespace produced by the parser's ``parse_args()``; it is called
        without an explicit argument list.
    """
    input_help = (
        "The path or url to a single input file. Currently supports local file "
        "paths, or the http protocol (http or https). If a URL is specified, "
        "the file should be available via HTTP GET.")
    output_help = (
        "The path or url to a single output file. Currently supports "
        "local file paths, or web (http or https) urls. If a URL is specified,"
        " the file should be available via HTTP PUT.")
    role_help = ("The role name to return if "
                 "`request.add_generation_prompt=True`.")
    max_log_len_help = ('Max number of prompt characters or prompt '
                        'ID numbers being printed in log.'
                        '\n\nDefault: Unlimited')

    cli = FlexibleArgumentParser(
        description="vLLM OpenAI-Compatible batch runner.")

    # Options specific to the batch runner.
    cli.add_argument("-i",
                     "--input-file",
                     required=True,
                     type=str,
                     help=input_help)
    cli.add_argument("-o",
                     "--output-file",
                     required=True,
                     type=str,
                     help=output_help)
    cli.add_argument("--response-role",
                     type=nullable_str,
                     default="assistant",
                     help=role_help)

    # Engine options are registered by AsyncEngineArgs, which hands back the
    # parser to keep using.
    cli = AsyncEngineArgs.add_cli_args(cli)

    cli.add_argument('--max-log-len',
                     type=int,
                     default=None,
                     help=max_log_len_help)

    return cli.parse_args()


async def read_file(path_or_url: str) -> str:
    """Return the full text content of a local file or of an HTTP(S) URL.

    Args:
        path_or_url: Either a local filesystem path, or a URL starting with
            ``http://`` or ``https://`` which is fetched with HTTP GET.

    Returns:
        The content as a string. Local files are decoded as UTF-8; for URLs
        the decoding is delegated to aiohttp's ``resp.text()``.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP error
            status (4xx/5xx).
        OSError: If the local file cannot be opened.
    """
    if path_or_url.startswith(("http://", "https://")):
        async with aiohttp.ClientSession() as session, \
                   session.get(path_or_url) as resp:
            # Without this check an error page (404, 500, ...) would be
            # returned as if it were the batch input and only fail later as
            # an unrelated-looking parse error.
            resp.raise_for_status()
            return await resp.text()

    # Plain blocking read: this coroutine is only used by the standalone
    # batch runner, before any request has been submitted.
    with open(path_or_url, "r", encoding="utf-8") as f:
        return f.read()


async def write_file(path_or_url: str, data: str) -> None:
    """Write ``data`` to a local file or upload it to an HTTP(S) URL.

    Args:
        path_or_url: Either a local filesystem path (created or overwritten),
            or a URL starting with ``http://`` or ``https://`` to which the
            data is sent with HTTP PUT.
        data: The text to store; it is encoded as UTF-8 in both cases.

    Raises:
        aiohttp.ClientResponseError: If the server answers the PUT with an
            HTTP error status (4xx/5xx).
        OSError: If the local file cannot be opened for writing.
    """
    if path_or_url.startswith(("http://", "https://")):
        async with aiohttp.ClientSession() as session, \
                   session.put(path_or_url,
                               data=data.encode("utf-8")) as resp:
            # A rejected upload must not pass silently: the results of the
            # whole batch would otherwise be lost without any error.
            resp.raise_for_status()
    else:
        # We should make this async, but as long as this is always run as a
        # standalone program, blocking the event loop won't affect performance
        # in this particular case.
        with open(path_or_url, "w", encoding="utf-8") as f:
            f.write(data)


async def run_request(chat_serving: OpenAIServingChat,
                      request: BatchRequestInput) -> BatchRequestOutput:
    """Run one batch entry through the chat handler and wrap the outcome.

    Args:
        chat_serving: Handler whose ``create_chat_completion`` is awaited
            with the request body.
        request: The batch entry; its ``body`` is the chat request and its
            ``custom_id`` is copied onto the output.

    Returns:
        A ``BatchRequestOutput`` carrying either the chat completion (with
        ``error=None``) or the error response together with its status code.

    Raises:
        ValueError: If the handler returns something that is neither a
            ``ChatCompletionResponse`` nor an ``ErrorResponse``.
    """
    result = await chat_serving.create_chat_completion(request.body)

    # Successful completion: the response body is the completion itself.
    if isinstance(result, ChatCompletionResponse):
        output_id = f"vllm-{random_uuid()}"
        payload = BatchResponseData(
            body=result, request_id=f"vllm-batch-{random_uuid()}")
        return BatchRequestOutput(
            id=output_id,
            custom_id=request.custom_id,
            response=payload,
            error=None,
        )

    # Handler-level failure: report its code and attach the error object.
    if isinstance(result, ErrorResponse):
        output_id = f"vllm-{random_uuid()}"
        payload = BatchResponseData(
            status_code=result.code,
            request_id=f"vllm-batch-{random_uuid()}")
        return BatchRequestOutput(
            id=output_id,
            custom_id=request.custom_id,
            response=payload,
            error=result,
        )

    raise ValueError("Request must not be sent in stream mode")


async def main(args):
    """Run every request of the input file through the engine and write the
    results to the output file.

    Args:
        args: Parsed command-line namespace. This function reads
            ``served_model_name``, ``model``, ``disable_log_requests``,
            ``max_log_len``, ``response_role``, ``input_file`` and
            ``output_file`` from it; the whole namespace is also handed to
            ``AsyncEngineArgs.from_cli_args``.

    The input is expected to hold one JSON-encoded ``BatchRequestInput`` per
    line. Blank lines are ignored. The output holds one JSON-encoded
    ``BatchRequestOutput`` per line, in the same order as the requests.
    """
    if args.served_model_name is not None:
        served_model_names = args.served_model_name
    else:
        served_model_names = [args.model]

    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine = AsyncLLMEngine.from_engine_args(
        engine_args, usage_context=UsageContext.OPENAI_BATCH_RUNNER)

    # When using single vLLM without engine_use_ray
    model_config = await engine.get_model_config()

    if args.disable_log_requests:
        request_logger = None
    else:
        request_logger = RequestLogger(max_log_len=args.max_log_len)

    openai_serving_chat = OpenAIServingChat(
        engine,
        model_config,
        served_model_names,
        args.response_role,
        lora_modules=None,
        prompt_adapters=None,
        request_logger=request_logger,
        chat_template=None,
    )

    # Submit all requests in the file to the engine "concurrently".
    response_futures: List[Awaitable[BatchRequestOutput]] = []
    for request_json in (await read_file(args.input_file)).strip().split("\n"):
        # Skip blank lines (including the single empty string produced by an
        # empty file): they are not requests and would otherwise abort the
        # whole batch with a validation error.
        request_json = request_json.strip()
        if not request_json:
            continue

        request = BatchRequestInput.model_validate_json(request_json)
        response_futures.append(run_request(openai_serving_chat, request))

    # gather() returns the results in the order the awaitables were passed,
    # so output lines line up with input lines.
    responses = await asyncio.gather(*response_futures)

    # One JSON document per line, without a trailing newline.
    output = "\n".join(response.model_dump_json() for response in responses)
    await write_file(args.output_file, output)


if __name__ == "__main__":
    # Script entry point: parse the command line, log the version and the
    # effective arguments, then drive the whole batch inside one event loop.
    args = parse_args()

    # This is the batch runner, not the API server; say so in the log.
    logger.info("vLLM batch runner version %s", VLLM_VERSION)
    logger.info("args: %s", args)

    asyncio.run(main(args))