backend_request_func.py 24.7 KB
Newer Older
zhuwenwen's avatar
zhuwenwen committed
1
# SPDX-License-Identifier: Apache-2.0
zhuwenwen's avatar
zhuwenwen committed
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
zhuwenwen's avatar
zhuwenwen committed
3

zhuwenwen's avatar
zhuwenwen committed
4
import io
zhuwenwen's avatar
zhuwenwen committed
5
6
7
8
9
10
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field
zhuwenwen's avatar
zhuwenwen committed
11
from typing import Optional, Union
zhuwenwen's avatar
zhuwenwen committed
12
13
14
15

import aiohttp
import huggingface_hub.constants
from tqdm.asyncio import tqdm
zhuwenwen's avatar
zhuwenwen committed
16
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
zhuwenwen's avatar
zhuwenwen committed
17

zhuwenwen's avatar
zhuwenwen committed
18
19
20
# NOTE(simon): do not import vLLM here so the benchmark script
# can run without vLLM installed.

zhuwenwen's avatar
zhuwenwen committed
21
22
23
24
25
26
27
28
29
30
31
32
33
# Total timeout applied to every aiohttp session created in this module:
# 6 * 60 * 60 seconds, i.e. six hours per request.
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)


@dataclass
class RequestFuncInput:
    """Everything a backend request function needs to issue one request."""

    # Prompt text sent to the backend.
    prompt: str
    # Full endpoint URL; each request function asserts the suffix it expects.
    api_url: str
    # Prompt length; copied to ``RequestFuncOutput.prompt_len`` and used as
    # TGI's "truncate" parameter.
    prompt_len: int
    # Requested number of output tokens (max_tokens / max_new_tokens).
    output_len: int
    # Model identifier sent as "model" unless ``model_name`` is set.
    model: str
    # Optional override for the "model" field of OpenAI-style payloads.
    model_name: Optional[str] = None
    # Forwarded as "logprobs" by the OpenAI completions request function.
    logprobs: Optional[int] = None
    # Extra payload entries merged last, so they can override the defaults.
    extra_body: Optional[dict] = None
    # One content part (dict) or several (list of dicts).
    # Spelled with Union rather than ``dict | list[dict]`` so the class can
    # still be created on interpreters older than Python 3.10.
    multi_modal_content: Optional[Union[dict, list[dict]]] = None
    # Ask the backend to keep generating past the end-of-sequence token.
    ignore_eos: bool = False
    # NOTE(review): not read by any request function visible in this file.
    language: Optional[str] = None
    # When set, sent to the backend as the "x-request-id" header.
    request_id: Optional[str] = None
zhuwenwen's avatar
zhuwenwen committed
38
39
40
41
42
43
44
45
46


@dataclass
class RequestFuncOutput:
    """Measurements and result collected for one benchmark request."""

    # Text produced by the backend.
    generated_text: str = ""
    # True only when the request completed and was parsed successfully.
    success: bool = False
    # Seconds from sending the request to the last received chunk/response.
    latency: float = 0.0
    # Number of generated tokens; async_request_tgi sets this to None when
    # the count is not known, hence Optional.
    output_tokens: Optional[int] = 0
    ttft: float = 0.0  # Time to first token
    itl: list[float] = field(default_factory=list)  # list of inter-token latencies
    tpot: float = 0.0  # avg next-token latencies
    # Prompt length copied from the corresponding RequestFuncInput.
    prompt_len: int = 0
    # HTTP reason phrase or formatted traceback when the request failed.
    error: str = ""


async def async_request_tgi(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one streaming request to a TGI ``generate_stream`` endpoint.

    Args:
        request_func_input: Prompt, target URL and generation settings.
        pbar: Optional progress bar; advanced by one once the request has
            finished, whether it succeeded or not.

    Returns:
        A ``RequestFuncOutput`` with time to first token, inter-token
        latencies, total latency and the generated text. On a non-200
        status, an empty stream or any exception, ``success`` is False and
        ``error`` describes the failure.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(
        trust_env=True, timeout=AIOHTTP_TIMEOUT
    ) as session:
        params = {
            "max_new_tokens": request_func_input.output_len,
            "do_sample": True,
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
            "truncate": request_func_input.prompt_len,
            "ignore_eos_token": request_func_input.ignore_eos,
        }
        payload = {
            "inputs": request_func_input.prompt,
            "parameters": params,
        }
        headers = None
        if request_func_input.request_id:
            headers = {"x-request-id": request_func_input.request_id}

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
        if request_func_input.ignore_eos:
            # With EOS ignored, the requested length is taken as the count.
            output.output_tokens = request_func_input.output_len
        else:
            # The token count is not known here; None marks it as unset.
            output.output_tokens = None

        # Last parsed event; stays None when the stream carried no data.
        data = None
        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(
                url=api_url, json=payload, headers=headers
            ) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue
                        line = chunk_bytes.decode("utf-8")

                        # NOTE: Sometimes TGI returns a ping response without
                        # any data, we should skip it.
                        if line.startswith(":"):
                            continue
                        chunk = line.removeprefix("data:")

                        data = json.loads(chunk)
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
                            # Use the same clock reading as the ITL entries so
                            # ttft + sum(itl) adds up to the total latency.
                            ttft = timestamp - st
                            output.ttft = ttft

                        # Decoding phase
                        else:
                            output.itl.append(timestamp - most_recent_timestamp)

                        most_recent_timestamp = timestamp

                    if data is None:
                        # Report an empty stream explicitly instead of failing
                        # on an unbound ``data`` below.
                        output.success = False
                        output.error = "TGI stream ended without any data event."
                    else:
                        output.latency = most_recent_timestamp - st
                        output.success = True
                        # The text is read from the last event of the stream.
                        output.generated_text = data["generated_text"]
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            # Best effort: record the traceback instead of aborting the run.
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

        if pbar:
            pbar.update(1)
        return output


async def async_request_trt_llm(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one streaming request to a TensorRT-LLM ``generate_stream`` endpoint.

    Args:
        request_func_input: Prompt, target URL and generation settings.
        pbar: Optional progress bar; advanced by one once the request has
            finished, whether it succeeded or not.

    Returns:
        A ``RequestFuncOutput`` with time to first token, inter-token
        latencies, total latency and the concatenated ``text_output`` of
        every streamed chunk. On a non-200 status or any exception,
        ``success`` is False and ``error`` holds the HTTP reason or the
        formatted traceback.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(
        trust_env=True, timeout=AIOHTTP_TIMEOUT
    ) as session:
        payload = {
            "accumulate_tokens": True,
            "text_input": request_func_input.prompt,
            "temperature": 0.0,
            "top_p": 1.0,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        if request_func_input.ignore_eos:
            # Ask for at least output_len tokens when EOS is to be ignored.
            payload["min_length"] = request_func_input.output_len
        headers = None
        if request_func_input.request_id:
            headers = {"x-request-id": request_func_input.request_id}

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(
                url=api_url, json=payload, headers=headers
            ) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix("data:")

                        data = json.loads(chunk)
                        # Appended per chunk so partial text survives a
                        # mid-stream failure.
                        output.generated_text += data["text_output"]
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
                            ttft = timestamp - st
                            output.ttft = ttft

                        # Decoding phase
                        else:
                            output.itl.append(timestamp - most_recent_timestamp)

                        most_recent_timestamp = timestamp

                    output.latency = most_recent_timestamp - st
                    output.success = True

                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            # Best effort: record the traceback instead of aborting the run.
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

        if pbar:
            pbar.update(1)
        return output


async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one non-streaming completion request to a DeepSpeed-MII server.

    Args:
        request_func_input: Prompt, target URL and generation settings.
        pbar: Optional progress bar; advanced by one once the request has
            finished, whether it succeeded or not.

    Returns:
        A ``RequestFuncOutput`` with the total latency and generated text.
        ``ttft`` is always 0 because the response is not streamed.
        ``success`` is False on a non-200 status, on a response carrying
        neither "choices" nor "text", or on any exception; ``error`` then
        describes the failure.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith(("completions", "profile")), (
        "OpenAI Completions API URL must end with 'completions' or 'profile'."
    )

    async with aiohttp.ClientSession(
        trust_env=True, timeout=AIOHTTP_TIMEOUT
    ) as session:
        payload = {
            "model": request_func_input.model,
            "prompt": request_func_input.prompt,
            "max_tokens": request_func_input.output_len,
            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
            "top_p": 1.0,
        }
        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
        if request_func_input.request_id:
            headers["x-request-id"] = request_func_input.request_id

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
        # will use 0 as placeholder.
        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
        output.ttft = 0

        st = time.perf_counter()
        try:
            async with session.post(
                url=api_url, json=payload, headers=headers
            ) as response:
                if response.status == 200:
                    parsed_resp = await response.json()
                    output.latency = time.perf_counter() - st
                    # Success is decided per branch so the error branch is
                    # not overwritten by a trailing ``success = True``.
                    if "choices" in parsed_resp:
                        output.generated_text = parsed_resp["choices"][0]["text"]
                        output.success = True
                    elif "text" in parsed_resp:
                        output.generated_text = parsed_resp["text"][0]
                        output.success = True
                    else:
                        output.error = (
                            "Unexpected response format: "
                            "neither 'choices' nor 'text' found"
                        )
                        output.success = False
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            # Best effort: record the traceback instead of aborting the run.
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

        if pbar:
            pbar.update(1)
        return output


async def async_request_openai_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one streaming request to an OpenAI-style completions endpoint.

    Args:
        request_func_input: Prompt, target URL and generation settings.
            ``extra_body`` entries are merged into the payload last.
        pbar: Optional progress bar; advanced by one once the request has
            finished, whether it succeeded or not.

    Returns:
        A ``RequestFuncOutput`` with time to first token, inter-token
        latencies, total latency, generated text and, when the server
        reports usage, the completion token count. ``success`` is False on
        a non-200 status, when no chunk with choices was received, or on
        any exception.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith(("completions", "profile")), (
        "OpenAI Completions API URL must end with 'completions' or 'profile'."
    )

    async with aiohttp.ClientSession(
        trust_env=True, timeout=AIOHTTP_TIMEOUT
    ) as session:
        payload = {
            "model": request_func_input.model_name
            if request_func_input.model_name
            else request_func_input.model,
            "prompt": request_func_input.prompt,
            "temperature": 0.0,
            "repetition_penalty": 1.0,
            "max_tokens": request_func_input.output_len,
            "logprobs": request_func_input.logprobs,
            "stream": True,
            "stream_options": {
                "include_usage": True,
            },
        }
        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
        if request_func_input.request_id:
            headers["x-request-id"] = request_func_input.request_id

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(
                url=api_url, json=payload, headers=headers
            ) as response:
                if response.status == 200:
                    first_chunk_received = False
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        line = chunk_bytes.decode("utf-8")
                        # NOTE: SSE comments (often used as pings) start with
                        # a colon. They carry no JSON and are skipped.
                        if line.startswith(":"):
                            continue

                        chunk = line.removeprefix("data: ")
                        if chunk != "[DONE]":
                            data = json.loads(chunk)

                            # NOTE: Some completion API might have a last
                            # usage summary response without a token so we
                            # want to check a token was generated
                            if choices := data.get("choices"):
                                # Note that text could be empty here
                                # e.g. for special tokens
                                text = choices[0].get("text")
                                timestamp = time.perf_counter()
                                # First token
                                if not first_chunk_received:
                                    first_chunk_received = True
                                    # Same clock reading as the ITL entries.
                                    ttft = timestamp - st
                                    output.ttft = ttft

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp - most_recent_timestamp)

                                most_recent_timestamp = timestamp
                                generated_text += text or ""
                            if usage := data.get("usage"):
                                output.output_tokens = usage.get("completion_tokens")
                    if first_chunk_received:
                        output.success = True
                    else:
                        output.success = False
                        output.error = (
                            "Never received a valid chunk to calculate TTFT. "
                            "This response will be marked as failed!"
                        )
                    output.generated_text = generated_text
                    output.latency = most_recent_timestamp - st
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            # Best effort: record the traceback instead of aborting the run.
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


async def async_request_openai_chat_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one streaming request to an OpenAI-style chat completions endpoint.

    The prompt is sent as a single user message. Its content is a text part
    followed by any parts from ``multi_modal_content``.

    Args:
        request_func_input: Prompt, target URL and generation settings.
            ``extra_body`` entries are merged into the payload last.
        pbar: Optional progress bar; advanced by one once the request has
            finished, whether it succeeded or not.

    Returns:
        A ``RequestFuncOutput`` with time to first token, inter-token
        latencies, total latency, generated text and, when the server
        reports usage, the completion token count. On a non-200 status or
        any exception during the request, ``success`` is False.

    Raises:
        TypeError: If ``multi_modal_content`` is set but is neither a dict
            nor a list.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith(("chat/completions", "profile")), (
        "OpenAI Chat Completions API URL must end with 'chat/completions' "
        "or 'profile'."
    )

    async with aiohttp.ClientSession(
        trust_env=True, timeout=AIOHTTP_TIMEOUT
    ) as session:
        message_parts = [{"type": "text", "text": request_func_input.prompt}]
        if request_func_input.multi_modal_content:
            mm_content = request_func_input.multi_modal_content
            if isinstance(mm_content, list):
                message_parts.extend(mm_content)
            elif isinstance(mm_content, dict):
                message_parts.append(mm_content)
            else:
                raise TypeError(
                    "multi_modal_content must be a dict or list[dict] for openai-chat"
                )
        payload = {
            "model": request_func_input.model_name
            if request_func_input.model_name
            else request_func_input.model,
            "messages": [
                {"role": "user", "content": message_parts},
            ],
            "temperature": 0.0,
            "max_completion_tokens": request_func_input.output_len,
            "stream": True,
            "stream_options": {
                "include_usage": True,
            },
        }
        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }
        if request_func_input.request_id:
            headers["x-request-id"] = request_func_input.request_id

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(
                url=api_url, json=payload, headers=headers
            ) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue
                        line = chunk_bytes.decode("utf-8")
                        # NOTE: SSE comments (often used as pings) start with a colon.
                        # These are not JSON data payload and should be skipped.
                        if line.startswith(":"):
                            continue

                        chunk = line.removeprefix("data: ")

                        if chunk != "[DONE]":
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)

                            if choices := data.get("choices"):
                                # Kept separate from ``message_parts`` so the
                                # request content is never rebound.
                                delta_text = choices[0]["delta"].get("content")
                                # First token
                                if ttft == 0.0:
                                    ttft = timestamp - st
                                    output.ttft = ttft

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp - most_recent_timestamp)

                                generated_text += delta_text or ""
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get("completion_tokens")

                            # Updated for every parsed chunk, including a
                            # usage-only one.
                            most_recent_timestamp = timestamp

                    output.generated_text = generated_text
                    output.success = True
                    output.latency = most_recent_timestamp - st
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            # Best effort: record the traceback instead of aborting the run.
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


zhuwenwen's avatar
zhuwenwen committed
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
async def async_request_openai_audio(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one streaming request to an OpenAI-style audio endpoint.

    The audio in ``multi_modal_content["audio"]`` is unpacked as the
    ``(data, sample_rate)`` arguments of ``soundfile.write``, encoded as WAV
    in memory and uploaded as multipart form data with the other fields.

    Args:
        request_func_input: Target URL, model and generation settings.
            ``extra_body`` entries are merged into the form fields last.
        pbar: Optional progress bar; advanced by one once the request has
            finished, whether it succeeded or not.

    Returns:
        A ``RequestFuncOutput`` with time to first token, inter-token
        latencies, total latency, generated text and, when the server
        reports usage, the completion token count. On a non-200 status or
        any exception during the request, ``success`` is False.

    Raises:
        TypeError: If ``multi_modal_content`` is not a dict with an
            "audio" entry.
    """
    # Lazy import without PlaceholderModule to avoid vllm dep.
    import soundfile

    api_url = request_func_input.api_url
    # The whole message lives inside the assert; previously its second half
    # was a stray string statement and never shown.
    assert api_url.endswith(("transcriptions", "translations")), (
        "OpenAI Audio API URL must end with 'transcriptions' or 'translations'."
    )

    async with aiohttp.ClientSession(
        trust_env=True, timeout=AIOHTTP_TIMEOUT
    ) as session:
        payload = {
            "model": request_func_input.model_name
            if request_func_input.model_name
            else request_func_input.model,
            "temperature": 0.0,
            "max_completion_tokens": request_func_input.output_len,
            "stream": True,
            # NOTE(review): fixed to "en"; request_func_input.language is
            # not consulted here.
            "language": "en",
            # Flattened due to multipart/form-data
            "stream_include_usage": True,
            "stream_continuous_usage_stats": True,
        }
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }
        if request_func_input.request_id:
            headers["x-request-id"] = request_func_input.request_id

        # Send audio file
        def to_bytes(y, sr):
            """Encode audio data ``y`` at sample rate ``sr`` as in-memory WAV."""
            buffer = io.BytesIO()
            soundfile.write(buffer, y, sr, format="WAV")
            buffer.seek(0)
            return buffer

        mm_audio = request_func_input.multi_modal_content
        if not isinstance(mm_audio, dict) or "audio" not in mm_audio:
            raise TypeError("multi_modal_content must be a dict containing 'audio'")
        with to_bytes(*mm_audio["audio"]) as f:
            form = aiohttp.FormData()
            form.add_field("file", f, content_type="audio/wav")
            # Every payload value is sent as its string form.
            for key, value in payload.items():
                form.add_field(key, str(value))

            output = RequestFuncOutput()
            output.prompt_len = request_func_input.prompt_len

            generated_text = ""
            ttft = 0.0
            st = time.perf_counter()
            most_recent_timestamp = st
            try:
                async with session.post(
                    url=api_url, data=form, headers=headers
                ) as response:
                    if response.status == 200:
                        async for chunk_bytes in response.content:
                            chunk_bytes = chunk_bytes.strip()
                            if not chunk_bytes:
                                continue

                            line = chunk_bytes.decode("utf-8")
                            # NOTE: SSE comments (often used as pings) start
                            # with a colon. They carry no JSON and are skipped.
                            if line.startswith(":"):
                                continue

                            chunk = line.removeprefix("data: ")
                            if chunk != "[DONE]":
                                timestamp = time.perf_counter()
                                data = json.loads(chunk)

                                if choices := data.get("choices"):
                                    delta_text = choices[0]["delta"].get("content")
                                    # First token
                                    if ttft == 0.0:
                                        ttft = timestamp - st
                                        output.ttft = ttft

                                    # Decoding phase
                                    else:
                                        output.itl.append(
                                            timestamp - most_recent_timestamp
                                        )

                                    generated_text += delta_text or ""
                                elif usage := data.get("usage"):
                                    output.output_tokens = usage.get(
                                        "completion_tokens"
                                    )

                                most_recent_timestamp = timestamp

                        output.generated_text = generated_text
                        output.success = True
                        output.latency = most_recent_timestamp - st
                    else:
                        output.error = response.reason or ""
                        output.success = False
            except Exception:
                # Best effort: record the traceback instead of aborting.
                output.success = False
                exc_info = sys.exc_info()
                output.error = "".join(traceback.format_exception(*exc_info))

        if pbar:
            pbar.update(1)
        return output


zhuwenwen's avatar
zhuwenwen committed
589
def get_model(pretrained_model_name_or_path: str) -> str:
    """Resolve a model identifier, downloading via ModelScope when enabled.

    Args:
        pretrained_model_name_or_path: Model id or path to resolve.

    Returns:
        The argument unchanged unless the ``VLLM_USE_MODELSCOPE``
        environment variable equals "true" (case-insensitive). In that case
        the return value of ModelScope's ``snapshot_download`` is returned.
    """
    if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() != "true":
        return pretrained_model_name_or_path

    # Imported lazily so neither package is required unless ModelScope is on.
    from modelscope import snapshot_download

    from vllm.model_executor.model_loader.weight_utils import get_lock

    # Use file lock to prevent multiple processes from
    # downloading the same model weights at the same time.
    with get_lock(pretrained_model_name_or_path):
        model_path = snapshot_download(
            model_id=pretrained_model_name_or_path,
            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
            # Presumably excludes weight files from the download, since this
            # path is used for tokenizer loading. TODO confirm.
            ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
        )

        return model_path


def get_tokenizer(
    pretrained_model_name_or_path: str,
    tokenizer_mode: str = "auto",
    trust_remote_code: bool = False,
    **kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    """Load the tokenizer for a model id or local path.

    Args:
        pretrained_model_name_or_path: Model id or local path. When it is
            not an existing path it is first resolved with ``get_model``.
        tokenizer_mode: "slow" forces ``use_fast=False``; "mistral" loads
            vLLM's ``MistralTokenizer``; any other value uses
            ``AutoTokenizer``.
        trust_remote_code: Forwarded to ``AutoTokenizer.from_pretrained``.
        **kwargs: Extra keyword arguments forwarded to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The loaded tokenizer.

    Raises:
        ValueError: If ``tokenizer_mode`` is "slow" and ``use_fast`` is truthy.
        ImportError: If ``tokenizer_mode`` is "mistral" and vllm is missing.
    """
    if pretrained_model_name_or_path is not None and not os.path.exists(
        pretrained_model_name_or_path
    ):
        pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
    if tokenizer_mode == "slow":
        if kwargs.get("use_fast", False):
            raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
        kwargs["use_fast"] = False
    if tokenizer_mode == "mistral":
        # vllm is imported only here so the other modes work without it.
        try:
            from vllm.transformers_utils.tokenizer import MistralTokenizer
        except ImportError as e:
            raise ImportError(
                "MistralTokenizer requires vllm package.\n"
                "Please install it with `pip install vllm` "
                "to use mistral tokenizer mode."
            ) from e
        return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path))
    else:
        return AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )


# Maps each supported backend name to the coroutine function that sends one
# request to that backend.
ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
    "vllm": async_request_openai_completions,
    "lmdeploy": async_request_openai_completions,
    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_openai_completions,
    "openai-chat": async_request_openai_chat_completions,
    "openai-audio": async_request_openai_audio,
    "tensorrt-llm": async_request_trt_llm,
    "scalellm": async_request_openai_completions,
    "sglang": async_request_openai_completions,
    "llama.cpp": async_request_openai_completions,
}

# Names of the backends served by one of the two OpenAI text request
# functions (completions or chat completions), in registry order.
OPENAI_COMPATIBLE_BACKENDS = [
    backend
    for backend, request_func in ASYNC_REQUEST_FUNCS.items()
    if request_func is async_request_openai_completions
    or request_func is async_request_openai_chat_completions
]