[PERF] Use pybase64 to more quickly decode prompt embeddings (#22469)

Signed-off-by: Andrew Sansom <andrew@protopia.ai>

[PERF] Use pybase64 to more quickly decode prompt embeddings (#22469)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
e2c8f1ed · Andrew Sansom · GitHub · 1ee5ead5 · e2c8f1ed
Unverified commit e2c8f1ed, authored Aug 07, 2025 by Andrew Sansom; committed by GitHub on Aug 07, 2025
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 3 additions and 2 deletions

vllm/entrypoints/openai/serving_engine.py +3 -2

No files found.
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
-import base64
 import io
 import json
 import sys
@@ -12,6 +11,7 @@ from http import HTTPStatus
 from typing import (Annotated, Any, Callable, ClassVar, Generic, Optional,
                    TypeVar, Union, cast, overload)

+import pybase64
 import torch
 from fastapi import Request
 from pydantic import BaseModel, ConfigDict, Field
@@ -1008,7 +1008,8 @@ class OpenAIServing:
    ) -> list[EmbedsPrompt]:

        def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt:
-            tensor = torch.load(io.BytesIO(base64.b64decode(embed)),
+            tensor = torch.load(io.BytesIO(
+                pybase64.b64decode(embed, validate=True)),
                                weights_only=True)
            assert isinstance(tensor, torch.Tensor) and tensor.dtype in (
                torch.float32,