Unverified Commit 2e660c24 authored by wang.yuqi's avatar wang.yuqi Committed by GitHub
Browse files

[Frontend] Binary embedding response does not return metadata by setting...


[Frontend] Binary embedding response does not return metadata by setting encoding_format to bytes_only. (#30249)
Signed-off-by: default avatarwang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: default avatarwang.yuqi <noooop@126.com>
Co-authored-by: default avatargemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
parent 408cf42f
...@@ -16,6 +16,7 @@ from vllm.utils.serial_utils import ( ...@@ -16,6 +16,7 @@ from vllm.utils.serial_utils import (
EMBED_DTYPE_TO_TORCH_DTYPE, EMBED_DTYPE_TO_TORCH_DTYPE,
ENDIANNESS, ENDIANNESS,
MetadataItem, MetadataItem,
build_metadata_items,
decode_pooling_output, decode_pooling_output,
) )
...@@ -38,6 +39,11 @@ def parse_args(): ...@@ -38,6 +39,11 @@ def parse_args():
def main(args): def main(args):
api_url = f"http://{args.host}:{args.port}/v1/embeddings" api_url = f"http://{args.host}:{args.port}/v1/embeddings"
model_name = args.model model_name = args.model
embedding_size = 0
input_texts = [
"The best thing about vLLM is that it supports many different models",
] * 2
# The OpenAI client does not support the bytes encoding_format. # The OpenAI client does not support the bytes encoding_format.
# The OpenAI client does not support the embed_dtype and endianness parameters. # The OpenAI client does not support the embed_dtype and endianness parameters.
...@@ -45,7 +51,7 @@ def main(args): ...@@ -45,7 +51,7 @@ def main(args):
for endianness in ENDIANNESS: for endianness in ENDIANNESS:
prompt = { prompt = {
"model": model_name, "model": model_name,
"input": "vLLM is great!", "input": input_texts,
"encoding_format": "bytes", "encoding_format": "bytes",
"embed_dtype": embed_dtype, "embed_dtype": embed_dtype,
"endianness": endianness, "endianness": endianness,
...@@ -57,7 +63,34 @@ def main(args): ...@@ -57,7 +63,34 @@ def main(args):
embedding = decode_pooling_output(items=items, body=body) embedding = decode_pooling_output(items=items, body=body)
embedding = [x.to(torch.float32) for x in embedding] embedding = [x.to(torch.float32) for x in embedding]
embedding = torch.cat(embedding) embedding = torch.stack(embedding)
embedding_size = embedding.shape[-1]
print(embed_dtype, endianness, embedding.shape)
# The vllm server always sorts the returned embeddings in the order of input. So
# returning metadata is not necessary. You can set encoding_format to bytes_only
# to let the server not return metadata.
for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
for endianness in ENDIANNESS:
prompt = {
"model": model_name,
"input": input_texts,
"encoding_format": "bytes_only",
"embed_dtype": embed_dtype,
"endianness": endianness,
}
response = post_http_request(prompt=prompt, api_url=api_url)
body = response.content
items = build_metadata_items(
embed_dtype=embed_dtype,
endianness=endianness,
shape=(embedding_size,),
n_request=len(input_texts),
)
embedding = decode_pooling_output(items=items, body=body)
embedding = [x.to(torch.float32) for x in embedding]
embedding = torch.stack(embedding)
print(embed_dtype, endianness, embedding.shape) print(embed_dtype, endianness, embedding.shape)
......
...@@ -24,6 +24,7 @@ from vllm.utils.serial_utils import ( ...@@ -24,6 +24,7 @@ from vllm.utils.serial_utils import (
ENDIANNESS, ENDIANNESS,
MetadataItem, MetadataItem,
binary2tensor, binary2tensor,
build_metadata_items,
decode_pooling_output, decode_pooling_output,
) )
...@@ -344,6 +345,55 @@ async def test_bytes_embed_dtype_and_endianness( ...@@ -344,6 +345,55 @@ async def test_bytes_embed_dtype_and_endianness(
) )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_only_embed_dtype_and_endianness(
server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
input_texts = [
"The best thing about vLLM is that it supports many different models",
] * 2
responses_float = await client.embeddings.create(
input=input_texts, model=model_name, encoding_format="float"
)
float_data = [d.embedding for d in responses_float.data]
embedding_size = len(float_data[0])
for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()):
for endianness in ENDIANNESS:
responses_bytes = requests.post(
server.url_for("/v1/embeddings"),
json={
"model": model_name,
"input": input_texts,
"encoding_format": "bytes_only",
"embed_dtype": embed_dtype,
"endianness": endianness,
},
)
assert "metadata" not in responses_bytes.headers
body = responses_bytes.content
items = build_metadata_items(
embed_dtype=embed_dtype,
endianness=endianness,
shape=(embedding_size,),
n_request=len(input_texts),
)
bytes_data = decode_pooling_output(items=items, body=body)
bytes_data = [x.to(torch.float32).tolist() for x in bytes_data]
check_embeddings_close(
embeddings_0_lst=float_data,
embeddings_1_lst=bytes_data,
name_0="float_data",
name_1="bytes_data",
tol=1e-2,
)
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"]) @pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])
......
...@@ -18,6 +18,7 @@ from vllm.utils.serial_utils import ( ...@@ -18,6 +18,7 @@ from vllm.utils.serial_utils import (
ENDIANNESS, ENDIANNESS,
MetadataItem, MetadataItem,
binary2tensor, binary2tensor,
build_metadata_items,
decode_pooling_output, decode_pooling_output,
) )
...@@ -352,6 +353,61 @@ async def test_bytes_embed_dtype_and_endianness( ...@@ -352,6 +353,61 @@ async def test_bytes_embed_dtype_and_endianness(
) )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_only_embed_dtype_and_endianness(
server: RemoteOpenAIServer, model_name: str
):
input_texts = [
"The best thing about vLLM is that it supports many different models",
] * 2
url = server.url_for("pooling")
float_response = requests.post(
url,
json={
"model": model_name,
"input": input_texts,
"encoding_format": "float",
},
)
responses_float = PoolingResponse.model_validate(float_response.json())
float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data]
n_tokens = responses_float.usage.prompt_tokens // len(input_texts)
for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()):
for endianness in ENDIANNESS:
responses_bytes = requests.post(
url,
json={
"model": model_name,
"input": input_texts,
"encoding_format": "bytes_only",
"embed_dtype": embed_dtype,
"endianness": endianness,
},
)
assert "metadata" not in responses_bytes.headers
body = responses_bytes.content
items = build_metadata_items(
embed_dtype=embed_dtype,
endianness=endianness,
shape=(n_tokens, 1),
n_request=len(input_texts),
)
bytes_data = decode_pooling_output(items=items, body=body)
bytes_data = [x.to(torch.float32).view(-1).tolist() for x in bytes_data]
check_embeddings_close(
embeddings_0_lst=float_data,
embeddings_1_lst=bytes_data,
name_0="float_data",
name_1="bytes_data",
tol=1e-2,
)
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"]) @pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])
......
...@@ -59,8 +59,8 @@ async def create_embedding( ...@@ -59,8 +59,8 @@ async def create_embedding(
return JSONResponse(content=generator.model_dump()) return JSONResponse(content=generator.model_dump())
elif isinstance(generator, EmbeddingBytesResponse): elif isinstance(generator, EmbeddingBytesResponse):
return StreamingResponse( return StreamingResponse(
content=generator.body, content=generator.content,
headers={"metadata": generator.metadata}, headers=generator.headers,
media_type=generator.media_type, media_type=generator.media_type,
) )
......
...@@ -203,6 +203,6 @@ class EmbeddingResponse(OpenAIBaseModel): ...@@ -203,6 +203,6 @@ class EmbeddingResponse(OpenAIBaseModel):
class EmbeddingBytesResponse(OpenAIBaseModel): class EmbeddingBytesResponse(OpenAIBaseModel):
body: list[bytes] content: list[bytes]
metadata: str headers: dict[str, str] | None = None
media_type: str = "application/octet-stream" media_type: str = "application/octet-stream"
...@@ -163,29 +163,35 @@ class EmbeddingMixin(OpenAIServing): ...@@ -163,29 +163,35 @@ class EmbeddingMixin(OpenAIServing):
usage=usage, usage=usage,
) )
def encode_bytes(): def encode_bytes(bytes_only: bool) -> EmbeddingBytesResponse:
body, items, usage = encode_pooling_bytes( content, items, usage = encode_pooling_bytes(
pooling_outputs=final_res_batch_checked, pooling_outputs=final_res_batch_checked,
embed_dtype=embed_dtype, embed_dtype=embed_dtype,
endianness=endianness, endianness=endianness,
) )
metadata = { headers = (
None
if bytes_only
else {
"metadata": json.dumps(
{
"id": ctx.request_id, "id": ctx.request_id,
"created": ctx.created_time, "created": ctx.created_time,
"model": ctx.model_name, "model": ctx.model_name,
"data": items, "data": items,
"usage": usage, "usage": usage,
} }
return EmbeddingBytesResponse(
body=body,
metadata=json.dumps(metadata),
) )
}
)
return EmbeddingBytesResponse(content=content, headers=headers)
if encoding_format == "float" or encoding_format == "base64": if encoding_format == "float" or encoding_format == "base64":
return encode_float_base64() return encode_float_base64()
elif encoding_format == "bytes": elif encoding_format == "bytes" or encoding_format == "bytes_only":
return encode_bytes() return encode_bytes(bytes_only=encoding_format == "bytes_only")
else: else:
assert_never(encoding_format) assert_never(encoding_format)
......
...@@ -55,8 +55,8 @@ async def create_pooling(request: PoolingRequest, raw_request: Request): ...@@ -55,8 +55,8 @@ async def create_pooling(request: PoolingRequest, raw_request: Request):
return JSONResponse(content=generator.model_dump()) return JSONResponse(content=generator.model_dump())
elif isinstance(generator, PoolingBytesResponse): elif isinstance(generator, PoolingBytesResponse):
return StreamingResponse( return StreamingResponse(
content=generator.body, content=generator.content,
headers={"metadata": generator.metadata}, headers=generator.headers,
media_type=generator.media_type, media_type=generator.media_type,
) )
......
...@@ -143,6 +143,6 @@ class PoolingResponse(OpenAIBaseModel): ...@@ -143,6 +143,6 @@ class PoolingResponse(OpenAIBaseModel):
class PoolingBytesResponse(OpenAIBaseModel): class PoolingBytesResponse(OpenAIBaseModel):
body: list[bytes] content: list[bytes]
metadata: str headers: dict[str, str] | None = None
media_type: str = "application/octet-stream" media_type: str = "application/octet-stream"
...@@ -314,29 +314,38 @@ class OpenAIServingPooling(OpenAIServing): ...@@ -314,29 +314,38 @@ class OpenAIServingPooling(OpenAIServing):
usage=usage, usage=usage,
) )
def encode_bytes(): def encode_bytes(bytes_only: bool) -> PoolingBytesResponse:
body, items, usage = encode_pooling_bytes( content, items, usage = encode_pooling_bytes(
pooling_outputs=final_res_batch, pooling_outputs=final_res_batch,
embed_dtype=embed_dtype, embed_dtype=embed_dtype,
endianness=endianness, endianness=endianness,
) )
metadata = { headers = (
None
if bytes_only
else {
"metadata": json.dumps(
{
"id": request_id, "id": request_id,
"created": created_time, "created": created_time,
"model": model_name, "model": model_name,
"data": items, "data": items,
"usage": usage, "usage": usage,
} }
)
}
)
return PoolingBytesResponse( return PoolingBytesResponse(
body=body, content=content,
metadata=json.dumps(metadata), headers=headers,
) )
if encoding_format == "float" or encoding_format == "base64": if encoding_format == "float" or encoding_format == "base64":
return encode_float_base64() return encode_float_base64()
elif encoding_format == "bytes": elif encoding_format == "bytes" or encoding_format == "bytes_only":
return encode_bytes() return encode_bytes(bytes_only=encoding_format == "bytes_only")
else: else:
assert_never(encoding_format) assert_never(encoding_format)
......
...@@ -2,15 +2,19 @@ ...@@ -2,15 +2,19 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64 import base64
import io import io
import math
import sys import sys
from dataclasses import dataclass from dataclasses import dataclass
from typing import Literal from typing import TYPE_CHECKING, Any, Literal
import numpy as np import numpy as np
import torch import torch
from typing_extensions import assert_never from typing_extensions import assert_never
from vllm import PoolingRequestOutput if TYPE_CHECKING:
from vllm import PoolingRequestOutput
else:
PoolingRequestOutput = Any
sys_byteorder = sys.byteorder sys_byteorder = sys.byteorder
...@@ -27,6 +31,14 @@ EMBED_DTYPE_TO_TORCH_DTYPE = { ...@@ -27,6 +31,14 @@ EMBED_DTYPE_TO_TORCH_DTYPE = {
"fp8_e5m2": torch.float8_e5m2, "fp8_e5m2": torch.float8_e5m2,
} }
EMBED_DTYPE_TO_N_BYTES = {
"float32": 4,
"float16": 2,
"bfloat16": 2,
"fp8_e4m3": 1,
"fp8_e5m2": 1,
}
EMBED_DTYPE_TO_TORCH_DTYPE_VIEW = { EMBED_DTYPE_TO_TORCH_DTYPE_VIEW = {
"float32": torch.float32, "float32": torch.float32,
...@@ -50,7 +62,7 @@ ENDIANNESS = ["native", "big", "little"] ...@@ -50,7 +62,7 @@ ENDIANNESS = ["native", "big", "little"]
EmbedDType = Literal["float32", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"] EmbedDType = Literal["float32", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"]
Endianness = Literal["native", "big", "little"] Endianness = Literal["native", "big", "little"]
EncodingFormat = Literal["float", "base64", "bytes"] EncodingFormat = Literal["float", "base64", "bytes", "bytes_only"]
def tensor2base64(x: torch.Tensor) -> str: def tensor2base64(x: torch.Tensor) -> str:
...@@ -114,7 +126,7 @@ def encode_pooling_output( ...@@ -114,7 +126,7 @@ def encode_pooling_output(
elif encoding_format == "base64": elif encoding_format == "base64":
embedding_bytes = tensor2binary(output.outputs.data, embed_dtype, endianness) embedding_bytes = tensor2binary(output.outputs.data, embed_dtype, endianness)
return base64.b64encode(embedding_bytes).decode("utf-8") return base64.b64encode(embedding_bytes).decode("utf-8")
elif encoding_format == "bytes": elif encoding_format == "bytes" or encoding_format == "bytes_only":
return tensor2binary(output.outputs.data, embed_dtype, endianness) return tensor2binary(output.outputs.data, embed_dtype, endianness)
assert_never(encoding_format) assert_never(encoding_format)
...@@ -129,6 +141,29 @@ class MetadataItem: ...@@ -129,6 +141,29 @@ class MetadataItem:
shape: tuple[int, ...] shape: tuple[int, ...]
def build_metadata_items(
embed_dtype: EmbedDType,
endianness: Endianness,
shape: tuple[int, ...],
n_request: int,
):
n_bytes = EMBED_DTYPE_TO_N_BYTES[embed_dtype]
size = math.prod(shape)
items = [
MetadataItem(
index=i,
embed_dtype=embed_dtype,
endianness=endianness,
start=i * size * n_bytes,
end=(i + 1) * size * n_bytes,
shape=shape,
)
for i in range(n_request)
]
return items
def encode_pooling_bytes( def encode_pooling_bytes(
pooling_outputs: list[PoolingRequestOutput], pooling_outputs: list[PoolingRequestOutput],
embed_dtype: EmbedDType, embed_dtype: EmbedDType,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment