[Benchmark] Support Infinity API (#26641)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[Benchmark] Support Infinity API (#26641)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
5be7ca1b · Cyrus Leung · GitHub · f0a30a06 · 5be7ca1b · 5be7ca1b
Unverified commit 5be7ca1b, authored Oct 12, 2025 by Cyrus Leung; committed by GitHub on Oct 12, 2025.
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 96 additions and 29 deletions

vllm/benchmarks/datasets.py +1 -1

vllm/benchmarks/lib/endpoint_request_func.py +95 -28

No files found.
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -1584,7 +1584,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:

        if dataset_class.IS_MULTIMODAL and not (
            args.backend in ("openai-chat", "openai-audio")
-            or "openai-embeddings-" in args.backend
+            or "embeddings-" in args.backend
        ):
            # multi-modal benchmark is only available on OpenAI Chat
            # endpoint-type.

--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@@ -581,29 +581,6 @@ async def async_request_openai_embeddings_chat(
    )


-async def async_request_openai_embeddings_clip(
-    request_func_input: RequestFuncInput,
-    session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    if request_func_input.multi_modal_content:
-        # Image input
-        request_func_input.prompt = ""
-
-    # max_model_len=77 is too short for most datasets,
-    # so by default we truncate the prompt to max_model_len
-    if request_func_input.extra_body is None:
-        request_func_input.extra_body = {}
-    if "truncate_prompt_tokens" not in request_func_input.extra_body:
-        request_func_input.extra_body["truncate_prompt_tokens"] = -1
-
-    return await async_request_openai_embeddings_chat(
-        request_func_input,
-        session,
-        pbar=pbar,
-    )
-
-
 def _try_extract_request_idx(request_func_input: RequestFuncInput):
    if request_func_input.request_id:
        match = re.search(r"(\d+)$", request_func_input.request_id)
@@ -616,11 +593,20 @@ def _try_extract_request_idx(request_func_input: RequestFuncInput):
    return None


-async def async_request_openai_embeddings_vlm2vec(
-    request_func_input: RequestFuncInput,
-    session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
+def _preprocess_clip(request_func_input: RequestFuncInput):
+    if request_func_input.multi_modal_content:
+        # Image input
+        request_func_input.prompt = ""
+
+    # max_model_len=77 is too short for most datasets,
+    # so by default we truncate the prompt to max_model_len
+    if request_func_input.extra_body is None:
+        request_func_input.extra_body = {}
+    if "truncate_prompt_tokens" not in request_func_input.extra_body:
+        request_func_input.extra_body["truncate_prompt_tokens"] = -1
+
+
+def _preprocess_vlm2vec(request_func_input: RequestFuncInput):
    if request_func_input.multi_modal_content:
        request_idx = _try_extract_request_idx(request_func_input)

@@ -637,6 +623,28 @@ async def async_request_openai_embeddings_vlm2vec(
                f"{request_func_input.prompt}"
            )

+
+async def async_request_openai_embeddings_clip(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    _preprocess_clip(request_func_input)
+
+    return await async_request_openai_embeddings_chat(
+        request_func_input,
+        session,
+        pbar=pbar,
+    )
+
+
+async def async_request_openai_embeddings_vlm2vec(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    _preprocess_vlm2vec(request_func_input)
+
    return await async_request_openai_embeddings_chat(
        request_func_input,
        session,
@@ -645,6 +653,61 @@ async def async_request_openai_embeddings_vlm2vec(
    )


+async def async_request_infinity_embeddings(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    _validate_api_url(api_url, "Infinity Embeddings API", "embeddings")
+
+    payload = {
+        "model": request_func_input.model_name
+        if request_func_input.model_name
+        else request_func_input.model,
+    }
+
+    if request_func_input.prompt:
+        payload["input"] = request_func_input.prompt
+    else:
+        mm_content = request_func_input.multi_modal_content
+        assert isinstance(mm_content, dict)
+
+        mm_type = mm_content["type"]
+        payload["input"] = mm_content[mm_type]["url"]
+        payload["modality"] = mm_type.split("_", 1)[0]
+
+    _update_payload_common(payload, request_func_input)
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+    }
+    _update_headers_common(headers, request_func_input)
+
+    return await _run_openai_embeddings(
+        session,
+        api_url,
+        payload=payload,
+        headers=headers,
+        pbar=pbar,
+    )
+
+
+async def async_request_infinity_embeddings_clip(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    _preprocess_clip(request_func_input)
+
+    return await async_request_infinity_embeddings(
+        request_func_input,
+        session,
+        pbar=pbar,
+    )
+
+
 # TODO: Add more request functions for different API protocols.
 ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
    "vllm": async_request_openai_completions,
@@ -655,6 +718,10 @@ ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
    "openai-embeddings-chat": async_request_openai_embeddings_chat,
    "openai-embeddings-clip": async_request_openai_embeddings_clip,
    "openai-embeddings-vlm2vec": async_request_openai_embeddings_vlm2vec,
+    # Infinity embedding server: https://github.com/michaelfeil/infinity
+    "infinity-embeddings": async_request_infinity_embeddings,
+    "infinity-embeddings-clip": async_request_infinity_embeddings_clip,
+    # (Infinity embedding server does not support vlm2vec)
 }

 OPENAI_COMPATIBLE_BACKENDS = [