Commit 8d75f22e authored by zhuwenwen
Browse files

Merge tag 'v0.13.0rc1' into v0.13.0rc1-ori

parents ce888aa4 7d80c73d
# Pooling models
## Cohere rerank usage
```bash
# vllm serve BAAI/bge-reranker-base
python examples/online_serving/pooling/cohere_rerank_client.py
```
## Embedding requests base64 encoding_format usage
```bash
# vllm serve intfloat/e5-small
python examples/online_serving/pooling/embedding_requests_base64_client.py
```
## Embedding requests bytes encoding_format usage
```bash
# vllm serve intfloat/e5-small
python examples/online_serving/pooling/embedding_requests_bytes_client.py
```
## Jinaai rerank usage
```bash
# vllm serve BAAI/bge-reranker-base
python examples/online_serving/pooling/jinaai_rerank_client.py
```
## Multi vector retrieval usage
```bash
# vllm serve BAAI/bge-m3
python examples/online_serving/pooling/multi_vector_retrieval_client.py
```
## Named Entity Recognition (NER) usage
```bash
# vllm serve boltuix/NeuroBERT-NER
python examples/online_serving/pooling/ner_client.py
```
## OpenAI chat embedding for multimodal usage
```bash
python examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
```
## OpenAI classification usage
```bash
# vllm serve jason9693/Qwen2.5-1.5B-apeach
python examples/online_serving/pooling/openai_classification_client.py
```
## OpenAI cross_encoder score usage
```bash
# vllm serve BAAI/bge-reranker-v2-m3
python examples/online_serving/pooling/openai_cross_encoder_score.py
```
## OpenAI cross_encoder score for multimodal usage
```bash
# vllm serve jinaai/jina-reranker-m0
python examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py
```
## OpenAI embedding usage
```bash
# vllm serve intfloat/e5-small
python examples/online_serving/pooling/openai_embedding_client.py
```
## OpenAI embedding matryoshka dimensions usage
```bash
# vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py
```
## OpenAI pooling usage
```bash
# vllm serve internlm/internlm2-1_8b-reward --trust-remote-code
python examples/online_serving/pooling/openai_pooling_client.py
```
## Online Prithvi Geospatial MAE usage
```bash
python examples/online_serving/pooling/prithvi_geospatial_mae.py
```
......@@ -28,13 +28,11 @@ Dependencies:
- openai
"""
import base64
import io
import torch
import transformers
from openai import OpenAI
from vllm.utils.serial_utils import tensor2base64
def main():
client = OpenAI(
......@@ -58,11 +56,7 @@ def main():
prompt_embeds = embedding_layer(token_ids).squeeze(0)
# Prompt embeddings
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
buffer.seek(0)
binary_data = buffer.read()
encoded_embeds = base64.b64encode(binary_data).decode("utf-8")
encoded_embeds = tensor2base64(prompt_embeds)
completion = client.completions.create(
model=model_name,
......
......@@ -26,9 +26,21 @@ async def lifespan(app: FastAPI):
)
app.state.prefill_client = httpx.AsyncClient(
timeout=None, base_url=prefiller_base_url
timeout=None,
base_url=prefiller_base_url,
limits=httpx.Limits(
max_connections=None,
max_keepalive_connections=None,
),
)
app.state.decode_client = httpx.AsyncClient(
timeout=None,
base_url=decoder_base_url,
limits=httpx.Limits(
max_connections=None,
max_keepalive_connections=None,
),
)
app.state.decode_client = httpx.AsyncClient(timeout=None, base_url=decoder_base_url)
yield
......@@ -105,6 +117,11 @@ async def send_request_to_service(
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
response = await client.post(endpoint, json=req_data, headers=headers)
response.raise_for_status()
# Read/consume the response body to release the connection;
# otherwise, it would raise httpx.ReadError.
await response.aread()
return response
......
......@@ -16,6 +16,7 @@ from vllm.utils.serial_utils import (
EMBED_DTYPE_TO_TORCH_DTYPE,
ENDIANNESS,
MetadataItem,
build_metadata_items,
decode_pooling_output,
)
......@@ -38,6 +39,11 @@ def parse_args():
def main(args):
api_url = f"http://{args.host}:{args.port}/v1/embeddings"
model_name = args.model
embedding_size = 0
input_texts = [
"The best thing about vLLM is that it supports many different models",
] * 2
# The OpenAI client does not support the bytes encoding_format.
# The OpenAI client does not support the embed_dtype and endianness parameters.
......@@ -45,7 +51,7 @@ def main(args):
for endianness in ENDIANNESS:
prompt = {
"model": model_name,
"input": "vLLM is great!",
"input": input_texts,
"encoding_format": "bytes",
"embed_dtype": embed_dtype,
"endianness": endianness,
......@@ -57,7 +63,34 @@ def main(args):
embedding = decode_pooling_output(items=items, body=body)
embedding = [x.to(torch.float32) for x in embedding]
embedding = torch.cat(embedding)
embedding = torch.stack(embedding)
embedding_size = embedding.shape[-1]
print(embed_dtype, endianness, embedding.shape)
# The vLLM server always returns the embeddings in the same order as the inputs, so
# returning metadata is not necessary. You can set encoding_format to "bytes_only"
# to have the server omit the metadata.
for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
for endianness in ENDIANNESS:
prompt = {
"model": model_name,
"input": input_texts,
"encoding_format": "bytes_only",
"embed_dtype": embed_dtype,
"endianness": endianness,
}
response = post_http_request(prompt=prompt, api_url=api_url)
body = response.content
items = build_metadata_items(
embed_dtype=embed_dtype,
endianness=endianness,
shape=(embedding_size,),
n_request=len(input_texts),
)
embedding = decode_pooling_output(items=items, body=body)
embedding = [x.to(torch.float32) for x in embedding]
embedding = torch.stack(embedding)
print(embed_dtype, endianness, embedding.shape)
......
......@@ -150,7 +150,8 @@ def run_siglip(client: OpenAI, model: str):
Start the server using:
vllm serve google/siglip-base-patch16-224 \
--runner pooling
--runner pooling \
--chat-template template_basic.jinja
"""
response = create_chat_embeddings(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment