Unverified commit 27bebcd8, authored by Harry Mellor, committed by GitHub
Browse files

Convert `examples` to `ruff-format` (#18400)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent e7523c2e
......@@ -45,8 +45,7 @@ if dist.get_rank() == 0:
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}\n")
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n")
print("-" * 50)
"""
Further tips:
......
......@@ -20,10 +20,12 @@ sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16)
def main():
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
# In real workloads, `enforce_eager` should be `False`.
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
max_num_batched_tokens=64,
max_num_seqs=4,
max_model_len=128)
llm = LLM(
model="Qwen/Qwen2-1.5B-Instruct",
max_num_batched_tokens=64,
max_num_seqs=4,
max_model_len=128,
)
outputs = llm.generate(prompts, sampling_params)
print("-" * 50)
for output, answer in zip(outputs, answers):
......
......@@ -6,6 +6,7 @@ the correct prompt format on vision language models for multimodal embedding.
For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
from argparse import Namespace
from dataclasses import asdict
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
......@@ -44,19 +45,17 @@ class ModelRequestData(NamedTuple):
def run_e5_v(query: Query) -> ModelRequestData:
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501
llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
if query["modality"] == "text":
text = query["text"]
prompt = llama3_template.format(
f"{text}\nSummary above sentence in one word: ")
prompt = llama3_template.format(f"{text}\nSummary above sentence in one word: ")
image = None
elif query["modality"] == "image":
prompt = llama3_template.format(
"<image>\nSummary above image in one word: ")
prompt = llama3_template.format("<image>\nSummary above image in one word: ")
image = query["image"]
else:
modality = query['modality']
modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs(
......@@ -83,10 +82,12 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
image = query["image"]
elif query["modality"] == "text+image":
text = query["text"]
prompt = f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501
prompt = (
f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501
)
image = query["image"]
else:
modality = query['modality']
modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs(
......@@ -136,7 +137,8 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {})
req_data.engine_args.limit_mm_per_prompt or {}
)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args)
......@@ -145,10 +147,12 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
if req_data.image is not None:
mm_data["image"] = req_data.image
outputs = llm.embed({
"prompt": req_data.prompt,
"multi_modal_data": mm_data,
})
outputs = llm.embed(
{
"prompt": req_data.prompt,
"multi_modal_data": mm_data,
}
)
print("-" * 50)
for output in outputs:
......@@ -164,23 +168,30 @@ model_example_map = {
def parse_args():
parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with '
'vision language models for multimodal embedding')
parser.add_argument('--model-name',
'-m',
type=str,
default="vlm2vec",
choices=model_example_map.keys(),
help='The name of the embedding model.')
parser.add_argument('--modality',
type=str,
default="image",
choices=get_args(QueryModality),
help='Modality of the input.')
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
description="Demo on using vLLM for offline inference with "
"vision language models for multimodal embedding"
)
parser.add_argument(
"--model-name",
"-m",
type=str,
default="vlm2vec",
choices=model_example_map.keys(),
help="The name of the embedding model.",
)
parser.add_argument(
"--modality",
type=str,
default="image",
choices=get_args(QueryModality),
help="Modality of the input.",
)
parser.add_argument(
"--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args()
......
......@@ -17,16 +17,15 @@ import requests
def clear_line(n: int = 1) -> None:
LINE_UP = '\033[1A'
LINE_CLEAR = '\x1b[2K'
LINE_UP = "\033[1A"
LINE_CLEAR = "\x1b[2K"
for _ in range(n):
print(LINE_UP, end=LINE_CLEAR, flush=True)
def post_http_request(prompt: str,
api_url: str,
n: int = 1,
stream: bool = False) -> requests.Response:
def post_http_request(
prompt: str, api_url: str, n: int = 1, stream: bool = False
) -> requests.Response:
headers = {"User-Agent": "Test Client"}
pload = {
"prompt": prompt,
......@@ -35,17 +34,14 @@ def post_http_request(prompt: str,
"max_tokens": 16,
"stream": stream,
}
response = requests.post(api_url,
headers=headers,
json=pload,
stream=stream)
response = requests.post(api_url, headers=headers, json=pload, stream=stream)
return response
def get_streaming_response(response: requests.Response) -> Iterable[list[str]]:
for chunk in response.iter_lines(chunk_size=8192,
decode_unicode=False,
delimiter=b"\n"):
for chunk in response.iter_lines(
chunk_size=8192, decode_unicode=False, delimiter=b"\n"
):
if chunk:
data = json.loads(chunk.decode("utf-8"))
output = data["text"]
......
......@@ -6,6 +6,7 @@ Note that `pip install cohere` is needed to run this example.
run: vllm serve BAAI/bge-reranker-base
"""
from typing import Union
import cohere
......@@ -16,28 +17,28 @@ model = "BAAI/bge-reranker-base"
query = "What is the capital of France?"
documents = [
"The capital of France is Paris", "Reranking is fun!",
"vLLM is an open-source framework for fast AI serving"
"The capital of France is Paris",
"Reranking is fun!",
"vLLM is an open-source framework for fast AI serving",
]
def cohere_rerank(client: Union[Client, ClientV2], model: str, query: str,
documents: list[str]) -> dict:
def cohere_rerank(
client: Union[Client, ClientV2], model: str, query: str, documents: list[str]
) -> dict:
return client.rerank(model=model, query=query, documents=documents)
def main():
# cohere v1 client
cohere_v1 = cohere.Client(base_url="http://localhost:8000",
api_key="sk-fake-key")
cohere_v1 = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key")
rerank_v1_result = cohere_rerank(cohere_v1, model, query, documents)
print("-" * 50)
print("rerank_v1_result:\n", rerank_v1_result)
print("-" * 50)
# or the v2
cohere_v2 = cohere.ClientV2("sk-fake-key",
base_url="http://localhost:8000")
cohere_v2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000")
rerank_v2_result = cohere_rerank(cohere_v2, model, query, documents)
print("rerank_v2_result:\n", rerank_v2_result)
print("-" * 50)
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment