Unverified commit 27bebcd8, authored by Harry Mellor, committed by GitHub
Browse files

Convert `examples` to `ruff-format` (#18400)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent e7523c2e
...@@ -45,8 +45,7 @@ if dist.get_rank() == 0: ...@@ -45,8 +45,7 @@ if dist.get_rank() == 0:
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\n" print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n")
f"Generated text: {generated_text!r}\n")
print("-" * 50) print("-" * 50)
""" """
Further tips: Further tips:
......
...@@ -20,10 +20,12 @@ sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16) ...@@ -20,10 +20,12 @@ sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16)
def main(): def main():
# Set `enforce_eager=True` to avoid ahead-of-time compilation. # Set `enforce_eager=True` to avoid ahead-of-time compilation.
# In real workloads, `enforce_eager` should be `False`. # In real workloads, `enforce_eager` should be `False`.
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct", llm = LLM(
max_num_batched_tokens=64, model="Qwen/Qwen2-1.5B-Instruct",
max_num_seqs=4, max_num_batched_tokens=64,
max_model_len=128) max_num_seqs=4,
max_model_len=128,
)
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
print("-" * 50) print("-" * 50)
for output, answer in zip(outputs, answers): for output, answer in zip(outputs, answers):
......
...@@ -6,6 +6,7 @@ the correct prompt format on vision language models for multimodal embedding. ...@@ -6,6 +6,7 @@ the correct prompt format on vision language models for multimodal embedding.
For most models, the prompt format should follow corresponding examples For most models, the prompt format should follow corresponding examples
on HuggingFace model repository. on HuggingFace model repository.
""" """
from argparse import Namespace from argparse import Namespace
from dataclasses import asdict from dataclasses import asdict
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
...@@ -44,19 +45,17 @@ class ModelRequestData(NamedTuple): ...@@ -44,19 +45,17 @@ class ModelRequestData(NamedTuple):
def run_e5_v(query: Query) -> ModelRequestData: def run_e5_v(query: Query) -> ModelRequestData:
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501 llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
if query["modality"] == "text": if query["modality"] == "text":
text = query["text"] text = query["text"]
prompt = llama3_template.format( prompt = llama3_template.format(f"{text}\nSummary above sentence in one word: ")
f"{text}\nSummary above sentence in one word: ")
image = None image = None
elif query["modality"] == "image": elif query["modality"] == "image":
prompt = llama3_template.format( prompt = llama3_template.format("<image>\nSummary above image in one word: ")
"<image>\nSummary above image in one word: ")
image = query["image"] image = query["image"]
else: else:
modality = query['modality'] modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'") raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs( engine_args = EngineArgs(
...@@ -83,10 +82,12 @@ def run_vlm2vec(query: Query) -> ModelRequestData: ...@@ -83,10 +82,12 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
image = query["image"] image = query["image"]
elif query["modality"] == "text+image": elif query["modality"] == "text+image":
text = query["text"] text = query["text"]
prompt = f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501 prompt = (
f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501
)
image = query["image"] image = query["image"]
else: else:
modality = query['modality'] modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'") raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs( engine_args = EngineArgs(
...@@ -136,7 +137,8 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): ...@@ -136,7 +137,8 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
# Disable other modalities to save memory # Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0} default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict( req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {}) req_data.engine_args.limit_mm_per_prompt or {}
)
engine_args = asdict(req_data.engine_args) | {"seed": seed} engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args) llm = LLM(**engine_args)
...@@ -145,10 +147,12 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): ...@@ -145,10 +147,12 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
if req_data.image is not None: if req_data.image is not None:
mm_data["image"] = req_data.image mm_data["image"] = req_data.image
outputs = llm.embed({ outputs = llm.embed(
"prompt": req_data.prompt, {
"multi_modal_data": mm_data, "prompt": req_data.prompt,
}) "multi_modal_data": mm_data,
}
)
print("-" * 50) print("-" * 50)
for output in outputs: for output in outputs:
...@@ -164,23 +168,30 @@ model_example_map = { ...@@ -164,23 +168,30 @@ model_example_map = {
def parse_args(): def parse_args():
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with ' description="Demo on using vLLM for offline inference with "
'vision language models for multimodal embedding') "vision language models for multimodal embedding"
parser.add_argument('--model-name', )
'-m', parser.add_argument(
type=str, "--model-name",
default="vlm2vec", "-m",
choices=model_example_map.keys(), type=str,
help='The name of the embedding model.') default="vlm2vec",
parser.add_argument('--modality', choices=model_example_map.keys(),
type=str, help="The name of the embedding model.",
default="image", )
choices=get_args(QueryModality), parser.add_argument(
help='Modality of the input.') "--modality",
parser.add_argument("--seed", type=str,
type=int, default="image",
default=None, choices=get_args(QueryModality),
help="Set the seed when initializing `vllm.LLM`.") help="Modality of the input.",
)
parser.add_argument(
"--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args() return parser.parse_args()
......
...@@ -17,16 +17,15 @@ import requests ...@@ -17,16 +17,15 @@ import requests
def clear_line(n: int = 1) -> None: def clear_line(n: int = 1) -> None:
LINE_UP = '\033[1A' LINE_UP = "\033[1A"
LINE_CLEAR = '\x1b[2K' LINE_CLEAR = "\x1b[2K"
for _ in range(n): for _ in range(n):
print(LINE_UP, end=LINE_CLEAR, flush=True) print(LINE_UP, end=LINE_CLEAR, flush=True)
def post_http_request(prompt: str, def post_http_request(
api_url: str, prompt: str, api_url: str, n: int = 1, stream: bool = False
n: int = 1, ) -> requests.Response:
stream: bool = False) -> requests.Response:
headers = {"User-Agent": "Test Client"} headers = {"User-Agent": "Test Client"}
pload = { pload = {
"prompt": prompt, "prompt": prompt,
...@@ -35,17 +34,14 @@ def post_http_request(prompt: str, ...@@ -35,17 +34,14 @@ def post_http_request(prompt: str,
"max_tokens": 16, "max_tokens": 16,
"stream": stream, "stream": stream,
} }
response = requests.post(api_url, response = requests.post(api_url, headers=headers, json=pload, stream=stream)
headers=headers,
json=pload,
stream=stream)
return response return response
def get_streaming_response(response: requests.Response) -> Iterable[list[str]]: def get_streaming_response(response: requests.Response) -> Iterable[list[str]]:
for chunk in response.iter_lines(chunk_size=8192, for chunk in response.iter_lines(
decode_unicode=False, chunk_size=8192, decode_unicode=False, delimiter=b"\n"
delimiter=b"\n"): ):
if chunk: if chunk:
data = json.loads(chunk.decode("utf-8")) data = json.loads(chunk.decode("utf-8"))
output = data["text"] output = data["text"]
......
...@@ -6,6 +6,7 @@ Note that `pip install cohere` is needed to run this example. ...@@ -6,6 +6,7 @@ Note that `pip install cohere` is needed to run this example.
run: vllm serve BAAI/bge-reranker-base run: vllm serve BAAI/bge-reranker-base
""" """
from typing import Union from typing import Union
import cohere import cohere
...@@ -16,28 +17,28 @@ model = "BAAI/bge-reranker-base" ...@@ -16,28 +17,28 @@ model = "BAAI/bge-reranker-base"
query = "What is the capital of France?" query = "What is the capital of France?"
documents = [ documents = [
"The capital of France is Paris", "Reranking is fun!", "The capital of France is Paris",
"vLLM is an open-source framework for fast AI serving" "Reranking is fun!",
"vLLM is an open-source framework for fast AI serving",
] ]
def cohere_rerank(client: Union[Client, ClientV2], model: str, query: str, def cohere_rerank(
documents: list[str]) -> dict: client: Union[Client, ClientV2], model: str, query: str, documents: list[str]
) -> dict:
return client.rerank(model=model, query=query, documents=documents) return client.rerank(model=model, query=query, documents=documents)
def main(): def main():
# cohere v1 client # cohere v1 client
cohere_v1 = cohere.Client(base_url="http://localhost:8000", cohere_v1 = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key")
api_key="sk-fake-key")
rerank_v1_result = cohere_rerank(cohere_v1, model, query, documents) rerank_v1_result = cohere_rerank(cohere_v1, model, query, documents)
print("-" * 50) print("-" * 50)
print("rerank_v1_result:\n", rerank_v1_result) print("rerank_v1_result:\n", rerank_v1_result)
print("-" * 50) print("-" * 50)
# or the v2 # or the v2
cohere_v2 = cohere.ClientV2("sk-fake-key", cohere_v2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000")
base_url="http://localhost:8000")
rerank_v2_result = cohere_rerank(cohere_v2, model, query, documents) rerank_v2_result = cohere_rerank(cohere_v2, model, query, documents)
print("rerank_v2_result:\n", rerank_v2_result) print("rerank_v2_result:\n", rerank_v2_result)
print("-" * 50) print("-" * 50)
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment