Unverified commit 27bebcd8, authored by Harry Mellor and committed by GitHub
Browse files

Convert `examples` to `ruff-format` (#18400)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent e7523c2e
...@@ -45,8 +45,7 @@ if dist.get_rank() == 0: ...@@ -45,8 +45,7 @@ if dist.get_rank() == 0:
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\n" print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n")
f"Generated text: {generated_text!r}\n")
print("-" * 50) print("-" * 50)
""" """
Further tips: Further tips:
......
...@@ -20,10 +20,12 @@ sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16) ...@@ -20,10 +20,12 @@ sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16)
def main(): def main():
# Set `enforce_eager=True` to avoid ahead-of-time compilation. # Set `enforce_eager=True` to avoid ahead-of-time compilation.
# In real workloads, `enforce_eager` should be `False`. # In real workloads, `enforce_eager` should be `False`.
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct", llm = LLM(
model="Qwen/Qwen2-1.5B-Instruct",
max_num_batched_tokens=64, max_num_batched_tokens=64,
max_num_seqs=4, max_num_seqs=4,
max_model_len=128) max_model_len=128,
)
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
print("-" * 50) print("-" * 50)
for output, answer in zip(outputs, answers): for output, answer in zip(outputs, answers):
......
...@@ -6,6 +6,7 @@ the correct prompt format on vision language models for multimodal embedding. ...@@ -6,6 +6,7 @@ the correct prompt format on vision language models for multimodal embedding.
For most models, the prompt format should follow corresponding examples For most models, the prompt format should follow corresponding examples
on HuggingFace model repository. on HuggingFace model repository.
""" """
from argparse import Namespace from argparse import Namespace
from dataclasses import asdict from dataclasses import asdict
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
...@@ -44,19 +45,17 @@ class ModelRequestData(NamedTuple): ...@@ -44,19 +45,17 @@ class ModelRequestData(NamedTuple):
def run_e5_v(query: Query) -> ModelRequestData: def run_e5_v(query: Query) -> ModelRequestData:
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501 llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
if query["modality"] == "text": if query["modality"] == "text":
text = query["text"] text = query["text"]
prompt = llama3_template.format( prompt = llama3_template.format(f"{text}\nSummary above sentence in one word: ")
f"{text}\nSummary above sentence in one word: ")
image = None image = None
elif query["modality"] == "image": elif query["modality"] == "image":
prompt = llama3_template.format( prompt = llama3_template.format("<image>\nSummary above image in one word: ")
"<image>\nSummary above image in one word: ")
image = query["image"] image = query["image"]
else: else:
modality = query['modality'] modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'") raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs( engine_args = EngineArgs(
...@@ -83,10 +82,12 @@ def run_vlm2vec(query: Query) -> ModelRequestData: ...@@ -83,10 +82,12 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
image = query["image"] image = query["image"]
elif query["modality"] == "text+image": elif query["modality"] == "text+image":
text = query["text"] text = query["text"]
prompt = f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501 prompt = (
f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501
)
image = query["image"] image = query["image"]
else: else:
modality = query['modality'] modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'") raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs( engine_args = EngineArgs(
...@@ -136,7 +137,8 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): ...@@ -136,7 +137,8 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
# Disable other modalities to save memory # Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0} default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict( req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {}) req_data.engine_args.limit_mm_per_prompt or {}
)
engine_args = asdict(req_data.engine_args) | {"seed": seed} engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args) llm = LLM(**engine_args)
...@@ -145,10 +147,12 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): ...@@ -145,10 +147,12 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
if req_data.image is not None: if req_data.image is not None:
mm_data["image"] = req_data.image mm_data["image"] = req_data.image
outputs = llm.embed({ outputs = llm.embed(
{
"prompt": req_data.prompt, "prompt": req_data.prompt,
"multi_modal_data": mm_data, "multi_modal_data": mm_data,
}) }
)
print("-" * 50) print("-" * 50)
for output in outputs: for output in outputs:
...@@ -164,23 +168,30 @@ model_example_map = { ...@@ -164,23 +168,30 @@ model_example_map = {
def parse_args(): def parse_args():
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with ' description="Demo on using vLLM for offline inference with "
'vision language models for multimodal embedding') "vision language models for multimodal embedding"
parser.add_argument('--model-name', )
'-m', parser.add_argument(
"--model-name",
"-m",
type=str, type=str,
default="vlm2vec", default="vlm2vec",
choices=model_example_map.keys(), choices=model_example_map.keys(),
help='The name of the embedding model.') help="The name of the embedding model.",
parser.add_argument('--modality', )
parser.add_argument(
"--modality",
type=str, type=str,
default="image", default="image",
choices=get_args(QueryModality), choices=get_args(QueryModality),
help='Modality of the input.') help="Modality of the input.",
parser.add_argument("--seed", )
parser.add_argument(
"--seed",
type=int, type=int,
default=None, default=None,
help="Set the seed when initializing `vllm.LLM`.") help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args() return parser.parse_args()
......
...@@ -17,16 +17,15 @@ import requests ...@@ -17,16 +17,15 @@ import requests
def clear_line(n: int = 1) -> None: def clear_line(n: int = 1) -> None:
LINE_UP = '\033[1A' LINE_UP = "\033[1A"
LINE_CLEAR = '\x1b[2K' LINE_CLEAR = "\x1b[2K"
for _ in range(n): for _ in range(n):
print(LINE_UP, end=LINE_CLEAR, flush=True) print(LINE_UP, end=LINE_CLEAR, flush=True)
def post_http_request(prompt: str, def post_http_request(
api_url: str, prompt: str, api_url: str, n: int = 1, stream: bool = False
n: int = 1, ) -> requests.Response:
stream: bool = False) -> requests.Response:
headers = {"User-Agent": "Test Client"} headers = {"User-Agent": "Test Client"}
pload = { pload = {
"prompt": prompt, "prompt": prompt,
...@@ -35,17 +34,14 @@ def post_http_request(prompt: str, ...@@ -35,17 +34,14 @@ def post_http_request(prompt: str,
"max_tokens": 16, "max_tokens": 16,
"stream": stream, "stream": stream,
} }
response = requests.post(api_url, response = requests.post(api_url, headers=headers, json=pload, stream=stream)
headers=headers,
json=pload,
stream=stream)
return response return response
def get_streaming_response(response: requests.Response) -> Iterable[list[str]]: def get_streaming_response(response: requests.Response) -> Iterable[list[str]]:
for chunk in response.iter_lines(chunk_size=8192, for chunk in response.iter_lines(
decode_unicode=False, chunk_size=8192, decode_unicode=False, delimiter=b"\n"
delimiter=b"\n"): ):
if chunk: if chunk:
data = json.loads(chunk.decode("utf-8")) data = json.loads(chunk.decode("utf-8"))
output = data["text"] output = data["text"]
......
...@@ -6,6 +6,7 @@ Note that `pip install cohere` is needed to run this example. ...@@ -6,6 +6,7 @@ Note that `pip install cohere` is needed to run this example.
run: vllm serve BAAI/bge-reranker-base run: vllm serve BAAI/bge-reranker-base
""" """
from typing import Union from typing import Union
import cohere import cohere
...@@ -16,28 +17,28 @@ model = "BAAI/bge-reranker-base" ...@@ -16,28 +17,28 @@ model = "BAAI/bge-reranker-base"
query = "What is the capital of France?" query = "What is the capital of France?"
documents = [ documents = [
"The capital of France is Paris", "Reranking is fun!", "The capital of France is Paris",
"vLLM is an open-source framework for fast AI serving" "Reranking is fun!",
"vLLM is an open-source framework for fast AI serving",
] ]
def cohere_rerank(client: Union[Client, ClientV2], model: str, query: str, def cohere_rerank(
documents: list[str]) -> dict: client: Union[Client, ClientV2], model: str, query: str, documents: list[str]
) -> dict:
return client.rerank(model=model, query=query, documents=documents) return client.rerank(model=model, query=query, documents=documents)
def main(): def main():
# cohere v1 client # cohere v1 client
cohere_v1 = cohere.Client(base_url="http://localhost:8000", cohere_v1 = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key")
api_key="sk-fake-key")
rerank_v1_result = cohere_rerank(cohere_v1, model, query, documents) rerank_v1_result = cohere_rerank(cohere_v1, model, query, documents)
print("-" * 50) print("-" * 50)
print("rerank_v1_result:\n", rerank_v1_result) print("rerank_v1_result:\n", rerank_v1_result)
print("-" * 50) print("-" * 50)
# or the v2 # or the v2
cohere_v2 = cohere.ClientV2("sk-fake-key", cohere_v2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000")
base_url="http://localhost:8000")
rerank_v2_result = cohere_rerank(cohere_v2, model, query, documents) rerank_v2_result = cohere_rerank(cohere_v2, model, query, documents)
print("rerank_v2_result:\n", rerank_v2_result) print("rerank_v2_result:\n", rerank_v2_result)
print("-" * 50) print("-" * 50)
......
...@@ -17,6 +17,7 @@ you can install it manually by following these steps: ...@@ -17,6 +17,7 @@ you can install it manually by following these steps:
2. Rename the downloaded file to: frpc_linux_amd64_v0.3 2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc 3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
""" """
import argparse import argparse
import gradio as gr import gradio as gr
...@@ -24,16 +25,12 @@ from openai import OpenAI ...@@ -24,16 +25,12 @@ from openai import OpenAI
def format_history_to_openai(history): def format_history_to_openai(history):
history_openai_format = [{ history_openai_format = [
"role": "system", {"role": "system", "content": "You are a great AI assistant."}
"content": "You are a great AI assistant." ]
}]
for human, assistant in history: for human, assistant in history:
history_openai_format.append({"role": "user", "content": human}) history_openai_format.append({"role": "user", "content": human})
history_openai_format.append({ history_openai_format.append({"role": "assistant", "content": assistant})
"role": "assistant",
"content": assistant
})
return history_openai_format return history_openai_format
...@@ -49,17 +46,17 @@ def predict(message, history, client, model_name, temp, stop_token_ids): ...@@ -49,17 +46,17 @@ def predict(message, history, client, model_name, temp, stop_token_ids):
temperature=temp, temperature=temp,
stream=True, stream=True,
extra_body={ extra_body={
'repetition_penalty': "repetition_penalty": 1,
1, "stop_token_ids": [int(id.strip()) for id in stop_token_ids.split(",")]
'stop_token_ids': if stop_token_ids
[int(id.strip()) else [],
for id in stop_token_ids.split(',')] if stop_token_ids else [] },
}) )
# Collect all chunks and concatenate them into a full message # Collect all chunks and concatenate them into a full message
full_message = "" full_message = ""
for chunk in stream: for chunk in stream:
full_message += (chunk.choices[0].delta.content or "") full_message += chunk.choices[0].delta.content or ""
# Return the full message as a single response # Return the full message as a single response
return full_message return full_message
...@@ -67,38 +64,34 @@ def predict(message, history, client, model_name, temp, stop_token_ids): ...@@ -67,38 +64,34 @@ def predict(message, history, client, model_name, temp, stop_token_ids):
def parse_args(): def parse_args():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description='Chatbot Interface with Customizable Parameters') description="Chatbot Interface with Customizable Parameters"
parser.add_argument('--model-url', )
type=str, parser.add_argument(
default='http://localhost:8000/v1', "--model-url", type=str, default="http://localhost:8000/v1", help="Model URL"
help='Model URL') )
parser.add_argument('-m', parser.add_argument(
'--model', "-m", "--model", type=str, required=True, help="Model name for the chatbot"
type=str, )
required=True, parser.add_argument(
help='Model name for the chatbot') "--temp", type=float, default=0.8, help="Temperature for text generation"
parser.add_argument('--temp', )
type=float, parser.add_argument(
default=0.8, "--stop-token-ids", type=str, default="", help="Comma-separated stop token IDs"
help='Temperature for text generation') )
parser.add_argument('--stop-token-ids',
type=str,
default='',
help='Comma-separated stop token IDs')
parser.add_argument("--host", type=str, default=None) parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001) parser.add_argument("--port", type=int, default=8001)
return parser.parse_args() return parser.parse_args()
def build_gradio_interface(client, model_name, temp, stop_token_ids): def build_gradio_interface(client, model_name, temp, stop_token_ids):
def chat_predict(message, history): def chat_predict(message, history):
return predict(message, history, client, model_name, temp, return predict(message, history, client, model_name, temp, stop_token_ids)
stop_token_ids)
return gr.ChatInterface(fn=chat_predict, return gr.ChatInterface(
fn=chat_predict,
title="Chatbot Interface", title="Chatbot Interface",
description="A simple chatbot powered by vLLM") description="A simple chatbot powered by vLLM",
)
def main(): def main():
...@@ -113,12 +106,13 @@ def main(): ...@@ -113,12 +106,13 @@ def main():
client = OpenAI(api_key=openai_api_key, base_url=openai_api_base) client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
# Define the Gradio chatbot interface using the predict function # Define the Gradio chatbot interface using the predict function
gradio_interface = build_gradio_interface(client, args.model, args.temp, gradio_interface = build_gradio_interface(
args.stop_token_ids) client, args.model, args.temp, args.stop_token_ids
)
gradio_interface.queue().launch(server_name=args.host, gradio_interface.queue().launch(
server_port=args.port, server_name=args.host, server_port=args.port, share=True
share=True) )
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -17,6 +17,7 @@ you can install it manually by following these steps: ...@@ -17,6 +17,7 @@ you can install it manually by following these steps:
2. Rename the downloaded file to: frpc_linux_amd64_v0.3 2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc 3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
""" """
import argparse import argparse
import json import json
...@@ -31,14 +32,11 @@ def http_bot(prompt): ...@@ -31,14 +32,11 @@ def http_bot(prompt):
"stream": True, "stream": True,
"max_tokens": 128, "max_tokens": 128,
} }
response = requests.post(args.model_url, response = requests.post(args.model_url, headers=headers, json=pload, stream=True)
headers=headers,
json=pload, for chunk in response.iter_lines(
stream=True) chunk_size=8192, decode_unicode=False, delimiter=b"\n"
):
for chunk in response.iter_lines(chunk_size=8192,
decode_unicode=False,
delimiter=b"\n"):
if chunk: if chunk:
data = json.loads(chunk.decode("utf-8")) data = json.loads(chunk.decode("utf-8"))
output = data["text"][0] output = data["text"][0]
...@@ -48,10 +46,10 @@ def http_bot(prompt): ...@@ -48,10 +46,10 @@ def http_bot(prompt):
def build_demo(): def build_demo():
with gr.Blocks() as demo: with gr.Blocks() as demo:
gr.Markdown("# vLLM text completion demo\n") gr.Markdown("# vLLM text completion demo\n")
inputbox = gr.Textbox(label="Input", inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER")
placeholder="Enter text and press ENTER") outputbox = gr.Textbox(
outputbox = gr.Textbox(label="Output", label="Output", placeholder="Generated result from the model"
placeholder="Generated result from the model") )
inputbox.submit(http_bot, [inputbox], [outputbox]) inputbox.submit(http_bot, [inputbox], [outputbox])
return demo return demo
...@@ -60,17 +58,15 @@ def parse_args(): ...@@ -60,17 +58,15 @@ def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default=None) parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001) parser.add_argument("--port", type=int, default=8001)
parser.add_argument("--model-url", parser.add_argument(
type=str, "--model-url", type=str, default="http://localhost:8000/generate"
default="http://localhost:8000/generate") )
return parser.parse_args() return parser.parse_args()
def main(args): def main(args):
demo = build_demo() demo = build_demo()
demo.queue().launch(server_name=args.host, demo.queue().launch(server_name=args.host, server_port=args.port, share=True)
server_port=args.port,
share=True)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -5,6 +5,7 @@ Jina and Cohere https://jina.ai/reranker ...@@ -5,6 +5,7 @@ Jina and Cohere https://jina.ai/reranker
run: vllm serve BAAI/bge-reranker-base run: vllm serve BAAI/bge-reranker-base
""" """
import json import json
import requests import requests
...@@ -14,14 +15,13 @@ url = "http://127.0.0.1:8000/rerank" ...@@ -14,14 +15,13 @@ url = "http://127.0.0.1:8000/rerank"
headers = {"accept": "application/json", "Content-Type": "application/json"} headers = {"accept": "application/json", "Content-Type": "application/json"}
data = { data = {
"model": "model": "BAAI/bge-reranker-base",
"BAAI/bge-reranker-base", "query": "What is the capital of France?",
"query":
"What is the capital of France?",
"documents": [ "documents": [
"The capital of Brazil is Brasilia.", "The capital of Brazil is Brasilia.",
"The capital of France is Paris.", "Horses and cows are both animals" "The capital of France is Paris.",
] "Horses and cows are both animals",
],
} }
......
...@@ -9,17 +9,14 @@ from msgspec.msgpack import Decoder ...@@ -9,17 +9,14 @@ from msgspec.msgpack import Decoder
# #
# Types copied from vllm.distributed.kv_events # Types copied from vllm.distributed.kv_events
# #
class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True, class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True, gc=False):
gc=False):
ts: float ts: float
events: list[Any] events: list[Any]
class KVCacheEvent(msgspec.Struct, class KVCacheEvent(
array_like=True, msgspec.Struct, array_like=True, omit_defaults=True, gc=False, tag=True
omit_defaults=True, ):
gc=False,
tag=True):
"""Base class for all KV cache-related events""" """Base class for all KV cache-related events"""
...@@ -77,8 +74,9 @@ def main(): ...@@ -77,8 +74,9 @@ def main():
if last_seq >= 0 and seq > last_seq + 1: if last_seq >= 0 and seq > last_seq + 1:
missed = seq - last_seq - 1 missed = seq - last_seq - 1
print(f"Missed {missed} messages" print(
f" (last: {last_seq}, current: {seq})") f"Missed {missed} messages (last: {last_seq}, current: {seq})"
)
replay.send((last_seq + 1).to_bytes(8, "big")) replay.send((last_seq + 1).to_bytes(8, "big"))
......
...@@ -12,26 +12,22 @@ from openai import OpenAI ...@@ -12,26 +12,22 @@ from openai import OpenAI
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
messages = [{ messages = [
"role": "system", {"role": "system", "content": "You are a helpful assistant."},
"content": "You are a helpful assistant." {"role": "user", "content": "Who won the world series in 2020?"},
}, { {
"role": "user",
"content": "Who won the world series in 2020?"
}, {
"role": "assistant", "role": "assistant",
"content": "The Los Angeles Dodgers won the World Series in 2020." "content": "The Los Angeles Dodgers won the World Series in 2020.",
}, { },
"role": "user", {"role": "user", "content": "Where was it played?"},
"content": "Where was it played?" ]
}]
def parse_args(): def parse_args():
parser = argparse.ArgumentParser(description="Client for vLLM API server") parser = argparse.ArgumentParser(description="Client for vLLM API server")
parser.add_argument("--stream", parser.add_argument(
action="store_true", "--stream", action="store_true", help="Enable streaming response"
help="Enable streaming response") )
return parser.parse_args() return parser.parse_args()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment