Unverified Commit 27bebcd8 authored by Harry Mellor's avatar Harry Mellor Committed by GitHub
Browse files

Convert `examples` to `ruff-format` (#18400)


Signed-off-by: default avatarHarry Mellor <19981378+hmellor@users.noreply.github.com>
parent e7523c2e
......@@ -45,8 +45,7 @@ if dist.get_rank() == 0:
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}\n")
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n")
print("-" * 50)
"""
Further tips:
......
......@@ -20,10 +20,12 @@ sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16)
def main():
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
# In real workloads, `enforace_eager` should be `False`.
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
llm = LLM(
model="Qwen/Qwen2-1.5B-Instruct",
max_num_batched_tokens=64,
max_num_seqs=4,
max_model_len=128)
max_model_len=128,
)
outputs = llm.generate(prompts, sampling_params)
print("-" * 50)
for output, answer in zip(outputs, answers):
......
......@@ -6,6 +6,7 @@ the correct prompt format on vision language models for multimodal embedding.
For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
from argparse import Namespace
from dataclasses import asdict
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
......@@ -44,19 +45,17 @@ class ModelRequestData(NamedTuple):
def run_e5_v(query: Query) -> ModelRequestData:
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501
llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
if query["modality"] == "text":
text = query["text"]
prompt = llama3_template.format(
f"{text}\nSummary above sentence in one word: ")
prompt = llama3_template.format(f"{text}\nSummary above sentence in one word: ")
image = None
elif query["modality"] == "image":
prompt = llama3_template.format(
"<image>\nSummary above image in one word: ")
prompt = llama3_template.format("<image>\nSummary above image in one word: ")
image = query["image"]
else:
modality = query['modality']
modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs(
......@@ -83,10 +82,12 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
image = query["image"]
elif query["modality"] == "text+image":
text = query["text"]
prompt = f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501
prompt = (
f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501
)
image = query["image"]
else:
modality = query['modality']
modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs(
......@@ -136,7 +137,8 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {})
req_data.engine_args.limit_mm_per_prompt or {}
)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args)
......@@ -145,10 +147,12 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
if req_data.image is not None:
mm_data["image"] = req_data.image
outputs = llm.embed({
outputs = llm.embed(
{
"prompt": req_data.prompt,
"multi_modal_data": mm_data,
})
}
)
print("-" * 50)
for output in outputs:
......@@ -164,23 +168,30 @@ model_example_map = {
def parse_args():
parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with '
'vision language models for multimodal embedding')
parser.add_argument('--model-name',
'-m',
description="Demo on using vLLM for offline inference with "
"vision language models for multimodal embedding"
)
parser.add_argument(
"--model-name",
"-m",
type=str,
default="vlm2vec",
choices=model_example_map.keys(),
help='The name of the embedding model.')
parser.add_argument('--modality',
help="The name of the embedding model.",
)
parser.add_argument(
"--modality",
type=str,
default="image",
choices=get_args(QueryModality),
help='Modality of the input.')
parser.add_argument("--seed",
help="Modality of the input.",
)
parser.add_argument(
"--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args()
......
......@@ -17,16 +17,15 @@ import requests
def clear_line(n: int = 1) -> None:
LINE_UP = '\033[1A'
LINE_CLEAR = '\x1b[2K'
LINE_UP = "\033[1A"
LINE_CLEAR = "\x1b[2K"
for _ in range(n):
print(LINE_UP, end=LINE_CLEAR, flush=True)
def post_http_request(prompt: str,
api_url: str,
n: int = 1,
stream: bool = False) -> requests.Response:
def post_http_request(
prompt: str, api_url: str, n: int = 1, stream: bool = False
) -> requests.Response:
headers = {"User-Agent": "Test Client"}
pload = {
"prompt": prompt,
......@@ -35,17 +34,14 @@ def post_http_request(prompt: str,
"max_tokens": 16,
"stream": stream,
}
response = requests.post(api_url,
headers=headers,
json=pload,
stream=stream)
response = requests.post(api_url, headers=headers, json=pload, stream=stream)
return response
def get_streaming_response(response: requests.Response) -> Iterable[list[str]]:
for chunk in response.iter_lines(chunk_size=8192,
decode_unicode=False,
delimiter=b"\n"):
for chunk in response.iter_lines(
chunk_size=8192, decode_unicode=False, delimiter=b"\n"
):
if chunk:
data = json.loads(chunk.decode("utf-8"))
output = data["text"]
......
......@@ -6,6 +6,7 @@ Note that `pip install cohere` is needed to run this example.
run: vllm serve BAAI/bge-reranker-base
"""
from typing import Union
import cohere
......@@ -16,28 +17,28 @@ model = "BAAI/bge-reranker-base"
query = "What is the capital of France?"
documents = [
"The capital of France is Paris", "Reranking is fun!",
"vLLM is an open-source framework for fast AI serving"
"The capital of France is Paris",
"Reranking is fun!",
"vLLM is an open-source framework for fast AI serving",
]
def cohere_rerank(client: Union[Client, ClientV2], model: str, query: str,
documents: list[str]) -> dict:
def cohere_rerank(
client: Union[Client, ClientV2], model: str, query: str, documents: list[str]
) -> dict:
return client.rerank(model=model, query=query, documents=documents)
def main():
# cohere v1 client
cohere_v1 = cohere.Client(base_url="http://localhost:8000",
api_key="sk-fake-key")
cohere_v1 = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key")
rerank_v1_result = cohere_rerank(cohere_v1, model, query, documents)
print("-" * 50)
print("rerank_v1_result:\n", rerank_v1_result)
print("-" * 50)
# or the v2
cohere_v2 = cohere.ClientV2("sk-fake-key",
base_url="http://localhost:8000")
cohere_v2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000")
rerank_v2_result = cohere_rerank(cohere_v2, model, query, documents)
print("rerank_v2_result:\n", rerank_v2_result)
print("-" * 50)
......
......@@ -17,6 +17,7 @@ you can install it manually by following these steps:
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
"""
import argparse
import gradio as gr
......@@ -24,16 +25,12 @@ from openai import OpenAI
def format_history_to_openai(history):
history_openai_format = [{
"role": "system",
"content": "You are a great AI assistant."
}]
history_openai_format = [
{"role": "system", "content": "You are a great AI assistant."}
]
for human, assistant in history:
history_openai_format.append({"role": "user", "content": human})
history_openai_format.append({
"role": "assistant",
"content": assistant
})
history_openai_format.append({"role": "assistant", "content": assistant})
return history_openai_format
......@@ -49,17 +46,17 @@ def predict(message, history, client, model_name, temp, stop_token_ids):
temperature=temp,
stream=True,
extra_body={
'repetition_penalty':
1,
'stop_token_ids':
[int(id.strip())
for id in stop_token_ids.split(',')] if stop_token_ids else []
})
"repetition_penalty": 1,
"stop_token_ids": [int(id.strip()) for id in stop_token_ids.split(",")]
if stop_token_ids
else [],
},
)
# Collect all chunks and concatenate them into a full message
full_message = ""
for chunk in stream:
full_message += (chunk.choices[0].delta.content or "")
full_message += chunk.choices[0].delta.content or ""
# Return the full message as a single response
return full_message
......@@ -67,38 +64,34 @@ def predict(message, history, client, model_name, temp, stop_token_ids):
def parse_args():
parser = argparse.ArgumentParser(
description='Chatbot Interface with Customizable Parameters')
parser.add_argument('--model-url',
type=str,
default='http://localhost:8000/v1',
help='Model URL')
parser.add_argument('-m',
'--model',
type=str,
required=True,
help='Model name for the chatbot')
parser.add_argument('--temp',
type=float,
default=0.8,
help='Temperature for text generation')
parser.add_argument('--stop-token-ids',
type=str,
default='',
help='Comma-separated stop token IDs')
description="Chatbot Interface with Customizable Parameters"
)
parser.add_argument(
"--model-url", type=str, default="http://localhost:8000/v1", help="Model URL"
)
parser.add_argument(
"-m", "--model", type=str, required=True, help="Model name for the chatbot"
)
parser.add_argument(
"--temp", type=float, default=0.8, help="Temperature for text generation"
)
parser.add_argument(
"--stop-token-ids", type=str, default="", help="Comma-separated stop token IDs"
)
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001)
return parser.parse_args()
def build_gradio_interface(client, model_name, temp, stop_token_ids):
def chat_predict(message, history):
return predict(message, history, client, model_name, temp,
stop_token_ids)
return predict(message, history, client, model_name, temp, stop_token_ids)
return gr.ChatInterface(fn=chat_predict,
return gr.ChatInterface(
fn=chat_predict,
title="Chatbot Interface",
description="A simple chatbot powered by vLLM")
description="A simple chatbot powered by vLLM",
)
def main():
......@@ -113,12 +106,13 @@ def main():
client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
# Define the Gradio chatbot interface using the predict function
gradio_interface = build_gradio_interface(client, args.model, args.temp,
args.stop_token_ids)
gradio_interface = build_gradio_interface(
client, args.model, args.temp, args.stop_token_ids
)
gradio_interface.queue().launch(server_name=args.host,
server_port=args.port,
share=True)
gradio_interface.queue().launch(
server_name=args.host, server_port=args.port, share=True
)
if __name__ == "__main__":
......
......@@ -17,6 +17,7 @@ you can install it manually by following these steps:
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
"""
import argparse
import json
......@@ -31,14 +32,11 @@ def http_bot(prompt):
"stream": True,
"max_tokens": 128,
}
response = requests.post(args.model_url,
headers=headers,
json=pload,
stream=True)
for chunk in response.iter_lines(chunk_size=8192,
decode_unicode=False,
delimiter=b"\n"):
response = requests.post(args.model_url, headers=headers, json=pload, stream=True)
for chunk in response.iter_lines(
chunk_size=8192, decode_unicode=False, delimiter=b"\n"
):
if chunk:
data = json.loads(chunk.decode("utf-8"))
output = data["text"][0]
......@@ -48,10 +46,10 @@ def http_bot(prompt):
def build_demo():
with gr.Blocks() as demo:
gr.Markdown("# vLLM text completion demo\n")
inputbox = gr.Textbox(label="Input",
placeholder="Enter text and press ENTER")
outputbox = gr.Textbox(label="Output",
placeholder="Generated result from the model")
inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER")
outputbox = gr.Textbox(
label="Output", placeholder="Generated result from the model"
)
inputbox.submit(http_bot, [inputbox], [outputbox])
return demo
......@@ -60,17 +58,15 @@ def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001)
parser.add_argument("--model-url",
type=str,
default="http://localhost:8000/generate")
parser.add_argument(
"--model-url", type=str, default="http://localhost:8000/generate"
)
return parser.parse_args()
def main(args):
demo = build_demo()
demo.queue().launch(server_name=args.host,
server_port=args.port,
share=True)
demo.queue().launch(server_name=args.host, server_port=args.port, share=True)
if __name__ == "__main__":
......
......@@ -5,6 +5,7 @@ Jina and Cohere https://jina.ai/reranker
run: vllm serve BAAI/bge-reranker-base
"""
import json
import requests
......@@ -14,14 +15,13 @@ url = "http://127.0.0.1:8000/rerank"
headers = {"accept": "application/json", "Content-Type": "application/json"}
data = {
"model":
"BAAI/bge-reranker-base",
"query":
"What is the capital of France?",
"model": "BAAI/bge-reranker-base",
"query": "What is the capital of France?",
"documents": [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.", "Horses and cows are both animals"
]
"The capital of France is Paris.",
"Horses and cows are both animals",
],
}
......
......@@ -9,17 +9,14 @@ from msgspec.msgpack import Decoder
#
# Types copied from vllm.distributed.kv_events
#
class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True,
gc=False):
class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True, gc=False):
ts: float
events: list[Any]
class KVCacheEvent(msgspec.Struct,
array_like=True,
omit_defaults=True,
gc=False,
tag=True):
class KVCacheEvent(
msgspec.Struct, array_like=True, omit_defaults=True, gc=False, tag=True
):
"""Base class for all KV cache-related events"""
......@@ -77,8 +74,9 @@ def main():
if last_seq >= 0 and seq > last_seq + 1:
missed = seq - last_seq - 1
print(f"Missed {missed} messages"
f" (last: {last_seq}, current: {seq})")
print(
f"Missed {missed} messages (last: {last_seq}, current: {seq})"
)
replay.send((last_seq + 1).to_bytes(8, "big"))
......
......@@ -12,26 +12,22 @@ from openai import OpenAI
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
messages = [{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "Who won the world series in 2020?"
}, {
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Who won the world series in 2020?"},
{
"role": "assistant",
"content": "The Los Angeles Dodgers won the World Series in 2020."
}, {
"role": "user",
"content": "Where was it played?"
}]
"content": "The Los Angeles Dodgers won the World Series in 2020.",
},
{"role": "user", "content": "Where was it played?"},
]
def parse_args():
parser = argparse.ArgumentParser(description="Client for vLLM API server")
parser.add_argument("--stream",
action="store_true",
help="Enable streaming response")
parser.add_argument(
"--stream", action="store_true", help="Enable streaming response"
)
return parser.parse_args()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment