Unverified commit 71689204, authored by Reid, committed by GitHub
Browse files

[Misc] refactor examples series (#16708)


Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
parent 21378a23
...@@ -50,6 +50,13 @@ def initialize_engine(args: argparse.Namespace) -> LLMEngine: ...@@ -50,6 +50,13 @@ def initialize_engine(args: argparse.Namespace) -> LLMEngine:
return LLMEngine.from_engine_args(engine_args) return LLMEngine.from_engine_args(engine_args)
def parse_args():
parser = FlexibleArgumentParser(
description='Demo on using the LLMEngine class directly')
parser = EngineArgs.add_cli_args(parser)
return parser.parse_args()
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
"""Main function that sets up and runs the prompt processing.""" """Main function that sets up and runs the prompt processing."""
engine = initialize_engine(args) engine = initialize_engine(args)
...@@ -58,8 +65,5 @@ def main(args: argparse.Namespace): ...@@ -58,8 +65,5 @@ def main(args: argparse.Namespace):
if __name__ == '__main__': if __name__ == '__main__':
parser = FlexibleArgumentParser( args = parse_args()
description='Demo on using the LLMEngine class directly')
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
main(args) main(args)
...@@ -23,10 +23,6 @@ import gradio as gr ...@@ -23,10 +23,6 @@ import gradio as gr
from openai import OpenAI from openai import OpenAI
def create_openai_client(api_key, base_url):
return OpenAI(api_key=api_key, base_url=base_url)
def format_history_to_openai(history): def format_history_to_openai(history):
history_openai_format = [{ history_openai_format = [{
"role": "system", "role": "system",
......
...@@ -303,12 +303,7 @@ example_function_map = { ...@@ -303,12 +303,7 @@ example_function_map = {
} }
def main(args) -> None: def parse_args():
chat_type = args.chat_type
example_function_map[chat_type]()
if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description='Demo on using OpenAI client for online serving with ' description='Demo on using OpenAI client for online serving with '
'multimodal language models served with vLLM.') 'multimodal language models served with vLLM.')
...@@ -318,5 +313,14 @@ if __name__ == "__main__": ...@@ -318,5 +313,14 @@ if __name__ == "__main__":
default="single-image", default="single-image",
choices=list(example_function_map.keys()), choices=list(example_function_map.keys()),
help='Conversation type with multimodal data.') help='Conversation type with multimodal data.')
args = parser.parse_args() return parser.parse_args()
def main(args) -> None:
chat_type = args.chat_type
example_function_map[chat_type]()
if __name__ == "__main__":
args = parse_args()
main(args) main(args)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
""" """
To run this example, you can start the vLLM server To run this example, you can start the vLLM server
without any specific flags: without any specific flags:
```bash ```bash
...@@ -8,7 +8,7 @@ VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \ ...@@ -8,7 +8,7 @@ VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
--guided-decoding-backend outlines --guided-decoding-backend outlines
``` ```
This example demonstrates how to generate chat completions This example demonstrates how to generate chat completions
using the OpenAI Python client library. using the OpenAI Python client library.
""" """
...@@ -18,15 +18,6 @@ from openai import OpenAI ...@@ -18,15 +18,6 @@ from openai import OpenAI
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
tools = [ tools = [
{ {
"type": "function", "type": "function",
...@@ -116,21 +107,36 @@ messages = [ ...@@ -116,21 +107,36 @@ messages = [
}, },
] ]
chat_completion = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
tool_choice="required",
stream=True # Enable streaming response
)
for chunk in chat_completion: def main():
if chunk.choices and chunk.choices[0].delta.tool_calls: client = OpenAI(
print(chunk.choices[0].delta.tool_calls) # defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
chat_completion = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
tool_choice="required",
stream=True # Enable streaming response
)
for chunk in chat_completion:
if chunk.choices and chunk.choices[0].delta.tool_calls:
print(chunk.choices[0].delta.tool_calls)
chat_completion = client.chat.completions.create(messages=messages,
model=model,
tools=tools,
tool_choice="required")
print(chat_completion.choices[0].message.tool_calls)
chat_completion = client.chat.completions.create(messages=messages,
model=model,
tools=tools,
tool_choice="required")
print(chat_completion.choices[0].message.tool_calls) if __name__ == "__main__":
main()
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
An example shows how to generate chat completions from reasoning models An example shows how to generate chat completions from reasoning models
like DeepSeekR1. like DeepSeekR1.
To run this example, you need to start the vLLM server with the reasoning To run this example, you need to start the vLLM server
parser: with the reasoning parser:
```bash ```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
...@@ -21,35 +21,44 @@ from openai import OpenAI ...@@ -21,35 +21,44 @@ from openai import OpenAI
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list() def main():
model = models.data[0].id client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
# Round 1 models = client.models.list()
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] model = models.data[0].id
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
response = client.chat.completions.create(model=model, messages=messages)
reasoning_content = response.choices[0].message.reasoning_content # Round 1
content = response.choices[0].message.content messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# ruff: noqa: E501
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
response = client.chat.completions.create(model=model, messages=messages)
print("reasoning_content for Round 1:", reasoning_content) reasoning_content = response.choices[0].message.reasoning_content
print("content for Round 1:", content) content = response.choices[0].message.content
# Round 2 print("reasoning_content for Round 1:", reasoning_content)
messages.append({"role": "assistant", "content": content}) print("content for Round 1:", content)
messages.append({
"role": "user",
"content": "How many Rs are there in the word 'strawberry'?",
})
response = client.chat.completions.create(model=model, messages=messages)
reasoning_content = response.choices[0].message.reasoning_content # Round 2
content = response.choices[0].message.content messages.append({"role": "assistant", "content": content})
messages.append({
"role":
"user",
"content":
"How many Rs are there in the word 'strawberry'?",
})
response = client.chat.completions.create(model=model, messages=messages)
print("reasoning_content for Round 2:", reasoning_content) reasoning_content = response.choices[0].message.reasoning_content
print("content for Round 2:", content) content = response.choices[0].message.content
print("reasoning_content for Round 2:", reasoning_content)
print("content for Round 2:", content)
if __name__ == "__main__":
main()
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
An example shows how to generate chat completions from reasoning models An example shows how to generate chat completions from reasoning models
like DeepSeekR1. like DeepSeekR1.
To run this example, you need to start the vLLM server with the reasoning To run this example, you need to start the vLLM server with the reasoning
parser: parser:
```bash ```bash
...@@ -29,41 +29,49 @@ from openai import OpenAI ...@@ -29,41 +29,49 @@ from openai import OpenAI
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI( messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] def main():
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` client = OpenAI(
stream = client.chat.completions.create(model=model, api_key=openai_api_key,
messages=messages, base_url=openai_api_base,
stream=True) )
print("client: Start streaming chat completions...") models = client.models.list()
printed_reasoning_content = False model = models.data[0].id
printed_content = False
# ruff: noqa: E501
for chunk in stream: # For granite: add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
reasoning_content = None stream = client.chat.completions.create(model=model,
content = None messages=messages,
# Check the content is reasoning_content or content stream=True)
if hasattr(chunk.choices[0].delta, "reasoning_content"):
reasoning_content = chunk.choices[0].delta.reasoning_content print("client: Start streaming chat completions...")
elif hasattr(chunk.choices[0].delta, "content"): printed_reasoning_content = False
content = chunk.choices[0].delta.content printed_content = False
if reasoning_content is not None: for chunk in stream:
if not printed_reasoning_content: reasoning_content = None
printed_reasoning_content = True content = None
print("reasoning_content:", end="", flush=True) # Check the content is reasoning_content or content
print(reasoning_content, end="", flush=True) if hasattr(chunk.choices[0].delta, "reasoning_content"):
elif content is not None: reasoning_content = chunk.choices[0].delta.reasoning_content
if not printed_content: elif hasattr(chunk.choices[0].delta, "content"):
printed_content = True content = chunk.choices[0].delta.content
print("\ncontent:", end="", flush=True)
# Extract and print the content if reasoning_content is not None:
print(content, end="", flush=True) if not printed_reasoning_content:
printed_reasoning_content = True
print("reasoning_content:", end="", flush=True)
print(reasoning_content, end="", flush=True)
elif content is not None:
if not printed_content:
printed_content = True
print("\ncontent:", end="", flush=True)
# Extract and print the content
print(content, end="", flush=True)
if __name__ == "__main__":
main()
...@@ -98,7 +98,7 @@ def dse_qwen2_vl(inp: dict): ...@@ -98,7 +98,7 @@ def dse_qwen2_vl(inp: dict):
print("Embedding output:", response_json["data"][0]["embedding"]) print("Embedding output:", response_json["data"][0]["embedding"])
if __name__ == '__main__': def parse_args():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
"Script to call a specified VLM through the API. Make sure to serve " "Script to call a specified VLM through the API. Make sure to serve "
"the model with --task embed before running this.") "the model with --task embed before running this.")
...@@ -107,8 +107,10 @@ if __name__ == '__main__': ...@@ -107,8 +107,10 @@ if __name__ == '__main__':
choices=["vlm2vec", "dse_qwen2_vl"], choices=["vlm2vec", "dse_qwen2_vl"],
required=True, required=True,
help="Which model to call.") help="Which model to call.")
args = parser.parse_args() return parser.parse_args()
def main(args):
if args.model == "vlm2vec": if args.model == "vlm2vec":
vlm2vec() vlm2vec()
elif args.model == "dse_qwen2_vl": elif args.model == "dse_qwen2_vl":
...@@ -120,3 +122,8 @@ if __name__ == '__main__': ...@@ -120,3 +122,8 @@ if __name__ == '__main__':
"type": "text", "type": "text",
"content": "What is the weather like today?", "content": "What is the weather like today?",
}) })
if __name__ == '__main__':
args = parse_args()
main(args)
...@@ -6,28 +6,36 @@ from openai import OpenAI ...@@ -6,28 +6,36 @@ from openai import OpenAI
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY") def main():
api_key=openai_api_key, client = OpenAI(
base_url=openai_api_base, # defaults to os.environ.get("OPENAI_API_KEY")
) api_key=openai_api_key,
base_url=openai_api_base,
models = client.models.list() )
model = models.data[0].id
models = client.models.list()
# Completion API model = models.data[0].id
stream = False
completion = client.completions.create( # Completion API
model=model, stream = False
prompt="A robot may not injure a human being", completion = client.completions.create(
echo=False, model=model,
n=2, prompt="A robot may not injure a human being",
stream=stream, echo=False,
logprobs=3) n=2,
stream=stream,
print("Completion results:") logprobs=3)
if stream:
for c in completion: print("-" * 50)
print(c) print("Completion results:")
else: if stream:
print(completion) for c in completion:
print(c)
else:
print(completion)
print("-" * 50)
if __name__ == "__main__":
main()
...@@ -16,13 +16,15 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response: ...@@ -16,13 +16,15 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response:
return response return response
if __name__ == "__main__": def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000) parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3") parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3")
return parser.parse_args()
args = parser.parse_args() def main(args):
api_url = f"http://{args.host}:{args.port}/score" api_url = f"http://{args.host}:{args.port}/score"
model_name = args.model model_name = args.model
...@@ -30,9 +32,9 @@ if __name__ == "__main__": ...@@ -30,9 +32,9 @@ if __name__ == "__main__":
text_2 = "The capital of Brazil is Brasilia." text_2 = "The capital of Brazil is Brasilia."
prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
score_response = post_http_request(prompt=prompt, api_url=api_url) score_response = post_http_request(prompt=prompt, api_url=api_url)
print("Prompt when text_1 and text_2 are both strings:") print("\nPrompt when text_1 and text_2 are both strings:")
pprint.pprint(prompt) pprint.pprint(prompt)
print("Score Response:") print("\nScore Response:")
pprint.pprint(score_response.json()) pprint.pprint(score_response.json())
text_1 = "What is the capital of France?" text_1 = "What is the capital of France?"
...@@ -41,9 +43,9 @@ if __name__ == "__main__": ...@@ -41,9 +43,9 @@ if __name__ == "__main__":
] ]
prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
score_response = post_http_request(prompt=prompt, api_url=api_url) score_response = post_http_request(prompt=prompt, api_url=api_url)
print("Prompt when text_1 is string and text_2 is a list:") print("\nPrompt when text_1 is string and text_2 is a list:")
pprint.pprint(prompt) pprint.pprint(prompt)
print("Score Response:") print("\nScore Response:")
pprint.pprint(score_response.json()) pprint.pprint(score_response.json())
text_1 = [ text_1 = [
...@@ -54,7 +56,12 @@ if __name__ == "__main__": ...@@ -54,7 +56,12 @@ if __name__ == "__main__":
] ]
prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
score_response = post_http_request(prompt=prompt, api_url=api_url) score_response = post_http_request(prompt=prompt, api_url=api_url)
print("Prompt when text_1 and text_2 are both lists:") print("\nPrompt when text_1 and text_2 are both lists:")
pprint.pprint(prompt) pprint.pprint(prompt)
print("Score Response:") print("\nScore Response:")
pprint.pprint(score_response.json()) pprint.pprint(score_response.json())
if __name__ == "__main__":
args = parse_args()
main(args)
...@@ -6,22 +6,29 @@ from openai import OpenAI ...@@ -6,22 +6,29 @@ from openai import OpenAI
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY") def main():
api_key=openai_api_key, client = OpenAI(
base_url=openai_api_base, # defaults to os.environ.get("OPENAI_API_KEY")
) api_key=openai_api_key,
base_url=openai_api_base,
models = client.models.list() )
model = models.data[0].id
models = client.models.list()
responses = client.embeddings.create( model = models.data[0].id
input=[
"Hello my name is", responses = client.embeddings.create(
"The best thing about vLLM is that it supports many different models" # ruff: noqa: E501
], input=[
model=model, "Hello my name is",
) "The best thing about vLLM is that it supports many different models"
],
for data in responses.data: model=model,
print(data.embedding) # List of float of len 4096 )
for data in responses.data:
print(data.embedding) # List of float of len 4096
if __name__ == "__main__":
main()
...@@ -17,7 +17,7 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response: ...@@ -17,7 +17,7 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response:
return response return response
if __name__ == "__main__": def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000) parser.add_argument("--port", type=int, default=8000)
...@@ -25,15 +25,20 @@ if __name__ == "__main__": ...@@ -25,15 +25,20 @@ if __name__ == "__main__":
type=str, type=str,
default="jason9693/Qwen2.5-1.5B-apeach") default="jason9693/Qwen2.5-1.5B-apeach")
args = parser.parse_args() return parser.parse_args()
def main(args):
api_url = f"http://{args.host}:{args.port}/pooling" api_url = f"http://{args.host}:{args.port}/pooling"
model_name = args.model model_name = args.model
# Input like Completions API # Input like Completions API
prompt = {"model": model_name, "input": "vLLM is great!"} prompt = {"model": model_name, "input": "vLLM is great!"}
pooling_response = post_http_request(prompt=prompt, api_url=api_url) pooling_response = post_http_request(prompt=prompt, api_url=api_url)
print("-" * 50)
print("Pooling Response:") print("Pooling Response:")
pprint.pprint(pooling_response.json()) pprint.pprint(pooling_response.json())
print("-" * 50)
# Input like Chat API # Input like Chat API
prompt = { prompt = {
...@@ -50,3 +55,9 @@ if __name__ == "__main__": ...@@ -50,3 +55,9 @@ if __name__ == "__main__":
pooling_response = post_http_request(prompt=prompt, api_url=api_url) pooling_response = post_http_request(prompt=prompt, api_url=api_url)
print("Pooling Response:") print("Pooling Response:")
pprint.pprint(pooling_response.json()) pprint.pprint(pooling_response.json())
print("-" * 50)
if __name__ == "__main__":
args = parse_args()
main(args)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment