Commit 34c31a8d authored by laibao

Update README.md to include detailed information about GLM-4V-9B, its capabilities, model structure, algorithms, environment setup, inference instructions, and application scenarios.
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from argparse import Namespace
from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
def parse_args():
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(
model="jason9693/Qwen2.5-1.5B-apeach", task="classify", enforce_eager=True
)
return parser.parse_args()
def main(args: Namespace):
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create an LLM.
# You should pass task="classify" for classification models
model = LLM(**vars(args))
# Generate logits. The output is a list of ClassificationRequestOutputs.
outputs = model.classify(prompts)
# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)
for prompt, output in zip(prompts, outputs):
probs = output.outputs.probs
probs_trimmed = (str(probs[:16])[:-1] + ", ...]") if len(probs) > 16 else probs
print(
f"Prompt: {prompt!r} \n"
f"Class Probabilities: {probs_trimmed} (size={len(probs)})"
)
print("-" * 60)
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from argparse import Namespace
from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
def parse_args():
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(
model="intfloat/e5-mistral-7b-instruct",
task="embed",
enforce_eager=True,
max_model_len=1024,
)
return parser.parse_args()
def main(args: Namespace):
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create an LLM.
# You should pass task="embed" for embedding models
model = LLM(**vars(args))
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = model.embed(prompts)
# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)
for prompt, output in zip(prompts, outputs):
embeds = output.outputs.embedding
embeds_trimmed = (
(str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
)
print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})")
print("-" * 60)
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
def create_parser():
parser = FlexibleArgumentParser()
# Add engine args
EngineArgs.add_cli_args(parser)
parser.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
# Add sampling params
sampling_group = parser.add_argument_group("Sampling parameters")
sampling_group.add_argument("--max-tokens", type=int)
sampling_group.add_argument("--temperature", type=float)
sampling_group.add_argument("--top-p", type=float)
sampling_group.add_argument("--top-k", type=int)
return parser
def main(args: dict):
# Pop arguments not used by LLM
max_tokens = args.pop("max_tokens")
temperature = args.pop("temperature")
top_p = args.pop("top_p")
top_k = args.pop("top_k")
# Create an LLM
llm = LLM(**args)
# Create a sampling params object
sampling_params = llm.get_default_sampling_params()
if max_tokens is not None:
sampling_params.max_tokens = max_tokens
if temperature is not None:
sampling_params.temperature = temperature
if top_p is not None:
sampling_params.top_p = top_p
if top_k is not None:
sampling_params.top_k = top_k
# Generate texts from the prompts. The output is a list of RequestOutput
# objects that contain the prompt, generated text, and other information.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
if __name__ == "__main__":
parser = create_parser()
args: dict = vars(parser.parse_args())
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from argparse import Namespace
from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
def parse_args():
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(
model="BAAI/bge-reranker-v2-m3", task="score", enforce_eager=True
)
return parser.parse_args()
def main(args: Namespace):
# Sample prompts.
text_1 = "What is the capital of France?"
texts_2 = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
# Create an LLM.
# You should pass task="score" for cross-encoder models
model = LLM(**vars(args))
# Generate scores. The output is a list of ScoringRequestOutputs.
outputs = model.score(text_1, texts_2)
# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)
for text_2, output in zip(texts_2, outputs):
score = output.outputs.score
print(f"Pair: {[text_1, text_2]!r} \nScore: {score}")
print("-" * 60)
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use Ray Data for data parallel batch inference.
Ray Data is a data processing framework that can handle large datasets
and integrates tightly with vLLM for data-parallel inference.
As of Ray 2.44, Ray Data has a native integration with
vLLM (under ray.data.llm).
Ray Data provides functionality for:
* Reading and writing to cloud storage (S3, GCS, etc.)
* Automatic sharding and load-balancing across a cluster
* Optimized configuration of vLLM using continuous batching
* Compatibility with tensor/pipeline parallel inference.
Learn more about Ray Data's LLM integration:
https://docs.ray.io/en/latest/data/working-with-llms.html
"""
import ray
from packaging.version import Version
from ray.data.llm import build_llm_processor, vLLMEngineProcessorConfig
assert Version(ray.__version__) >= Version("2.44.1"), (
"Ray version must be at least 2.44.1"
)
# Uncomment to reduce clutter in stdout
# ray.init(log_to_driver=False)
# ray.data.DataContext.get_current().enable_progress_bars = False
# Read one text file from S3. Ray Data supports reading multiple files
# from cloud storage (such as JSONL, Parquet, CSV, binary format).
ds = ray.data.read_text("s3://anonymous@air-example-data/prompts.txt")
print(ds.schema())
size = ds.count()
print(f"Size of dataset: {size} prompts")
# Configure vLLM engine.
config = vLLMEngineProcessorConfig(
model_source="unsloth/Llama-3.1-8B-Instruct",
engine_kwargs={
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4096,
"max_model_len": 16384,
},
concurrency=1, # set the number of parallel vLLM replicas
batch_size=64,
)
# Create a Processor object, which will be used to
# do batch inference on the dataset
vllm_processor = build_llm_processor(
config,
preprocess=lambda row: dict(
messages=[
{"role": "system", "content": "You are a bot that responds with haikus."},
{"role": "user", "content": row["text"]},
],
sampling_params=dict(
temperature=0.3,
max_tokens=250,
),
),
postprocess=lambda row: dict(
answer=row["generated_text"],
**row, # This will return all the original columns in the dataset.
),
)
ds = vllm_processor(ds)
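# Note: Ray Data executes lazily; the pipeline above only runs when results are
# consumed, e.g. via take() or write_parquet().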
# Peek first 10 results.
# NOTE: This is for local testing and debugging. For production use cases,
# write the full results out as shown below.
outputs = ds.take(limit=10)
for output in outputs:
prompt = output["prompt"]
generated_text = output["generated_text"]
print(f"Prompt: {prompt!r}")
print(f"Generated text: {generated_text!r}")
# Write inference output data out as Parquet files to S3.
# Multiple files would be written to the output destination,
# and each task would write one or more files separately.
#
# ds.write_parquet("s3://<your-output-bucket>")
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
import json
import random
import string
from vllm import LLM
from vllm.sampling_params import SamplingParams
# This script is an offline demo for function calling
#
# If you want to run a server/client setup, please follow this code:
#
# - Server:
#
# ```bash
# vllm serve mistralai/Mistral-7B-Instruct-v0.3 --tokenizer-mode mistral --load-format mistral --config-format mistral
# ```
#
# - Client:
#
# ```bash
# curl --location 'http://<your-node-url>:8000/v1/chat/completions' \
# --header 'Content-Type: application/json' \
# --header 'Authorization: Bearer token' \
# --data '{
# "model": "mistralai/Mistral-7B-Instruct-v0.3"
# "messages": [
# {
# "role": "user",
# "content": [
# {"type" : "text", "text": "Describe this image in detail please."},
# {"type": "image_url", "image_url": {"url": "https://s3.amazonaws.com/cms.ipressroom.com/338/files/201808/5b894ee1a138352221103195_A680%7Ejogging-edit/A680%7Ejogging-edit_hero.jpg"}},
# {"type" : "text", "text": "and this one as well. Answer in French."},
# {"type": "image_url", "image_url": {"url": "https://www.wolframcloud.com/obj/resourcesystem/images/a0e/a0ee3983-46c6-4c92-b85d-059044639928/6af8cfb971db031b.png"}}
# ]
# }
# ]
# }'
# ```
#
# Usage:
# python demo.py simple
# python demo.py advanced
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
# or switch to "mistralai/Mistral-Nemo-Instruct-2407"
# or "mistralai/Mistral-Large-Instruct-2407"
# or any other mistral model with function calling ability
sampling_params = SamplingParams(max_tokens=8192, temperature=0.0)
llm = LLM(
model=model_name,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral",
)
def generate_random_id(length=9):
characters = string.ascii_letters + string.digits
random_id = "".join(random.choice(characters) for _ in range(length))
return random_id
# simulate an API that can be called
def get_current_weather(city: str, state: str, unit: str):
return (
f"The weather in {city}, {state} is 85 degrees {unit}. It is "
"partly cloudly, with highs in the 90's."
)
tool_functions = {"get_current_weather": get_current_weather}
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["city", "state", "unit"],
},
},
}
]
messages = [
{
"role": "user",
"content": "Can you tell me what the temperate will be in Dallas, in fahrenheit?",
}
]
outputs = llm.chat(messages, sampling_params=sampling_params, tools=tools)
output = outputs[0].outputs[0].text.strip()
# append the assistant message
messages.append(
{
"role": "assistant",
"content": output,
}
)
# Parse the model's output and execute the corresponding tool call,
# simulating an API call with the function defined above.
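# The assistant's reply is expected to be a JSON list of tool calls, each with
# a "name" and an "arguments" object, roughly of the form (illustrative):
# [{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]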
tool_calls = json.loads(output)
tool_answers = [
tool_functions[call["name"]](**call["arguments"]) for call in tool_calls
]
# append the answer as a tool message and let the LLM give you an answer
messages.append(
{
"role": "tool",
"content": "\n\n".join(tool_answers),
"tool_call_id": generate_random_id(),
}
)
outputs = llm.chat(messages, sampling_params, tools=tools)
print(outputs[0].outputs[0].text.strip())
# yields
# 'The weather in Dallas, TX is 85 degrees fahrenheit. '
# 'It is partly cloudy, with highs in the 90's.'
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This script demonstrates how to extend the context length
of a Qwen model using the YARN method (rope_scaling)
and run a simple chat example.
Usage:
python examples/offline_inference/context_extension.py
"""
from vllm import LLM, SamplingParams
def create_llm():
rope_theta = 1000000
original_max_position_embeddings = 32768
factor = 4.0
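    # With factor=4.0, the usable context grows from 32768 to 32768 * 4 = 131072 tokens.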
# Use yarn to extend context
hf_overrides = {
"rope_theta": rope_theta,
"rope_scaling": {
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings": original_max_position_embeddings,
},
"max_model_len": int(original_max_position_embeddings * factor),
}
llm = LLM(model="Qwen/Qwen3-0.6B", hf_overrides=hf_overrides)
return llm
def run_llm_chat(llm):
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
max_tokens=128,
)
conversation = [
{"role": "system", "content": "You are a helpful assistant"},
{"role": "user", "content": "Hello"},
{"role": "assistant", "content": "Hello! How can I assist you today?"},
]
outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
return outputs
def print_outputs(outputs):
print("\nGenerated Outputs:\n" + "-" * 80)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\n")
print(f"Generated text: {generated_text!r}")
print("-" * 80)
def main():
llm = create_llm()
outputs = run_llm_chat(llm)
print_outputs(outputs)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Usage:
Single node:
python examples/offline_inference/data_parallel.py \
--model="ibm-research/PowerMoE-3b" \
--dp-size=2 \
--tp-size=2
Multi-node:
Node 0 (assume the node has ip of 10.99.48.128):
python examples/offline_inference/data_parallel.py \
--model="ibm-research/PowerMoE-3b" \
--dp-size=2 \
--tp-size=2 \
--node-size=2 \
--node-rank=0 \
--master-addr=10.99.48.128 \
--master-port=13345
Node 1:
python examples/offline_inference/data_parallel.py \
--model="ibm-research/PowerMoE-3b" \
--dp-size=2 \
--tp-size=2 \
--node-size=2 \
--node-rank=1 \
--master-addr=10.99.48.128 \
--master-port=13345
"""
import os
from time import sleep
from vllm import LLM, SamplingParams
from vllm.utils import get_open_port
def parse_args():
import argparse
parser = argparse.ArgumentParser(description="Data Parallel Inference")
parser.add_argument(
"--model",
type=str,
default="ibm-research/PowerMoE-3b",
help="Model name or path",
)
parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size")
parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size")
parser.add_argument(
"--node-size", type=int, default=1, help="Total number of nodes"
)
parser.add_argument(
"--node-rank", type=int, default=0, help="Rank of the current node"
)
parser.add_argument(
"--master-addr", type=str, default="", help="Master node IP address"
)
parser.add_argument("--master-port", type=int, default=0, help="Master node port")
parser.add_argument(
"--enforce-eager", action="store_true", help="Enforce eager mode execution."
)
parser.add_argument(
"--trust-remote-code", action="store_true", help="Trust remote code."
)
parser.add_argument(
"--max-num-seqs",
type=int,
default=64,
help=("Maximum number of sequences to be processed in a single iteration."),
)
parser.add_argument(
"--gpu-memory-utilization",
type=float,
default=0.8,
help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
)
return parser.parse_args()
def main(
model,
dp_size,
local_dp_rank,
global_dp_rank,
dp_master_ip,
dp_master_port,
GPUs_per_dp_rank,
enforce_eager,
trust_remote_code,
max_num_seqs,
gpu_memory_utilization,
):
os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
os.environ["VLLM_DP_SIZE"] = str(dp_size)
os.environ["VLLM_DP_MASTER_IP"] = dp_master_ip
os.environ["VLLM_DP_MASTER_PORT"] = str(dp_master_port)
# CUDA_VISIBLE_DEVICES for each DP rank is set automatically inside the
# engine processes.
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
] * 100
    # With DP, each rank should process a different set of prompts.
    # Usually all DP ranks together process the full dataset,
    # with each rank handling a different shard of it.
floor = len(prompts) // dp_size
remainder = len(prompts) % dp_size
# Distribute prompts into even groups.
def start(rank):
return rank * floor + min(rank, remainder)
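    # Example: with 400 prompts and dp_size=2, rank 0 gets prompts[0:200]
    # and rank 1 gets prompts[200:400].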
prompts = prompts[start(global_dp_rank) : start(global_dp_rank + 1)]
if len(prompts) == 0:
# if any rank has no prompts to process,
# we need to set a placeholder prompt
prompts = ["Placeholder"]
print(f"DP rank {global_dp_rank} needs to process {len(prompts)} prompts")
    # Create a sampling params object.
    # Since we are doing data parallel, every rank can have different
    # sampling params. Here we set a different max_tokens for different
    # ranks for demonstration.
sampling_params = SamplingParams(
temperature=0.8, top_p=0.95, max_tokens=[16, 20][global_dp_rank % 2]
)
# Create an LLM.
llm = LLM(
model=model,
tensor_parallel_size=GPUs_per_dp_rank,
enforce_eager=enforce_eager,
enable_expert_parallel=True,
trust_remote_code=trust_remote_code,
max_num_seqs=max_num_seqs,
gpu_memory_utilization=gpu_memory_utilization,
)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for i, output in enumerate(outputs):
if i >= 5:
# print only 5 outputs
break
prompt = output.prompt
generated_text = output.outputs[0].text
print(
f"DP rank {global_dp_rank}, Prompt: {prompt!r}, "
f"Generated text: {generated_text!r}"
)
# Give engines time to pause their processing loops before exiting.
sleep(1)
if __name__ == "__main__":
args = parse_args()
dp_size = args.dp_size
tp_size = args.tp_size
node_size = args.node_size
node_rank = args.node_rank
if node_size == 1:
dp_master_ip = "127.0.0.1"
dp_master_port = get_open_port()
else:
dp_master_ip = args.master_addr
dp_master_port = args.master_port
assert dp_size % node_size == 0, "dp_size should be divisible by node_size"
dp_per_node = dp_size // node_size
from multiprocessing import Process
procs = []
for local_dp_rank, global_dp_rank in enumerate(
range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)
):
proc = Process(
target=main,
args=(
args.model,
dp_size,
local_dp_rank,
global_dp_rank,
dp_master_ip,
dp_master_port,
tp_size,
args.enforce_eager,
args.trust_remote_code,
args.max_num_seqs,
args.gpu_memory_utilization,
),
)
proc.start()
procs.append(proc)
exit_code = 0
for proc in procs:
proc.join(timeout=300)
if proc.exitcode is None:
print(f"Killing process {proc.pid} that didn't stop within 5 minutes.")
proc.kill()
exit_code = 1
elif proc.exitcode:
exit_code = proc.exitcode
exit(exit_code)
# Disaggregated Prefill V1
This example contains scripts that demonstrate disaggregated prefill in the offline setting of vLLM.
## Files
- `run.sh` - A helper script that runs `prefill_example.py` and `decode_example.py` sequentially (see the example invocation below).
- Make sure you are in the `examples/offline_inference/disaggregated-prefill-v1` directory before running `run.sh`.
- `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`.
- `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`.
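A minimal way to run the demo end to end (a sketch, assuming a single CUDA-capable GPU and that you start from the vLLM repository root):

```bash
cd examples/offline_inference/disaggregated-prefill-v1
bash run.sh
```

`run.sh` removes any previous `local_storage/` directory and `output.txt`, then runs `prefill_example.py` followed by `decode_example.py`, each with `VLLM_ENABLE_V1_MULTIPROCESSING=0` and `CUDA_VISIBLE_DEVICES=0`.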
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
def read_prompts():
"""Read prompts from output.txt"""
prompts = []
try:
with open("output.txt") as f:
for line in f:
prompts.append(line.strip())
print(f"Loaded {len(prompts)} prompts from output.txt")
return prompts
except FileNotFoundError:
print("Error: output.txt file not found")
exit(-1)
def main():
prompts = read_prompts()
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
llm = LLM(
model="meta-llama/Llama-3.2-1B-Instruct",
enforce_eager=True,
gpu_memory_utilization=0.8,
max_num_batched_tokens=64,
max_num_seqs=16,
kv_transfer_config=KVTransferConfig(
kv_connector="SharedStorageConnector",
kv_role="kv_both",
kv_connector_extra_config={"shared_storage_path": "local_storage"},
),
) # , max_model_len=2048, max_num_batched_tokens=2048)
    # Decode: generate from the prompts saved by the prefill run, reusing the
    # KV cache that the SharedStorageConnector loads from local_storage/.
outputs = llm.generate(prompts, sampling_params)
print("-" * 30)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 30)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
def read_prompts():
context = "Hi " * 1000
context2 = "Hey " * 500
return [
context + "Hello, my name is",
context + "The capital of France is",
context2 + "Your name is",
context2 + "The capital of China is",
]
def main():
prompts = read_prompts()
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
llm = LLM(
model="meta-llama/Llama-3.2-1B-Instruct",
enforce_eager=True,
gpu_memory_utilization=0.8,
kv_transfer_config=KVTransferConfig(
kv_connector="SharedStorageConnector",
kv_role="kv_both",
kv_connector_extra_config={"shared_storage_path": "local_storage"},
),
) # , max_model_len=2048, max_num_batched_tokens=2048)
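    # With kv_role="kv_both", the SharedStorageConnector persists the KV blocks
    # computed during this prefill run under local_storage/, so that
    # decode_example.py can load them instead of recomputing the prefill.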
# 1ST generation (prefill instance)
outputs = llm.generate(
prompts,
sampling_params,
)
new_prompts = []
print("-" * 30)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
new_prompts.append(prompt + generated_text)
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 30)
# Write new_prompts to output.txt
with open("output.txt", "w") as f:
for prompt in new_prompts:
f.write(prompt + "\n")
print(f"Saved {len(new_prompts)} prompts to output.txt")
if __name__ == "__main__":
main()
rm -rf local_storage/
if [ -f "output.txt" ]; then
rm output.txt
fi
# The directory of current script
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/prefill_example.py"
VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/decode_example.py"
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates the example usage of disaggregated prefilling
We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
and then transfer the KV cache between them.
"""
import os
import time
from multiprocessing import Event, Process
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
def run_prefill(prefill_done):
# We use GPU 0 for prefill node.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# The prefill node receives two requests, while the decode node receives
# three requests. So the decode node will only receive the KV Cache for
# requests 1 and 3. The decode node will use the KV Cache of requests 1
# and 3 and do prefilling on request 2.
    prompts = [
        "Hello, my name is",
        # The decode node will actually "prefill" this request:
        # "Hi, your name is",
        "Tell me a very long story",
    ]
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
# Using PyNcclConnector to transmit KV caches between vLLM instances.
# This instance is the prefill node (kv_producer, rank 0).
# The number of parallel instances for KV cache transfer is set to 2,
# as required for PyNcclConnector.
ktc = KVTransferConfig(
kv_connector="PyNcclConnector",
kv_role="kv_producer",
kv_rank=0,
kv_parallel_size=2,
)
    # Set GPU memory utilization to 0.8 for an A6000 GPU with 48GB
    # memory. You may need to adjust the value to fit your GPU.
llm = LLM(
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
kv_transfer_config=ktc,
max_model_len=2000,
gpu_memory_utilization=0.8,
)
llm.generate(prompts, sampling_params)
print("Prefill node is finished.")
prefill_done.set()
    # Keep the prefill node running until the decode node has finished;
    # otherwise, the script might exit prematurely and interrupt decoding.
try:
while True:
time.sleep(1)
except KeyboardInterrupt:
print("Script stopped by user.")
def run_decode(prefill_done):
# We use GPU 1 for decode node.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
prompts = [
"Hello, my name is",
"Hi, your name is",
"Tell me a very long story",
]
sampling_params = SamplingParams(temperature=0, top_p=0.95)
# Using PyNcclConnector to transmit KV caches between vLLM instances.
# This instance is the decode node (kv_consumer, rank 1).
# The number of parallel instances for KV cache transfer is set to 2,
# as required for PyNcclConnector.
ktc = KVTransferConfig(
kv_connector="PyNcclConnector",
kv_role="kv_consumer",
kv_rank=1,
kv_parallel_size=2,
)
    # Set GPU memory utilization to 0.8 for an A6000 GPU with 48GB
    # memory. You may need to adjust the value to fit your GPU.
llm = LLM(
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
kv_transfer_config=ktc,
max_model_len=2000,
gpu_memory_utilization=0.8,
)
# Wait for the producer to start the pipe
print("Waiting for prefill node to finish...")
prefill_done.wait()
# At this point when the prefill_done is set, the kv-cache should have been
# transferred to this decode node, so we can start decoding.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
def main():
prefill_done = Event()
prefill_process = Process(target=run_prefill, args=(prefill_done,))
decode_process = Process(target=run_decode, args=(prefill_done,))
# Start prefill node
prefill_process.start()
# Start decode node
decode_process.start()
# Terminate the prefill node when decode is finished
decode_process.join()
prefill_process.terminate()
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from argparse import Namespace
from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
def parse_args():
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(
model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True
)
return parser.parse_args()
def main(args: Namespace):
# Sample prompts.
prompts = [
"Follow the white rabbit.", # English
"Sigue al conejo blanco.", # Spanish
"Suis le lapin blanc.", # French
"跟着白兔走。", # Chinese
"اتبع الأرنب الأبيض.", # Arabic
"Folge dem weißen Kaninchen.", # German
]
# Create an LLM.
# You should pass task="embed" for embedding models
model = LLM(**vars(args))
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
# Only text matching task is supported for now. See #16120
outputs = model.embed(prompts)
# Print the outputs.
print("\nGenerated Outputs:")
print("Only text matching task is supported for now. See #16120")
print("-" * 60)
for prompt, output in zip(prompts, outputs):
embeds = output.outputs.embedding
embeds_trimmed = (
(str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
)
print(
f"Prompt: {prompt!r} \n"
f"Embeddings for text matching: {embeds_trimmed} "
f"(size={len(embeds)})"
)
print("-" * 60)
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from argparse import Namespace
from vllm import LLM, EngineArgs, PoolingParams
from vllm.utils import FlexibleArgumentParser
def parse_args():
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(
model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True
)
return parser.parse_args()
def main(args: Namespace):
# Sample prompts.
prompts = [
"Follow the white rabbit.", # English
"Sigue al conejo blanco.", # Spanish
"Suis le lapin blanc.", # French
"跟着白兔走。", # Chinese
"اتبع الأرنب الأبيض.", # Arabic
"Folge dem weißen Kaninchen.", # German
]
# Create an LLM.
# You should pass task="embed" for embedding models
model = LLM(**vars(args))
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
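    # PoolingParams(dimensions=32) requests 32-dimensional (Matryoshka-style)
    # embeddings; jina-embeddings-v3 supports variable output dimensions.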
outputs = model.embed(prompts, pooling_params=PoolingParams(dimensions=32))
# Print the outputs.
print("\nGenerated Outputs:")
print("-" * 60)
for prompt, output in zip(prompts, outputs):
embeds = output.outputs.embedding
embeds_trimmed = (
(str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
)
print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})")
print("-" * 60)
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrate prompting of text-to-text
encoder/decoder models, specifically BART
"""
from vllm import LLM, SamplingParams
from vllm.inputs import (
ExplicitEncoderDecoderPrompt,
TextPrompt,
TokensPrompt,
zip_enc_dec_prompts,
)
def create_prompts(tokenizer):
# Test prompts
#
# This section shows all of the valid ways to prompt an
# encoder/decoder model.
#
# - Helpers for building prompts
text_prompt_raw = "Hello, my name is"
text_prompt = TextPrompt(prompt="The president of the United States is")
tokens_prompt = TokensPrompt(
prompt_token_ids=tokenizer.encode(prompt="The capital of France is")
)
# - Pass a single prompt to encoder/decoder model
# (implicitly encoder input prompt);
# decoder input prompt is assumed to be None
single_text_prompt_raw = text_prompt_raw # Pass a string directly
single_text_prompt = text_prompt # Pass a TextPrompt
single_tokens_prompt = tokens_prompt # Pass a TokensPrompt
# ruff: noqa: E501
# - Pass explicit encoder and decoder input prompts within one data structure.
# Encoder and decoder prompts can both independently be text or tokens, with
# no requirement that they be the same prompt type. Some example prompt-type
# combinations are shown below, note that these are not exhaustive.
enc_dec_prompt1 = ExplicitEncoderDecoderPrompt(
# Pass encoder prompt string directly, &
# pass decoder prompt tokens
encoder_prompt=single_text_prompt_raw,
decoder_prompt=single_tokens_prompt,
)
enc_dec_prompt2 = ExplicitEncoderDecoderPrompt(
# Pass TextPrompt to encoder, and
# pass decoder prompt string directly
encoder_prompt=single_text_prompt,
decoder_prompt=single_text_prompt_raw,
)
enc_dec_prompt3 = ExplicitEncoderDecoderPrompt(
# Pass encoder prompt tokens directly, and
# pass TextPrompt to decoder
encoder_prompt=single_tokens_prompt,
decoder_prompt=single_text_prompt,
)
# - Finally, here's a useful helper function for zipping encoder and
# decoder prompts together into a list of ExplicitEncoderDecoderPrompt
# instances
zipped_prompt_list = zip_enc_dec_prompts(
["An encoder prompt", "Another encoder prompt"],
["A decoder prompt", "Another decoder prompt"],
)
# - Let's put all of the above example prompts together into one list
# which we will pass to the encoder/decoder LLM.
return [
single_text_prompt_raw,
single_text_prompt,
single_tokens_prompt,
enc_dec_prompt1,
enc_dec_prompt2,
enc_dec_prompt3,
] + zipped_prompt_list
# Create a sampling params object.
def create_sampling_params():
return SamplingParams(
temperature=0,
top_p=1.0,
min_tokens=0,
max_tokens=20,
)
# Print the outputs.
def print_outputs(outputs):
print("-" * 50)
for i, output in enumerate(outputs):
prompt = output.prompt
encoder_prompt = output.encoder_prompt
generated_text = output.outputs[0].text
print(f"Output {i + 1}:")
print(
f"Encoder prompt: {encoder_prompt!r}\n"
f"Decoder prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}"
)
print("-" * 50)
def main():
dtype = "float"
# Create a BART encoder/decoder model instance
llm = LLM(
model="facebook/bart-large-cnn",
dtype=dtype,
)
# Get BART tokenizer
tokenizer = llm.llm_engine.get_tokenizer_group()
prompts = create_prompts(tokenizer)
sampling_params = create_sampling_params()
# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs = llm.generate(prompts, sampling_params)
print_outputs(outputs)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
the explicit/implicit prompt format on enc-dec LMMs for text generation.
"""
import time
from collections.abc import Sequence
from dataclasses import asdict
from typing import NamedTuple
from vllm import LLM, EngineArgs, PromptType, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.utils import FlexibleArgumentParser
class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompts: Sequence[PromptType]
def run_florence2():
engine_args = EngineArgs(
model="microsoft/Florence-2-large",
tokenizer="Isotr0py/Florence-2-tokenizer",
max_num_seqs=8,
trust_remote_code=True,
limit_mm_per_prompt={"image": 1},
dtype="half",
)
prompts = [
{ # implicit prompt with task token
"prompt": "<DETAILED_CAPTION>",
"multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
},
{ # explicit encoder/decoder prompt
"encoder_prompt": {
"prompt": "Describe in detail what is shown in the image.",
"multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image},
},
"decoder_prompt": "",
},
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
def run_mllama():
engine_args = EngineArgs(
model="meta-llama/Llama-3.2-11B-Vision-Instruct",
max_model_len=8192,
max_num_seqs=2,
limit_mm_per_prompt={"image": 1},
dtype="half",
)
prompts = [
{ # Implicit prompt
"prompt": "<|image|><|begin_of_text|>What is the content of this image?", # noqa: E501
"multi_modal_data": {
"image": ImageAsset("stop_sign").pil_image,
},
},
{ # Explicit prompt
"encoder_prompt": {
"prompt": "<|image|>",
"multi_modal_data": {
"image": ImageAsset("stop_sign").pil_image,
},
},
"decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501
},
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
def run_whisper():
engine_args = EngineArgs(
model="openai/whisper-large-v3-turbo",
max_model_len=448,
max_num_seqs=16,
limit_mm_per_prompt={"audio": 1},
dtype="half",
)
prompts = [
{ # Test implicit prompt
"prompt": "<|startoftranscript|>",
"multi_modal_data": {
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
},
},
{ # Test explicit encoder/decoder prompt
"encoder_prompt": {
"prompt": "",
"multi_modal_data": {
"audio": AudioAsset("winning_call").audio_and_sample_rate,
},
},
"decoder_prompt": "<|startoftranscript|>",
},
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
model_example_map = {
"florence2": run_florence2,
"mllama": run_mllama,
"whisper": run_whisper,
}
def parse_args():
parser = FlexibleArgumentParser(
description="Demo on using vLLM for offline inference with "
"vision language models for text generation"
)
parser.add_argument(
"--model-type",
"-m",
type=str,
default="mllama",
choices=model_example_map.keys(),
help='Huggingface "model_type".',
)
parser.add_argument(
"--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args()
def main(args):
model = args.model_type
if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.")
req_data = model_example_map[model]()
# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {}
)
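    # e.g. for the whisper example this becomes {"image": 0, "video": 0, "audio": 1}.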
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)
prompts = req_data.prompts
# Create a sampling params object.
sampling_params = SamplingParams(
temperature=0,
top_p=1.0,
max_tokens=64,
skip_special_tokens=False,
)
start = time.time()
# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Decoder prompt: {prompt!r}, Generated text: {generated_text!r}")
duration = time.time() - start
print("Duration:", duration)
print("RPS:", len(prompts) / duration)
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates using the `LLMEngine`
for processing prompts with various sampling parameters.
"""
import argparse
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.utils import FlexibleArgumentParser
def create_test_prompts() -> list[tuple[str, SamplingParams]]:
"""Create a list of test prompts with their sampling parameters."""
return [
(
"A robot may not injure a human being",
SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1),
),
(
"To be or not to be,",
SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2),
),
(
"What is the meaning of life?",
SamplingParams(n=2, temperature=0.8, top_p=0.95, frequency_penalty=0.1),
),
]
def process_requests(engine: LLMEngine, test_prompts: list[tuple[str, SamplingParams]]):
"""Continuously process a list of prompts and handle the outputs."""
request_id = 0
print("-" * 50)
while test_prompts or engine.has_unfinished_requests():
if test_prompts:
prompt, sampling_params = test_prompts.pop(0)
engine.add_request(str(request_id), prompt, sampling_params)
request_id += 1
request_outputs: list[RequestOutput] = engine.step()
for request_output in request_outputs:
if request_output.finished:
print(request_output)
print("-" * 50)
def initialize_engine(args: argparse.Namespace) -> LLMEngine:
"""Initialize the LLMEngine from the command line arguments."""
engine_args = EngineArgs.from_cli_args(args)
return LLMEngine.from_engine_args(engine_args)
def parse_args():
parser = FlexibleArgumentParser(
description="Demo on using the LLMEngine class directly"
)
parser = EngineArgs.add_cli_args(parser)
return parser.parse_args()
def main(args: argparse.Namespace):
"""Main function that sets up and runs the prompt processing."""
engine = initialize_engine(args)
test_prompts = create_test_prompts()
process_requests(engine, test_prompts)
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Validates the loading of a model saved with the sharded_state format.
This script demonstrates how to load a model that was previously saved
using save_sharded_state.py and validates it by running inference.
Example usage:
(First, you need to save a sharded_state model)
python save_sharded_state.py \
--model /path/to/load \
--quantization deepspeedfp \
--tensor-parallel-size 8 \
    --output /path/to/save/sharded/model
python load_sharded_state.py \
--model /path/to/saved/sharded/model \
--load-format sharded_state \
--quantization deepspeedfp \
--tensor-parallel-size 8 \
--prompt "Hello, my name is" \
--max-tokens 50
"""
import dataclasses
from vllm import LLM, EngineArgs, SamplingParams
from vllm.utils import FlexibleArgumentParser
def parse_args():
parser = FlexibleArgumentParser()
# Add engine arguments
EngineArgs.add_cli_args(parser)
# Override default load_format for clarity
parser.set_defaults(load_format="sharded_state")
# Add validation arguments
parser.add_argument(
"--prompt", type=str, default="Hello, world!", help="Prompt for validation"
)
parser.add_argument(
"--max-tokens",
type=int,
default=100,
help="Maximum number of tokens to generate",
)
parser.add_argument(
"--temperature", type=float, default=0.7, help="Sampling temperature"
)
parser.add_argument(
"--top-p", type=float, default=1.0, help="Top-p sampling parameter"
)
return parser.parse_args()
def main():
args = parse_args()
engine_args = EngineArgs.from_cli_args(args)
print(
f"Loading model from {engine_args.model} using format {engine_args.load_format}"
)
print(f"Tensor parallel size: {engine_args.tensor_parallel_size}")
# Load the model using engine args
llm = LLM(**dataclasses.asdict(engine_args))
# Prepare sampling parameters
sampling_params = SamplingParams(
temperature=args.temperature,
top_p=args.top_p,
max_tokens=args.max_tokens,
)
print("\nRunning inference:")
print(f"Prompt: {args.prompt}")
# Generate completion
outputs = llm.generate(args.prompt, sampling_params)
# Display generated text
print("\nGenerated outputs:")
for output in outputs:
generated_text = output.outputs[0].text
print("-" * 50)
print(f"Full output: {args.prompt}{generated_text}")
print("-" * 50)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use LoRA with different quantization techniques
for offline inference.
Requires HuggingFace credentials for access.
"""
import gc
from typing import Optional
import torch
from huggingface_hub import snapshot_download
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.lora.request import LoRARequest
def create_test_prompts(
lora_path: str,
) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
return [
# this is an example of using quantization without LoRA
(
"My name is",
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
),
None,
),
# the next three examples use quantization with LoRA
(
"my name is",
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
),
LoRARequest("lora-test-1", 1, lora_path),
),
(
"The capital of USA is",
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
),
LoRARequest("lora-test-2", 1, lora_path),
),
(
"The capital of France is",
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
),
LoRARequest("lora-test-3", 1, lora_path),
),
]
def process_requests(
engine: LLMEngine,
test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
):
"""Continuously process a list of prompts and handle the outputs."""
request_id = 0
while test_prompts or engine.has_unfinished_requests():
if test_prompts:
prompt, sampling_params, lora_request = test_prompts.pop(0)
engine.add_request(
str(request_id), prompt, sampling_params, lora_request=lora_request
)
request_id += 1
request_outputs: list[RequestOutput] = engine.step()
for request_output in request_outputs:
if request_output.finished:
print("----------------------------------------------------")
print(f"Prompt: {request_output.prompt}")
print(f"Output: {request_output.outputs[0].text}")
def initialize_engine(
model: str, quantization: str, lora_repo: Optional[str]
) -> LLMEngine:
"""Initialize the LLMEngine."""
engine_args = EngineArgs(
model=model,
quantization=quantization,
enable_lora=True,
max_lora_rank=64,
max_loras=4,
)
return LLMEngine.from_engine_args(engine_args)
def main():
"""Main function that sets up and runs the prompt processing."""
test_configs = [
# QLoRA (https://arxiv.org/abs/2305.14314)
{
"name": "qlora_inference_example",
"model": "huggyllama/llama-7b",
"quantization": "bitsandbytes",
"lora_repo": "timdettmers/qlora-flan-7b",
},
{
"name": "AWQ_inference_with_lora_example",
"model": "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
"quantization": "awq",
"lora_repo": "jashing/tinyllama-colorist-lora",
},
{
"name": "GPTQ_inference_with_lora_example",
"model": "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
"quantization": "gptq",
"lora_repo": "jashing/tinyllama-colorist-lora",
},
]
for test_config in test_configs:
print(f"~~~~~~~~~~~~~~~~ Running: {test_config['name']} ~~~~~~~~~~~~~~~~")
engine = initialize_engine(
test_config["model"], test_config["quantization"], test_config["lora_repo"]
)
lora_path = snapshot_download(repo_id=test_config["lora_repo"])
test_prompts = create_test_prompts(lora_path)
process_requests(engine, test_prompts)
# Clean up the GPU memory for the next test
del engine
gc.collect()
torch.cuda.empty_cache()
if __name__ == "__main__":
main()