Commit 006693ed authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.11.2' into v0.11.2-ori

parents 4b51e6f1 275de341
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example Python client for embedding API using vLLM API server
NOTE:
start a supported embeddings model server with `vllm serve`, e.g.
vllm serve intfloat/e5-small
"""
import argparse
import base64
import requests
import torch
from vllm.utils.serial_utils import (
EMBED_DTYPE_TO_TORCH_DTYPE,
ENDIANNESS,
binary2tensor,
)
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
    """POST ``prompt`` as a JSON body to ``api_url`` and return the response.

    The request carries a fixed ``User-Agent: Test Client`` header. The
    response is returned as-is; its status code is not inspected here.
    """
    return requests.post(
        api_url,
        headers={"User-Agent": "Test Client"},
        json=prompt,
    )
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--model", type=str, default="intfloat/e5-small")
return parser.parse_args()
def main(args):
    """Request base64-encoded embeddings for every dtype/endianness pair.

    For each entry of ``EMBED_DTYPE_TO_TORCH_DTYPE`` combined with each entry
    of ``ENDIANNESS``, one request is sent to ``/v1/embeddings``. Every
    returned embedding is base64-decoded, turned into a tensor with
    ``binary2tensor``, cast to float32, and the concatenated shape is printed.

    Args:
        args: Namespace providing ``host``, ``port`` and ``model``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    api_url = f"http://{args.host}:{args.port}/v1/embeddings"
    model_name = args.model

    # The OpenAI client does not support the embed_dtype and endianness parameters.
    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            prompt = {
                "model": model_name,
                "input": "vLLM is great!",
                "encoding_format": "base64",
                "embed_dtype": embed_dtype,
                "endianness": endianness,
            }
            response = post_http_request(prompt=prompt, api_url=api_url)
            # Fail with the real HTTP error instead of a KeyError on "data"
            # when the server rejects the request.
            response.raise_for_status()

            embedding = [
                binary2tensor(
                    base64.b64decode(data["embedding"]),
                    (-1,),
                    embed_dtype,
                    endianness,
                ).to(torch.float32)
                for data in response.json()["data"]
            ]
            embedding = torch.cat(embedding)
            print(embed_dtype, endianness, embedding.shape)
# Script entry point: parse the CLI options and run the example.
if __name__ == "__main__":
    main(parse_args())
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example Python client for embedding API using vLLM API server
NOTE:
start a supported embeddings model server with `vllm serve`, e.g.
vllm serve intfloat/e5-small
"""
import argparse
import json
import requests
import torch
from vllm.utils.serial_utils import (
EMBED_DTYPE_TO_TORCH_DTYPE,
ENDIANNESS,
MetadataItem,
decode_pooling_output,
)
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
    """Send ``prompt`` as JSON to ``api_url`` via HTTP POST.

    A fixed ``User-Agent: Test Client`` header is attached. The raw
    ``requests.Response`` is handed back without any status checking.
    """
    request_headers = {"User-Agent": "Test Client"}
    return requests.post(api_url, headers=request_headers, json=prompt)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--model", type=str, default="intfloat/e5-small")
return parser.parse_args()
def main(args):
    """Request embeddings in the ``bytes`` encoding for every dtype/endianness pair.

    For each entry of ``EMBED_DTYPE_TO_TORCH_DTYPE`` combined with each entry
    of ``ENDIANNESS``, one request is sent to ``/v1/embeddings``. The JSON in
    the ``metadata`` response header is turned into ``MetadataItem`` objects,
    which together with the raw response body are passed to
    ``decode_pooling_output``. The decoded tensors are cast to float32,
    concatenated, and the resulting shape is printed.

    Args:
        args: Namespace providing ``host``, ``port`` and ``model``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    api_url = f"http://{args.host}:{args.port}/v1/embeddings"
    model_name = args.model

    # The OpenAI client does not support the bytes encoding_format.
    # The OpenAI client does not support the embed_dtype and endianness parameters.
    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            prompt = {
                "model": model_name,
                "input": "vLLM is great!",
                "encoding_format": "bytes",
                "embed_dtype": embed_dtype,
                "endianness": endianness,
            }
            response = post_http_request(prompt=prompt, api_url=api_url)
            # Surface the real HTTP error rather than a KeyError on the
            # "metadata" header when the server rejects the request.
            response.raise_for_status()

            metadata = json.loads(response.headers["metadata"])
            items = [MetadataItem(**x) for x in metadata["data"]]
            decoded = decode_pooling_output(items=items, body=response.content)
            embedding = torch.cat([x.to(torch.float32) for x in decoded])
            print(embed_dtype, endianness, embedding.shape)
# Script entry point: parse the CLI options and run the example.
if __name__ == "__main__":
    main(parse_args())
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example online usage of Pooling API for multi vector retrieval.
Run `vllm serve <model> --runner pooling`
to start up the server in vLLM. e.g.
vllm serve BAAI/bge-m3
"""
import argparse
import requests
import torch
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
    """Issue an HTTP POST to ``api_url`` with ``prompt`` as the JSON payload.

    The call sets a fixed ``User-Agent: Test Client`` header and returns the
    ``requests.Response`` untouched (no status validation is done here).
    """
    return requests.post(
        api_url,
        json=prompt,
        headers={"User-Agent": "Test Client"},
    )
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--model", type=str, default="BAAI/bge-m3")
return parser.parse_args()
def main(args):
    """Send a batch of prompts to the ``/pooling`` endpoint and print shapes.

    Each element of the response's ``"data"`` list has its own ``"data"``
    field converted to a tensor, and that tensor's shape is printed.

    Args:
        args: Namespace providing ``host``, ``port`` and ``model``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    api_url = f"http://{args.host}:{args.port}/pooling"
    model_name = args.model

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    prompt = {"model": model_name, "input": prompts}

    pooling_response = post_http_request(prompt=prompt, api_url=api_url)
    # Report the real HTTP error instead of a KeyError on "data" when the
    # server rejects the request.
    pooling_response.raise_for_status()

    for output in pooling_response.json()["data"]:
        multi_vector = torch.tensor(output["data"])
        print(multi_vector.shape)
# Script entry point: parse the CLI options and run the example.
if __name__ == "__main__":
    main(parse_args())
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501 # ruff: noqa: E501
"""Example Python client for multimodal embedding API using vLLM API server """Example Python client for multimodal embedding API using vLLM API server.
NOTE:
start a supported multimodal embeddings model server with `vllm serve`, e.g. Refer to each `run_*` function for the command to run the server for that model.
vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling --trust_remote_code --max_model_len=1024
""" """
import argparse import argparse
import base64 import base64
import io import io
from typing import Literal
import requests from openai import OpenAI
from openai._types import NOT_GIVEN, NotGiven
from openai.types.chat import ChatCompletionMessageParam
from openai.types.create_embedding_response import CreateEmbeddingResponse
from PIL import Image from PIL import Image
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
def vlm2vec(): def create_chat_embeddings(
response = requests.post( client: OpenAI,
"http://localhost:8000/v1/embeddings", *,
json={ messages: list[ChatCompletionMessageParam],
"model": "TIGER-Lab/VLM2Vec-Full", model: str,
"messages": [ encoding_format: Literal["base64", "float"] | NotGiven = NOT_GIVEN,
{ ) -> CreateEmbeddingResponse:
"role": "user", """
"content": [ Convenience function for accessing vLLM's Chat Embeddings API,
{"type": "image_url", "image_url": {"url": image_url}}, which is an extension of OpenAI's existing Embeddings API.
{"type": "text", "text": "Represent the given image."}, """
], return client.post(
} "/embeddings",
], cast_to=CreateEmbeddingResponse,
"encoding_format": "float", body={"messages": messages, "model": model, "encoding_format": encoding_format},
},
) )
response.raise_for_status()
response_json = response.json()
print("Embedding output:", response_json["data"][0]["embedding"])
def run_clip(client: OpenAI, model: str):
"""
Start the server using:
vllm serve openai/clip-vit-base-patch32 \
--runner pooling
"""
response = create_chat_embeddings(
client,
messages=[
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
],
}
],
model=model,
encoding_format="float",
)
print("Image embedding output:", response.data[0].embedding)
response = create_chat_embeddings(
client,
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "a photo of a cat"},
],
}
],
model=model,
encoding_format="float",
)
print("Text embedding output:", response.data[0].embedding)
def run_dse_qwen2_vl(client: OpenAI, model: str):
"""
Start the server using:
def dse_qwen2_vl(inp: dict): vllm serve MrLight/dse-qwen2-2b-mrl-v1 \
# Embedding an Image --runner pooling \
if inp["type"] == "image": --trust-remote-code \
messages = [ --max-model-len 8192 \
--chat-template examples/template_dse_qwen2_vl.jinja
"""
response = create_chat_embeddings(
client,
messages=[
{ {
"role": "user", "role": "user",
"content": [ "content": [
{ {
"type": "image_url", "type": "image_url",
"image_url": { "image_url": {
"url": inp["image_url"], "url": image_url,
}, },
}, },
{"type": "text", "text": "What is shown in this image?"}, {"type": "text", "text": "What is shown in this image?"},
], ],
} }
] ],
# Embedding a Text Query model=model,
else: encoding_format="float",
# MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image )
# of the minimum input size
buffer = io.BytesIO() print("Image embedding output:", response.data[0].embedding)
image_placeholder = Image.new("RGB", (56, 56))
image_placeholder.save(buffer, "png") # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
buffer.seek(0) # of the minimum input size
image_placeholder = base64.b64encode(buffer.read()).decode("utf-8") buffer = io.BytesIO()
messages = [ image_placeholder = Image.new("RGB", (56, 56))
image_placeholder.save(buffer, "png")
buffer.seek(0)
image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
response = create_chat_embeddings(
client,
messages=[
{ {
"role": "user", "role": "user",
"content": [ "content": [
...@@ -76,23 +134,129 @@ def dse_qwen2_vl(inp: dict): ...@@ -76,23 +134,129 @@ def dse_qwen2_vl(inp: dict):
"url": f"data:image/jpeg;base64,{image_placeholder}", "url": f"data:image/jpeg;base64,{image_placeholder}",
}, },
}, },
{"type": "text", "text": f"Query: {inp['content']}"}, {"type": "text", "text": "Query: What is the weather like today?"},
],
}
],
model=model,
encoding_format="float",
)
print("Text embedding output:", response.data[0].embedding)
def run_siglip(client: OpenAI, model: str):
"""
Start the server using:
vllm serve google/siglip-base-patch16-224 \
--runner pooling
"""
response = create_chat_embeddings(
client,
messages=[
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
], ],
} }
] ],
model=model,
response = requests.post( encoding_format="float",
"http://localhost:8000/v1/embeddings",
json={
"model": "MrLight/dse-qwen2-2b-mrl-v1",
"messages": messages,
"encoding_format": "float",
},
) )
response.raise_for_status()
response_json = response.json()
print("Embedding output:", response_json["data"][0]["embedding"]) print("Image embedding output:", response.data[0].embedding)
response = create_chat_embeddings(
client,
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "a photo of a cat"},
],
}
],
model=model,
encoding_format="float",
)
print("Text embedding output:", response.data[0].embedding)
def run_vlm2vec(client: OpenAI, model: str):
"""
Start the server using:
vllm serve TIGER-Lab/VLM2Vec-Full \
--runner pooling \
--trust-remote-code \
--max-model-len 4096 \
--chat-template examples/template_vlm2vec_phi3v.jinja
"""
response = create_chat_embeddings(
client,
messages=[
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "text", "text": "Represent the given image."},
],
}
],
model=model,
encoding_format="float",
)
print("Image embedding output:", response.data[0].embedding)
response = create_chat_embeddings(
client,
messages=[
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
{
"type": "text",
"text": "Represent the given image with the following question: What is in the image.",
},
],
}
],
model=model,
encoding_format="float",
)
print("Image+Text embedding output:", response.data[0].embedding)
response = create_chat_embeddings(
client,
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "A cat and a dog"},
],
}
],
model=model,
encoding_format="float",
)
print("Text embedding output:", response.data[0].embedding)
model_example_map = {
"clip": run_clip,
"dse_qwen2_vl": run_dse_qwen2_vl,
"siglip": run_siglip,
"vlm2vec": run_vlm2vec,
}
def parse_args(): def parse_args():
...@@ -103,29 +267,24 @@ def parse_args(): ...@@ -103,29 +267,24 @@ def parse_args():
parser.add_argument( parser.add_argument(
"--model", "--model",
type=str, type=str,
choices=["vlm2vec", "dse_qwen2_vl"], choices=model_example_map.keys(),
required=True, required=True,
help="Which model to call.", help="The name of the embedding model.",
) )
return parser.parse_args() return parser.parse_args()
def main(args): def main(args):
if args.model == "vlm2vec": client = OpenAI(
vlm2vec() # defaults to os.environ.get("OPENAI_API_KEY")
elif args.model == "dse_qwen2_vl": api_key=openai_api_key,
dse_qwen2_vl( base_url=openai_api_base,
{ )
"type": "image",
"image_url": image_url, models = client.models.list()
} model_id = models.data[0].id
)
dse_qwen2_vl( model_example_map[args.model](client, model_id)
{
"type": "text",
"content": "What is the weather like today?",
}
)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -11,14 +11,15 @@ import requests ...@@ -11,14 +11,15 @@ import requests
# image as input, process it using the multimodal data processor, and # image as input, process it using the multimodal data processor, and
# perform inference. # perform inference.
# Requirements : # Requirements :
# - install plugin at: # - install TerraTorch v1.1 (or later):
# https://github.com/christian-pinto/prithvi_io_processor_plugin # pip install terratorch>=v1.1
# - start vllm in serving mode with the below args # - start vllm in serving mode with the below args
# --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM' # --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
# --model-impl terratorch # --model-impl terratorch
# --task embed --trust-remote-code # --task embed --trust-remote-code
# --skip-tokenizer-init --enforce-eager # --skip-tokenizer-init --enforce-eager
# --io-processor-plugin prithvi_to_tiff # --io-processor-plugin terratorch_segmentation
# --enable-mm-embeds
def main(): def main():
...@@ -34,7 +35,6 @@ def main(): ...@@ -34,7 +35,6 @@ def main():
}, },
"priority": 0, "priority": 0,
"model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM", "model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
"softmax": False,
} }
ret = requests.post(server_endpoint, json=request_payload_url) ret = requests.post(server_endpoint, json=request_payload_url)
......
...@@ -852,7 +852,7 @@ ...@@ -852,7 +852,7 @@
"uid": "${DS_PROMETHEUS}" "uid": "${DS_PROMETHEUS}"
}, },
"editorMode": "code", "editorMode": "code",
"expr": "vllm:gpu_cache_usage_perc{model_name=\"$model_name\"}", "expr": "vllm:kv_cache_usage_perc{model_name=\"$model_name\"}",
"instant": false, "instant": false,
"legendFormat": "GPU Cache Usage", "legendFormat": "GPU Cache Usage",
"range": true, "range": true,
......
...@@ -36,7 +36,6 @@ llm_config = LLMConfig( ...@@ -36,7 +36,6 @@ llm_config = LLMConfig(
}, },
# Set to the node's accelerator type. # Set to the node's accelerator type.
accelerator_type="H100", accelerator_type="H100",
runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
# Customize engine arguments as required (for example, vLLM engine kwargs). # Customize engine arguments as required (for example, vLLM engine kwargs).
engine_kwargs={ engine_kwargs={
"tensor_parallel_size": 8, "tensor_parallel_size": 8,
......
...@@ -83,6 +83,29 @@ else ...@@ -83,6 +83,29 @@ else
RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379" RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
fi fi
# Parse VLLM_HOST_IP from additional args if present.
# This is needed for multi-NIC configurations where Ray needs explicit IP bindings.
VLLM_HOST_IP=""
for arg in "${ADDITIONAL_ARGS[@]}"; do
if [[ $arg == "-e" ]]; then
continue
fi
if [[ $arg == VLLM_HOST_IP=* ]]; then
VLLM_HOST_IP="${arg#VLLM_HOST_IP=}"
break
fi
done
# Build Ray IP environment variables if VLLM_HOST_IP is set.
# These variables ensure Ray binds to the correct network interface on multi-NIC systems.
RAY_IP_VARS=()
if [ -n "${VLLM_HOST_IP}" ]; then
RAY_IP_VARS=(
-e "RAY_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
-e "RAY_OVERRIDE_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
)
fi
# Launch the container with the assembled parameters. # Launch the container with the assembled parameters.
# --network host: Allows Ray nodes to communicate directly via host networking # --network host: Allows Ray nodes to communicate directly via host networking
# --shm-size 10.24g: Increases shared memory # --shm-size 10.24g: Increases shared memory
...@@ -95,5 +118,6 @@ docker run \ ...@@ -95,5 +118,6 @@ docker run \
--shm-size 10.24g \ --shm-size 10.24g \
--gpus all \ --gpus all \
-v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \ -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
"${RAY_IP_VARS[@]}" \
"${ADDITIONAL_ARGS[@]}" \ "${ADDITIONAL_ARGS[@]}" \
"${DOCKER_IMAGE}" -c "${RAY_START_CMD}" "${DOCKER_IMAGE}" -c "${RAY_START_CMD}"
...@@ -21,4 +21,4 @@ while IFS='=' read -r key value; do ...@@ -21,4 +21,4 @@ while IFS='=' read -r key value; do
done < <(env | grep "^${PREFIX}") done < <(env | grep "^${PREFIX}")
# Pass the collected arguments to the main entrypoint # Pass the collected arguments to the main entrypoint
exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}" exec vllm serve "${ARGS[@]}"
\ No newline at end of file \ No newline at end of file
...@@ -159,8 +159,8 @@ def get_llm_response(messages, model, reason, content_ph=None, reasoning_ph=None ...@@ -159,8 +159,8 @@ def get_llm_response(messages, model, reason, content_ph=None, reasoning_ph=None
for chunk in response: for chunk in response:
delta = chunk.choices[0].delta delta = chunk.choices[0].delta
# Stream reasoning first # Stream reasoning first
if reason and hasattr(delta, "reasoning_content") and live_think: if reason and hasattr(delta, "reasoning") and live_think:
rc = delta.reasoning_content rc = delta.reasoning
if rc: if rc:
think_text += rc think_text += rc
live_think.markdown(think_text + "▌") live_think.markdown(think_text + "▌")
...@@ -262,8 +262,8 @@ def server_supports_reasoning(): ...@@ -262,8 +262,8 @@ def server_supports_reasoning():
messages=[{"role": "user", "content": "Hi"}], messages=[{"role": "user", "content": "Hi"}],
stream=False, stream=False,
) )
return hasattr(resp.choices[0].message, "reasoning_content") and bool( return hasattr(resp.choices[0].message, "reasoning") and bool(
resp.choices[0].message.reasoning_content resp.choices[0].message.reasoning
) )
......
...@@ -21,7 +21,7 @@ If you want to run this script standalone with `uv`, you can use the following: ...@@ -21,7 +21,7 @@ If you want to run this script standalone with `uv`, you can use the following:
```bash ```bash
uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \ uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \
structured-output structured-outputs
``` ```
See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information. See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information.
......
[project] [project]
name = "examples-online-structured-outputs" name = "examples-online-structured-outputs"
requires-python = ">=3.9, <3.13" requires-python = ">=3.10, <3.14"
dependencies = ["openai==1.78.1", "pydantic==2.11.4"] dependencies = ["openai==1.78.1", "pydantic==2.11.4"]
version = "0.0.0" version = "0.0.0"
......
# ruff: noqa: E501 # ruff: noqa: E501
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import argparse import argparse
import asyncio import asyncio
import enum import enum
import os import os
from typing import TYPE_CHECKING, Any, Literal from typing import Any, Literal
import openai import openai
import pydantic import pydantic
from openai.types.chat import ChatCompletionChunk
if TYPE_CHECKING:
from openai.types.chat import ChatCompletionChunk
ConstraintsFormat = Literal[ ConstraintsFormat = Literal[
"choice", "choice",
...@@ -39,7 +33,7 @@ async def print_stream_response( ...@@ -39,7 +33,7 @@ async def print_stream_response(
async for chunk in stream_response: async for chunk in stream_response:
delta = chunk.choices[0].delta delta = chunk.choices[0].delta
reasoning_chunk_text: str | None = getattr(delta, "reasoning_content", None) reasoning_chunk_text: str | None = getattr(delta, "reasoning", None)
content_chunk_text = delta.content content_chunk_text = delta.content
if args.reasoning: if args.reasoning:
...@@ -261,8 +255,8 @@ async def cli(): ...@@ -261,8 +255,8 @@ async def cli():
for constraint, response in zip(constraints, results): for constraint, response in zip(constraints, results):
print(f"\n\n{constraint}:") print(f"\n\n{constraint}:")
message = response.choices[0].message message = response.choices[0].message
if args.reasoning and hasattr(message, "reasoning_content"): if args.reasoning and hasattr(message, "reasoning"):
print(f" Reasoning: {message.reasoning_content or ''}") print(f" Reasoning: {message.reasoning or ''}")
print(f" Content: {message.content!r}") print(f" Content: {message.content!r}")
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import httpx
from transformers import AutoTokenizer
# URL that main() posts the generation request to.
GEN_ENDPOINT = "http://localhost:8000/inference/v1/generate"
# Placeholder value sent as the bearer token in the Authorization header.
DUMMY_API_KEY = "empty"
# Model identifier used both to load the tokenizer and in the request payload.
MODEL_NAME = "Qwen/Qwen3-0.6B"
transport = httpx.HTTPTransport()
headers = {"Authorization": f"Bearer {DUMMY_API_KEY}"}
# Module-level HTTP client handed to main() by the entry-point guard.
client = httpx.Client(
transport=transport,
base_url=GEN_ENDPOINT,
timeout=600,
headers=headers,
)
# Chat conversation that main() renders into token ids with the chat template.
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "How many countries are in the EU?"},
]
def main(client):
    """Tokenize the chat locally, request generation by token ids, and decode.

    The module-level ``messages`` are rendered to token ids with the model's
    chat template, posted to ``GEN_ENDPOINT`` with detokenization disabled,
    and the token ids of the first returned choice are decoded and printed.

    Args:
        client: HTTP client object exposing ``post(url, json=...)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    prompt_token_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    sampling_params = {"max_tokens": 24, "temperature": 0.2, "detokenize": False}
    request_body = {
        "model": MODEL_NAME,
        "token_ids": prompt_token_ids,
        "sampling_params": sampling_params,
        "stream": False,
    }

    http_response = client.post(GEN_ENDPOINT, json=request_body)
    http_response.raise_for_status()
    result = http_response.json()

    separator = "-" * 50
    print(result)
    print(separator)
    print("Token generation results:")
    # The server was asked not to detokenize, so decode the ids locally.
    generated_text = tokenizer.decode(result["choices"][0]["token_ids"])
    print(generated_text)
    print(separator)
# Script entry point: run the example with the module-level client.
if __name__ == "__main__":
    try:
        main(client)
    finally:
        # Close the client explicitly so its connections are released even
        # when main() raises (e.g. on an HTTP error status).
        client.close()
...@@ -37,7 +37,7 @@ from vllm.config import KVTransferConfig ...@@ -37,7 +37,7 @@ from vllm.config import KVTransferConfig
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
def setup_environment_variables(vllm_version: str): def setup_environment_variables():
# LMCache-related environment variables # LMCache-related environment variables
# Use experimental features in LMCache # Use experimental features in LMCache
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True" os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
...@@ -47,12 +47,10 @@ def setup_environment_variables(vllm_version: str): ...@@ -47,12 +47,10 @@ def setup_environment_variables(vllm_version: str):
os.environ["LMCACHE_LOCAL_CPU"] = "True" os.environ["LMCACHE_LOCAL_CPU"] = "True"
# Set local CPU memory limit to 5.0 GB # Set local CPU memory limit to 5.0 GB
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0" os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
if vllm_version == "v0":
os.environ["VLLM_USE_V1"] = "0"
@contextlib.contextmanager @contextlib.contextmanager
def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str): def build_llm_with_lmcache(lmcache_connector: str, model: str):
ktc = KVTransferConfig( ktc = KVTransferConfig(
kv_connector=lmcache_connector, kv_connector=lmcache_connector,
kv_role="kv_both", kv_role="kv_both",
...@@ -60,21 +58,12 @@ def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str ...@@ -60,21 +58,12 @@ def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory. # memory. Reduce the value if your GPU has less memory.
# Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392). # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
if vllm_version == "v0": llm_args = EngineArgs(
llm_args = EngineArgs( model=model,
model=model, kv_transfer_config=ktc,
kv_transfer_config=ktc, max_model_len=8000,
max_model_len=8000, gpu_memory_utilization=0.8,
gpu_memory_utilization=0.8, )
enable_chunked_prefill=True, # Only in v0
)
else:
llm_args = EngineArgs(
model=model,
kv_transfer_config=ktc,
max_model_len=8000,
gpu_memory_utilization=0.8,
)
llm = LLM(**asdict(llm_args)) llm = LLM(**asdict(llm_args))
try: try:
...@@ -116,18 +105,10 @@ def parse_args(): ...@@ -116,18 +105,10 @@ def parse_args():
def main(): def main():
args = parse_args() lmcache_connector = "LMCacheConnectorV1"
model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
if args.version == "v0": setup_environment_variables()
lmcache_connector = "LMCacheConnector" with build_llm_with_lmcache(lmcache_connector, model) as llm:
model = "mistralai/Mistral-7B-Instruct-v0.2"
else:
lmcache_connector = "LMCacheConnectorV1"
model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
setup_environment_variables(args.version)
with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:
# This example script runs two requests with a shared prefix. # This example script runs two requests with a shared prefix.
# Define the shared prompt and specific prompts # Define the shared prompt and specific prompts
shared_prompt = "Hello, how are you?" * 1000 shared_prompt = "Hello, how are you?" * 1000
......
...@@ -16,13 +16,11 @@ from vllm.model_executor.model_loader.tensorizer import ( ...@@ -16,13 +16,11 @@ from vllm.model_executor.model_loader.tensorizer import (
tensorize_vllm_model, tensorize_vllm_model,
tensorizer_kwargs_arg, tensorizer_kwargs_arg,
) )
from vllm.utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
logger = logging.getLogger() logger = logging.getLogger()
# yapf conflicts with isort for this docstring
# yapf: disable
""" """
tensorize_vllm_model.py is a script that can be used to serialize and tensorize_vllm_model.py is a script that can be used to serialize and
deserialize vLLM models. These models can be loaded using tensorizer deserialize vLLM models. These models can be loaded using tensorizer
...@@ -86,7 +84,7 @@ directly to load models: ...@@ -86,7 +84,7 @@ directly to load models:
from vllm import LLM from vllm import LLM
llm = LLM( llm = LLM(
"s3://my-bucket/vllm/facebook/opt-125m/v1", "s3://my-bucket/vllm/facebook/opt-125m/v1",
load_format="tensorizer" load_format="tensorizer",
) )
``` ```
...@@ -132,7 +130,8 @@ def get_parser(): ...@@ -132,7 +130,8 @@ def get_parser():
"can be loaded using tensorizer directly to the GPU " "can be loaded using tensorizer directly to the GPU "
"extremely quickly. Tensor encryption and decryption is " "extremely quickly. Tensor encryption and decryption is "
"also supported, although libsodium must be installed to " "also supported, although libsodium must be installed to "
"use it.") "use it."
)
parser = EngineArgs.add_cli_args(parser) parser = EngineArgs.add_cli_args(parser)
parser.add_argument( parser.add_argument(
...@@ -144,13 +143,14 @@ def get_parser(): ...@@ -144,13 +143,14 @@ def get_parser():
"along with the model by instantiating a TensorizerConfig object, " "along with the model by instantiating a TensorizerConfig object, "
"creating a dict from it with TensorizerConfig.to_serializable(), " "creating a dict from it with TensorizerConfig.to_serializable(), "
"and passing it to LoRARequest's initializer with the kwarg " "and passing it to LoRARequest's initializer with the kwarg "
"tensorizer_config_dict." "tensorizer_config_dict.",
) )
subparsers = parser.add_subparsers(dest='command', required=True) subparsers = parser.add_subparsers(dest="command", required=True)
serialize_parser = subparsers.add_parser( serialize_parser = subparsers.add_parser(
'serialize', help="Serialize a model to `--serialized-directory`") "serialize", help="Serialize a model to `--serialized-directory`"
)
serialize_parser.add_argument( serialize_parser.add_argument(
"--suffix", "--suffix",
...@@ -163,7 +163,9 @@ def get_parser(): ...@@ -163,7 +163,9 @@ def get_parser():
"`--suffix` is `v1`, the serialized model tensors will be " "`--suffix` is `v1`, the serialized model tensors will be "
"saved to " "saved to "
"`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. " "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
"If none is provided, a random UUID will be used.")) "If none is provided, a random UUID will be used."
),
)
serialize_parser.add_argument( serialize_parser.add_argument(
"--serialized-directory", "--serialized-directory",
type=str, type=str,
...@@ -175,108 +177,127 @@ def get_parser(): ...@@ -175,108 +177,127 @@ def get_parser():
"and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will " "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
"be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, " "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
"where `suffix` is given by `--suffix` or a random UUID if not " "where `suffix` is given by `--suffix` or a random UUID if not "
"provided.") "provided.",
)
serialize_parser.add_argument( serialize_parser.add_argument(
"--serialization-kwargs", "--serialization-kwargs",
type=tensorizer_kwargs_arg, type=tensorizer_kwargs_arg,
required=False, required=False,
help=("A JSON string containing additional keyword arguments to " help=(
"pass to Tensorizer's TensorSerializer during " "A JSON string containing additional keyword arguments to "
"serialization.")) "pass to Tensorizer's TensorSerializer during "
"serialization."
),
)
serialize_parser.add_argument( serialize_parser.add_argument(
"--keyfile", "--keyfile",
type=str, type=str,
required=False, required=False,
help=("Encrypt the model weights with a randomly-generated binary key," help=(
" and save the key at this path")) "Encrypt the model weights with a randomly-generated binary key,"
" and save the key at this path"
),
)
deserialize_parser = subparsers.add_parser( deserialize_parser = subparsers.add_parser(
'deserialize', "deserialize",
help=("Deserialize a model from `--path-to-tensors`" help=(
" to verify it can be loaded and used.")) "Deserialize a model from `--path-to-tensors`"
" to verify it can be loaded and used."
),
)
deserialize_parser.add_argument( deserialize_parser.add_argument(
"--path-to-tensors", "--path-to-tensors",
type=str, type=str,
required=False, required=False,
help="The local path or S3 URI to the model tensors to deserialize. ") help="The local path or S3 URI to the model tensors to deserialize. ",
)
deserialize_parser.add_argument( deserialize_parser.add_argument(
"--serialized-directory", "--serialized-directory",
type=str, type=str,
required=False, required=False,
help="Directory with model artifacts for loading. Assumes a " help="Directory with model artifacts for loading. Assumes a "
"model.tensors file exists therein. Can supersede " "model.tensors file exists therein. Can supersede "
"--path-to-tensors.") "--path-to-tensors.",
)
deserialize_parser.add_argument( deserialize_parser.add_argument(
"--keyfile", "--keyfile",
type=str, type=str,
required=False, required=False,
help=("Path to a binary key to use to decrypt the model weights," help=(
" if the model was serialized with encryption")) "Path to a binary key to use to decrypt the model weights,"
" if the model was serialized with encryption"
),
)
deserialize_parser.add_argument( deserialize_parser.add_argument(
"--deserialization-kwargs", "--deserialization-kwargs",
type=tensorizer_kwargs_arg, type=tensorizer_kwargs_arg,
required=False, required=False,
help=("A JSON string containing additional keyword arguments to " help=(
"pass to Tensorizer's `TensorDeserializer` during " "A JSON string containing additional keyword arguments to "
"deserialization.")) "pass to Tensorizer's `TensorDeserializer` during "
"deserialization."
),
)
TensorizerArgs.add_cli_args(deserialize_parser) TensorizerArgs.add_cli_args(deserialize_parser)
return parser return parser
def merge_extra_config_with_tensorizer_config(extra_cfg: dict, cfg: TensorizerConfig):
    """Apply overrides from ``--model-loader-extra-config`` onto ``cfg``.

    Each key of ``extra_cfg`` that already exists as an attribute on ``cfg``
    is assigned its value and an info line is logged for it. Keys that do
    not match an attribute are skipped without any message. ``cfg`` is
    mutated in place; nothing is returned.
    """
    for option_name, option_value in extra_cfg.items():
        # Guard clause: only touch attributes the config object already has.
        if not hasattr(cfg, option_name):
            continue
        setattr(cfg, option_name, option_value)
        logger.info(
            "Updating TensorizerConfig with %s from "
            "--model-loader-extra-config provided",
            option_name,
        )
def deserialize(args, tensorizer_config):
    """Load a tensorized model into an ``LLM`` and return it.

    Without ``args.lora_path`` the engine is simply constructed with
    ``load_format="tensorizer"`` and returned.

    With ``args.lora_path`` set, ``tensorizer_config.lora_dir`` is pointed at
    ``tensorizer_config.tensorizer_dir``, the engine is built with LoRA
    enabled, and one sample prompt is generated through a ``LoRARequest``;
    the generation result is printed before the engine is returned.
    """
    if not args.lora_path:
        # Plain model load: no LoRA adapter involved.
        return LLM(
            model=args.model,
            load_format="tensorizer",
            tensor_parallel_size=args.tensor_parallel_size,
            model_loader_extra_config=tensorizer_config,
        )

    # The adapter is looked up in the same directory as the model tensors.
    tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
    engine = LLM(
        model=args.model,
        load_format="tensorizer",
        tensor_parallel_size=args.tensor_parallel_size,
        model_loader_extra_config=tensorizer_config,
        enable_lora=True,
    )

    sampling_params = SamplingParams(
        temperature=0,
        max_tokens=256,
        stop=["[/assistant]"],
    )

    # The prompt is deliberately truncated; the extra text isn't necessary.
    prompts = ["[user] Write a SQL query to answer the question based on ..."]

    # Exercise the LoRA load path with a single generation request.
    adapter_request = LoRARequest(
        "sql-lora",
        1,
        args.lora_path,
        tensorizer_config_dict=tensorizer_config.to_serializable(),
    )
    outputs = engine.generate(
        prompts,
        sampling_params,
        lora_request=adapter_request,
    )
    print(outputs)
    return engine
...@@ -285,17 +306,20 @@ def main(): ...@@ -285,17 +306,20 @@ def main():
parser = get_parser() parser = get_parser()
args = parser.parse_args() args = parser.parse_args()
s3_access_key_id = (getattr(args, 's3_access_key_id', None) s3_access_key_id = getattr(args, "s3_access_key_id", None) or os.environ.get(
or os.environ.get("S3_ACCESS_KEY_ID", None)) "S3_ACCESS_KEY_ID", None
s3_secret_access_key = (getattr(args, 's3_secret_access_key', None) )
or os.environ.get("S3_SECRET_ACCESS_KEY", None)) s3_secret_access_key = getattr(
s3_endpoint = (getattr(args, 's3_endpoint', None) args, "s3_secret_access_key", None
or os.environ.get("S3_ENDPOINT_URL", None)) ) or os.environ.get("S3_SECRET_ACCESS_KEY", None)
s3_endpoint = getattr(args, "s3_endpoint", None) or os.environ.get(
"S3_ENDPOINT_URL", None
)
credentials = { credentials = {
"s3_access_key_id": s3_access_key_id, "s3_access_key_id": s3_access_key_id,
"s3_secret_access_key": s3_secret_access_key, "s3_secret_access_key": s3_secret_access_key,
"s3_endpoint": s3_endpoint "s3_endpoint": s3_endpoint,
} }
model_ref = args.model model_ref = args.model
...@@ -309,25 +333,25 @@ def main(): ...@@ -309,25 +333,25 @@ def main():
if args.model_loader_extra_config: if args.model_loader_extra_config:
extra_config = json.loads(args.model_loader_extra_config) extra_config = json.loads(args.model_loader_extra_config)
tensorizer_dir = args.serialized_directory or extra_config.get("tensorizer_dir")
tensorizer_dir = (args.serialized_directory or tensorizer_uri = getattr(args, "path_to_tensors", None) or extra_config.get(
extra_config.get("tensorizer_dir")) "tensorizer_uri"
tensorizer_uri = (getattr(args, "path_to_tensors", None) )
or extra_config.get("tensorizer_uri"))
if tensorizer_dir and tensorizer_uri: if tensorizer_dir and tensorizer_uri:
parser.error("--serialized-directory and --path-to-tensors " parser.error(
"cannot both be provided") "--serialized-directory and --path-to-tensors cannot both be provided"
)
if not tensorizer_dir and not tensorizer_uri: if not tensorizer_dir and not tensorizer_uri:
parser.error("Either --serialized-directory or --path-to-tensors " parser.error(
"must be provided") "Either --serialized-directory or --path-to-tensors must be provided"
)
if args.command == "serialize": if args.command == "serialize":
engine_args = EngineArgs.from_cli_args(args) engine_args = EngineArgs.from_cli_args(args)
input_dir = tensorizer_dir.rstrip('/') input_dir = tensorizer_dir.rstrip("/")
suffix = args.suffix if args.suffix else uuid.uuid4().hex suffix = args.suffix if args.suffix else uuid.uuid4().hex
base_path = f"{input_dir}/vllm/{model_ref}/{suffix}" base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
if engine_args.tensor_parallel_size > 1: if engine_args.tensor_parallel_size > 1:
...@@ -339,15 +363,14 @@ def main(): ...@@ -339,15 +363,14 @@ def main():
tensorizer_uri=model_path, tensorizer_uri=model_path,
encryption_keyfile=keyfile, encryption_keyfile=keyfile,
serialization_kwargs=args.serialization_kwargs or {}, serialization_kwargs=args.serialization_kwargs or {},
**credentials **credentials,
) )
if args.lora_path: if args.lora_path:
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
tensorize_lora_adapter(args.lora_path, tensorizer_config) tensorize_lora_adapter(args.lora_path, tensorizer_config)
merge_extra_config_with_tensorizer_config(extra_config, merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
tensorizer_config)
tensorize_vllm_model(engine_args, tensorizer_config) tensorize_vllm_model(engine_args, tensorizer_config)
elif args.command == "deserialize": elif args.command == "deserialize":
...@@ -356,11 +379,10 @@ def main(): ...@@ -356,11 +379,10 @@ def main():
tensorizer_dir=args.serialized_directory, tensorizer_dir=args.serialized_directory,
encryption_keyfile=keyfile, encryption_keyfile=keyfile,
deserialization_kwargs=args.deserialization_kwargs or {}, deserialization_kwargs=args.deserialization_kwargs or {},
**credentials **credentials,
) )
merge_extra_config_with_tensorizer_config(extra_config, merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
tensorizer_config)
deserialize(args, tensorizer_config) deserialize(args, tensorizer_config)
else: else:
raise ValueError("Either serialize or deserialize must be specified.") raise ValueError("Either serialize or deserialize must be specified.")
......
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed
# Top-level ruff settings for this directory.
[tool.ruff]
# Overrides the line length used by the main pyproject.toml.
line-length = 88
# Paths ruff should leave alone entirely.
exclude = [
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py",
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
]
# Rule codes suppressed for specific paths ("ALL" suppresses every rule).
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
# F401 is the Pyflakes "imported but unused" rule.
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
# Lint rule families that are enabled, and individual rules that are opted out.
[tool.ruff.lint]
select = [
# pycodestyle
"E",
# Pyflakes
"F",
# pyupgrade
"UP",
# flake8-bugbear
"B",
# flake8-simplify
"SIM",
# isort
"I",
# flake8-logging-format
"G",
]
ignore = [
# star imports
"F405", "F403",
# lambda expression assignment
"E731",
# Loop control variable not used within loop body
"B007",
# f-string format
"UP032",
# Can remove once 3.10+ is the minimum Python version
"UP007",
]
# Import sorting: treat the `vllm` package as first-party code.
[tool.ruff.lint.isort]
known-first-party = ["vllm"]
# Formatter: also format code examples embedded in docstrings.
[tool.ruff.format]
docstring-code-format = true
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment