Commit 53076d70 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.2' into v0.8.2-ori

parents 322a0be6 9c5c81b0
......@@ -29,6 +29,15 @@ completion = client.chat.completions.create(
print(completion.choices[0].message)
```
:::{tip}
vLLM supports some parameters that are not supported by OpenAI, `top_k` for example.
You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, e.g. `extra_body={"top_k": 50}` for `top_k`.
:::
:::{important}
By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
To disable this behavior, please pass `--generation-config vllm` when launching the server.
:::
## Supported APIs
We currently support the following OpenAI APIs:
......
......@@ -83,7 +83,6 @@ def initialize_engine(model: str, quantization: str,
engine_args = EngineArgs(model=model,
quantization=quantization,
qlora_adapter_name_or_path=lora_repo,
load_format="bitsandbytes",
enable_lora=True,
max_lora_rank=64)
else:
......
......@@ -6,14 +6,16 @@ import argparse
from vllm import LLM
from vllm.sampling_params import SamplingParams
# This script is an offline demo for running Mistral-Small-3
# This script is an offline demo for running Mistral-Small-3.1
#
# If you want to run a server/client setup, please follow this code:
#
# - Server:
#
# ```bash
# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
# --tokenizer-mode mistral --config-format mistral --load-format mistral \
# --limit-mm-per-prompt 'image=4' --max-model-len 16384
# ```
#
# - Client:
......@@ -51,6 +53,8 @@ def run_simple_demo(args: argparse.Namespace):
llm = LLM(
model=model_name,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral",
max_model_len=4096,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
......@@ -91,6 +95,8 @@ def run_advanced_demo(args: argparse.Namespace):
llm = LLM(
model=model_name,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral",
limit_mm_per_prompt={"image": max_img_per_msg},
max_model_len=max_img_per_msg * max_tokens_per_img,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
......
......@@ -50,7 +50,9 @@ if __name__ == "__main__":
# Create an LLM with spec decoding
llm = LLM(
model="meta-llama/Llama-2-13b-chat-hf",
speculative_model="ibm-ai-platform/llama-13b-accelerator",
speculative_config={
"model": "ibm-ai-platform/llama-13b-accelerator",
},
)
print("With speculation")
......
# SPDX-License-Identifier: Apache-2.0
import os
from vllm import LLM, SamplingParams
# For the sake of performance, vLLM does not guarantee reproducible results
# by default. Two settings are needed to get reproducible results:
#
# 1. Turn off multiprocessing so that scheduling is deterministic.
#    NOTE(woosuk): This is not needed and will be ignored for V0.
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"

# 2. Fix the global seed. The default seed is None, which is not
#    reproducible.
SEED = 42

# NOTE(woosuk): Even with the above two settings, vLLM only provides
# reproducibility when it runs on the same hardware and the same vLLM version.
# Also, the online serving API (`vllm serve`) does not support reproducibility
# because it is almost impossible to make the scheduling deterministic in the
# online serving setting.
llm = LLM(model="facebook/opt-125m", seed=SEED)

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Generate once for the whole batch, then echo every prompt next to the
# first completion returned for it.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    prompt, generated_text = output.prompt, output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
......@@ -169,7 +169,6 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
max_model_len=2048,
max_num_seqs=2,
# Default is False; setting it to True is not supported in V1 yet
mm_processor_kwargs={"do_pan_and_scan": True},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
......
......@@ -91,8 +91,6 @@ def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
model=model_name,
max_model_len=8192,
max_num_seqs=2,
# Default is False; setting it to True is not supported in V1 yet
mm_processor_kwargs={"do_pan_and_scan": True},
limit_mm_per_prompt={"image": len(image_urls)},
)
......
......@@ -8,6 +8,9 @@ set -xe
echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧"
sleep 1
# meta-llama/Meta-Llama-3.1-8B-Instruct or deepseek-ai/DeepSeek-V2-Lite
MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct}
# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'cleanup' INT
......@@ -44,18 +47,20 @@ wait_for_server() {
# You can also adjust --kv-ip and --kv-port for distributed inference.
# prefilling instance, which is the KV producer
CUDA_VISIBLE_DEVICES=0 vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \
CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
--port 8100 \
--max-model-len 100 \
--gpu-memory-utilization 0.8 \
--trust-remote-code \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &
# decoding instance, which is the KV consumer
CUDA_VISIBLE_DEVICES=1 vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \
CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
--port 8200 \
--max-model-len 100 \
--gpu-memory-utilization 0.8 \
--trust-remote-code \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &
......@@ -78,7 +83,7 @@ sleep 1
output1=$(curl -X POST -s http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "'"$MODEL_NAME"'",
"prompt": "San Francisco is a",
"max_tokens": 10,
"temperature": 0
......@@ -87,7 +92,7 @@ output1=$(curl -X POST -s http://localhost:8000/v1/completions \
output2=$(curl -X POST -s http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "'"$MODEL_NAME"'",
"prompt": "Santa Clara is a",
"max_tokens": 10,
"temperature": 0
......
# SPDX-License-Identifier: Apache-2.0
"""
An example that demonstrates how to use tool calling with reasoning models
like QwQ-32B. The reasoning_content will not be parsed by the tool
calling process; only the final output will be parsed.
To run this example, you need to start the vLLM server with both
the reasoning parser and tool calling enabled.
```bash
vllm serve Qwen/QwQ-32B \
--enable-reasoning --reasoning-parser deepseek_r1 \
--enable-auto-tool-choice --tool-call-parser hermes
```
"""
from openai import OpenAI
# Now, simulate a tool call
def get_current_weather(city: str, state: str, unit: 'str'):
    """Return a canned weather report, standing in for a real lookup.

    All three arguments are accepted but ignored: the same fixed report for
    Dallas, Texas is returned on every call.
    """
    canned_report = "".join((
        "The weather in Dallas, Texas is 85 degrees fahrenheit. It is ",
        "partly cloudly, with highs in the 90's.",
    ))
    return canned_report
# Registry mapping each tool name to the local callable that implements it.
available_tools = {"get_current_weather": get_current_weather}

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

# Use the first model that the server lists.
models = client.models.list()
model = models.data[0].id

# JSON-schema description of the arguments accepted by the weather tool.
weather_parameters = {
    "type": "object",
    "properties": {
        "city": {
            "type": "string",
            "description":
            "The city to find the weather for, e.g. 'San Francisco'",
        },
        "state": {
            "type": "string",
            "description":
            "the two-letter abbreviation for the state that the city is"
            " in, e.g. 'CA' which would mean 'California'",
        },
        "unit": {
            "type": "string",
            "description": "The unit to fetch the temperature in",
            "enum": ["celsius", "fahrenheit"],
        },
    },
    "required": ["city", "state", "unit"],
}

# Tool definitions passed with every chat-completion request below.
tools = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": weather_parameters,
    },
}]

# Conversation history; the final user turn is the one that should trigger
# the weather tool.
messages = [
    {
        "role": "user",
        "content": "Hi! How are you doing today?",
    },
    {
        "role": "assistant",
        "content": "I'm doing well! How can I help you?",
    },
    {
        "role": "user",
        "content":
        "Can you tell me what the temperate will be in Dallas, in fahrenheit?",
    },
]
def extract_reasoning_and_calls(chunks: list):
    """Reassemble reasoning text and tool calls from streamed chat chunks.

    Args:
        chunks: Streamed chat-completion chunks in arrival order. Each chunk
            is read through ``choices[0].delta``, which must expose a
            ``tool_calls`` attribute and may expose ``reasoning_content``.

    Returns:
        A ``(reasoning_content, arguments, function_names)`` tuple:
        the concatenated reasoning text, and two parallel lists holding the
        accumulated argument string and the function name of each tool call.
    """
    reasoning_content = ""
    tool_call_idx = -1
    arguments = []
    function_names = []
    for chunk in chunks:
        # A chunk can arrive with an empty ``choices`` list; indexing it
        # would raise IndexError, and it carries nothing to collect.
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta
        if delta.tool_calls:
            # Only the first tool-call fragment of each chunk is inspected.
            tool_call = delta.tool_calls[0]
            if tool_call.index != tool_call_idx:
                # A new index marks the start of the next tool call, so open
                # fresh accumulator slots for it.
                tool_call_idx = tool_call.index
                arguments.append("")
                function_names.append("")
            if tool_call.function:
                if tool_call.function.name:
                    function_names[tool_call_idx] = tool_call.function.name
                if tool_call.function.arguments:
                    arguments[tool_call_idx] += tool_call.function.arguments
        else:
            # ``reasoning_content`` may be absent or present-but-None; a bare
            # hasattr() check would let None through and break the ``+=``.
            reasoning_piece = getattr(delta, "reasoning_content", None)
            if reasoning_piece:
                reasoning_content += reasoning_piece
    return reasoning_content, arguments, function_names
# Tool choice that forces the model to call the weather function by name.
named_tool_choice = {
    "type": "function",
    "function": {
        "name": "get_current_weather"
    },
}

# 1) Non-streaming request; the model decides whether to call a tool.
print("---------Full Generate With Automatic Function Calling-------------")
tool_calls = client.chat.completions.create(messages=messages,
                                            model=model,
                                            tools=tools)
message = tool_calls.choices[0].message
print(f"reasoning_content: {message.reasoning_content}")
auto_call = message.tool_calls[0].function
print(f"function name: {auto_call.name}")
print(f"function arguments: {auto_call.arguments}")

# 2) Same request, streamed; the pieces are stitched back together by
#    extract_reasoning_and_calls.
print("----------Stream Generate With Automatic Function Calling-----------")
tool_calls_stream = client.chat.completions.create(messages=messages,
                                                   model=model,
                                                   tools=tools,
                                                   stream=True)
chunks = list(tool_calls_stream)
reasoning_content, arguments, function_names = extract_reasoning_and_calls(
    chunks)
print(f"reasoning_content: {reasoning_content}")
print(f"function name: {function_names[0]}")
print(f"function arguments: {arguments[0]}")

# 3) Non-streaming request with the tool named explicitly.
print("----------Full Generate With Named Function Calling-----------------")
tool_calls = client.chat.completions.create(messages=messages,
                                            model=model,
                                            tools=tools,
                                            tool_choice=named_tool_choice)
tool_call = tool_calls.choices[0].message.tool_calls[0].function
print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
print(f"function name: {tool_call.name}")
print(f"function arguments: {tool_call.arguments}")

# 4) Streamed request with the tool named explicitly.
print("----------Stream Generate With Named Function Calling--------------")
tool_calls_stream = client.chat.completions.create(
    messages=messages,
    model=model,
    tools=tools,
    tool_choice=named_tool_choice,
    stream=True)
chunks = list(tool_calls_stream)
reasoning_content, arguments, function_names = extract_reasoning_and_calls(
    chunks)
print(f"reasoning_content: {reasoning_content}")
print(f"function name: {function_names[0]}")
print(f"function arguments: {arguments[0]}")
print("\n\n")
{#- Chat template. Each message is rendered as a role tag followed by its
    content: '<_user>' and '<_system>' content is trimmed, '<_bot>'
    (assistant) content is emitted as-is, and messages with any other role
    produce no output. The trim filter binds tighter than '+', so it applies
    to the content only, not to the tag. When add_generation_prompt is
    truthy, a trailing '<_bot>' tag is appended. -#}
{%- for message in messages %}
{%- if message['role'] == 'user' %}
{{- '<_user>' + message['content']|trim }}
{%- elif message['role'] == 'system' %}
{{- '<_system>' + message['content']|trim }}
{%- elif message['role'] == 'assistant' %}
{{- '<_bot>' + message['content'] }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<_bot>' }}
{%- endif %}
......@@ -18,6 +18,7 @@ pillow # Required for image processing
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer >= 0.10.11, < 0.11
llguidance >= 0.7.2, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
outlines == 0.1.11
lark == 1.2.2
xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64"
......
......@@ -3,7 +3,8 @@
# Dependencies for CPUs
torch==2.6.0+cpu; platform_machine == "x86_64"
torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
torch==2.6.0; platform_system == "Darwin"
torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64"
torch==2.7.0.dev20250304; platform_machine == "s390x"
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
......
# Common dependencies
-r common.txt
torch == 2.5.1 # should be aligned with "common" vLLM torch version
openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention
optimum @ git+https://github.com/huggingface/optimum.git # latest optimum is used to support latest transformers version
optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git # latest optimum-intel is used to support latest transformers version
# Common dependencies
-r common.txt
--extra-index-url https://download.pytorch.org/whl/rocm6.2
torch==2.5.1
torchvision==0.20.1
torchaudio==2.5.1
--extra-index-url https://download.pytorch.org/whl/rocm6.2.4
torch==2.6.0
torchvision==0.21.0
torchaudio==2.6.0
cmake>=3.26
packaging
......
......@@ -30,7 +30,7 @@ matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.5.4 # required for pixtral test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.4 # required for model evaluation test
transformers==4.48.2
transformers==4.48.2
# quantization
bitsandbytes>=0.45.3
buildkite-test-collector==0.1.9
......
......@@ -17,9 +17,9 @@ ray[data]
--find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
......@@ -449,10 +449,6 @@ def _is_cpu() -> bool:
return VLLM_TARGET_DEVICE == "cpu"
def _is_openvino() -> bool:
return VLLM_TARGET_DEVICE == "openvino"
def _is_xpu() -> bool:
return VLLM_TARGET_DEVICE == "xpu"
......@@ -572,8 +568,6 @@ def get_vllm_version() -> str:
if gaudi_sw_version != MAIN_CUDA_VERSION:
gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
version += f"{sep}gaudi{gaudi_sw_version}"
elif _is_openvino():
version += f"{sep}openvino"
elif _is_tpu():
version += f"{sep}tpu"
elif _is_cpu():
......@@ -623,8 +617,6 @@ def get_requirements() -> list[str]:
requirements = _read_requirements("neuron.txt")
elif _is_hpu():
requirements = _read_requirements("hpu.txt")
elif _is_openvino():
requirements = _read_requirements("openvino.txt")
elif _is_tpu():
requirements = _read_requirements("tpu.txt")
elif _is_cpu():
......@@ -634,7 +626,7 @@ def get_requirements() -> list[str]:
else:
raise ValueError(
"Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
"OpenVINO, or CPU.")
"or CPU.")
return requirements
......
......@@ -4,34 +4,38 @@ import pickle
import pytest
import torch
from torch._inductor.codecache import BypassFxGraphCache
from vllm.compilation.config import CompilationConfig
from vllm.compilation.inductor_pass import (CallableInductorPass,
as_inductor_pass)
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
from vllm.compilation.pass_manager import PostGradPassManager
from vllm.config import CompilationConfig
def simple_callable(graph: torch.fx.Graph):
pass
@as_inductor_pass(files=(__file__, ))
def callable_decorated(graph: torch.fx.Graph):
pass
callable_uuid = CallableInductorPass(simple_callable,
InductorPass.hash_source(__file__))
@pytest.mark.parametrize(
"works, callable",
[(False, simple_callable), (True, callable_decorated),
(True, CallableInductorPass(simple_callable, "simple_callable"))])
[
(False, simple_callable),
(True, callable_uuid),
(True, CallableInductorPass(simple_callable)),
],
)
def test_pass_manager(works: bool, callable):
config = CompilationConfig().pass_config
pass_manager = PostGradPassManager([callable])
pass_manager.configure(config) # Adds default passes
pass_manager = PostGradPassManager()
pass_manager.configure(config)
# Try to add the callable to the pass manager
if works:
pass_manager.add(callable)
pickle.dumps(pass_manager)
else:
with pytest.raises(BypassFxGraphCache):
pickle.dumps(pass_manager)
with pytest.raises(AssertionError):
pass_manager.add(callable)
......@@ -273,8 +273,7 @@ class HfRunner:
def get_default_device(self):
from vllm.platforms import current_platform
return ("cpu" if current_platform.is_cpu()
or current_platform.is_openvino() else "cuda")
return ("cpu" if current_platform.is_cpu() else "cuda")
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
if x is None or isinstance(x, (bool, )):
......
......@@ -9,6 +9,8 @@ import torch.distributed as dist
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import get_world_group
dist.init_process_group(backend="gloo")
# Create prompts
prompts = [
"Hello, my name is",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment