Commit fcfc474d authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.3' into v0.8.3-dev

parents bb94d2e5 296c6572
...@@ -60,6 +60,28 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData: ...@@ -60,6 +60,28 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
) )
# Aya Vision
def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for the Aya Vision example.

    Args:
        questions: Question texts, one prompt is produced per entry.
        modality: Must be ``"image"``; anything else trips the assertion.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and the prompts.
    """
    assert modality == "image"

    model_name = "CohereForAI/aya-vision-8b"
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"crop_to_patches": True},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    # User turn (image placeholder + question) followed by an open chatbot turn.
    turn_open = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>"
    turn_close = ("<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|>"
                  "<|CHATBOT_TOKEN|>")
    prompts = [f"{turn_open}{question}{turn_close}" for question in questions]

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
# BLIP-2 # BLIP-2
def run_blip2(questions: list[str], modality: str) -> ModelRequestData: def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
...@@ -68,7 +90,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData: ...@@ -68,7 +90,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
prompts = [f"Question: {question} Answer:" for question in questions] prompts = [f"Question: {question} Answer:" for question in questions]
engine_args = EngineArgs( engine_args = EngineArgs(
model="Salesforce/blip2-opt-2.7b", model="Salesforce/blip2-opt-6.7b",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
) )
...@@ -128,7 +150,8 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData: ...@@ -128,7 +150,8 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs( engine_args = EngineArgs(
model="microsoft/Florence-2-large", model="microsoft/Florence-2-large",
tokenizer="facebook/bart-large", tokenizer="facebook/bart-large",
max_num_seqs=8, max_model_len=4096,
max_num_seqs=2,
trust_remote_code=True, trust_remote_code=True,
dtype="bfloat16", dtype="bfloat16",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
...@@ -361,6 +384,7 @@ def run_llava_next_video(questions: list[str], ...@@ -361,6 +384,7 @@ def run_llava_next_video(questions: list[str],
engine_args = EngineArgs( engine_args = EngineArgs(
model="llava-hf/LLaVA-NeXT-Video-7B-hf", model="llava-hf/LLaVA-NeXT-Video-7B-hf",
max_model_len=8192, max_model_len=8192,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
) )
...@@ -496,6 +520,29 @@ def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData: ...@@ -496,6 +520,29 @@ def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6") return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6")
# Mistral-3 HF-format
def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for the HF-format Mistral-3 example.

    Args:
        questions: Question texts, one prompt is produced per entry.
        modality: Must be ``"image"``; anything else trips the assertion.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and the prompts.
    """
    assert modality == "image"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_args = EngineArgs(
        model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    # The image placeholder sits on its own line after the question text.
    prompts = []
    for question in questions:
        prompts.append(f"<s>[INST]{question}\n[IMG][/INST]")

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
# LLama 3.2 # LLama 3.2
def run_mllama(questions: list[str], modality: str) -> ModelRequestData: def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
...@@ -510,7 +557,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData: ...@@ -510,7 +557,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=16, max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
) )
...@@ -535,6 +582,42 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData: ...@@ -535,6 +582,42 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
) )
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and chat-templated prompts for Llama 4 Scout.

    Args:
        questions: Question texts, one conversation is built per entry.
        modality: Must be ``"image"``; anything else trips the assertion.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the prompts rendered
        by the tokenizer's chat template, and no custom stop token ids.
    """
    assert modality == "image"

    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=4,
        tensor_parallel_size=8,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
        gpu_memory_utilization=0.4,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # One single-turn conversation per question: an image part, then the text.
    messages = [[{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": question},
        ],
    }] for question in questions]
    prompts = tokenizer.apply_chat_template(messages,
                                            add_generation_prompt=True,
                                            tokenize=False)

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=None,
    )
# Molmo # Molmo
def run_molmo(questions: list[str], modality: str) -> ModelRequestData: def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
...@@ -699,7 +782,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData: ...@@ -699,7 +782,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
# NOTE: Need L40 (or equivalent) to avoid OOM # NOTE: Need L40 (or equivalent) to avoid OOM
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=6144,
max_num_seqs=2, max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
) )
...@@ -803,8 +886,44 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -803,8 +886,44 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
) )
# SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments, prompts and stop tokens for Skywork-R1V.

    Args:
        questions: Question texts, one conversation is built per entry.
        modality: Must be ``"image"``; anything else trips the assertion.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the chat-templated
        prompts and the ids of the model's stop tokens.
    """
    assert modality == "image"

    model_name = "Skywork/Skywork-R1V-38B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)

    # Each conversation is a single user turn: image placeholder, then question.
    messages = []
    for question in questions:
        user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
        messages.append([user_turn])
    prompts = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for SkyworkR1V
    # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py
    stop_tokens = ["<|end▁of▁sentence|>", "<|endoftext|>"]
    stop_token_ids = [
        tokenizer.convert_tokens_to_ids(token) for token in stop_tokens
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
model_example_map = { model_example_map = {
"aria": run_aria, "aria": run_aria,
"aya_vision": run_aya_vision,
"blip-2": run_blip2, "blip-2": run_blip2,
"chameleon": run_chameleon, "chameleon": run_chameleon,
"deepseek_vl_v2": run_deepseek_vl2, "deepseek_vl_v2": run_deepseek_vl2,
...@@ -822,7 +941,9 @@ model_example_map = { ...@@ -822,7 +941,9 @@ model_example_map = {
"mantis": run_mantis, "mantis": run_mantis,
"minicpmo": run_minicpmo, "minicpmo": run_minicpmo,
"minicpmv": run_minicpmv, "minicpmv": run_minicpmv,
"mistral3": run_mistral3,
"mllama": run_mllama, "mllama": run_mllama,
"llama4": run_llama4,
"molmo": run_molmo, "molmo": run_molmo,
"NVLM_D": run_nvlm_d, "NVLM_D": run_nvlm_d,
"paligemma": run_paligemma, "paligemma": run_paligemma,
...@@ -833,6 +954,7 @@ model_example_map = { ...@@ -833,6 +954,7 @@ model_example_map = {
"qwen_vl": run_qwen_vl, "qwen_vl": run_qwen_vl,
"qwen2_vl": run_qwen2_vl, "qwen2_vl": run_qwen2_vl,
"qwen2_5_vl": run_qwen2_5_vl, "qwen2_5_vl": run_qwen2_5_vl,
"skywork_chat": run_skyworkr1v,
} }
......
...@@ -61,6 +61,41 @@ def load_aria(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -61,6 +61,41 @@ def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
) )
def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for the Aya Vision example.

    Args:
        question: Text appended after all image parts in the user turn.
        image_urls: URLs of the images; their count also sets the per-prompt
            image limit passed to the engine.

    Returns:
        A ``ModelRequestData`` with engine arguments, the chat-templated
        prompt and the fetched images.
    """
    model_name = "CohereForAI/aya-vision-8b"

    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # One image part per URL, followed by a single text part.
    image_parts = [{"type": "image", "image": url} for url in image_urls]
    text_part = {"type": "text", "text": question}
    messages = [{"role": "user", "content": image_parts + [text_part]}]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
def load_deepseek_vl2(question: str, def load_deepseek_vl2(question: str,
image_urls: list[str]) -> ModelRequestData: image_urls: list[str]) -> ModelRequestData:
model_name = "deepseek-ai/deepseek-vl2-tiny" model_name = "deepseek-ai/deepseek-vl2-tiny"
...@@ -218,6 +253,65 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -218,6 +253,65 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
) )
def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for the Llama 4 Scout example.

    Args:
        question: Text appended after all image parts in the user turn.
        image_urls: URLs of the images; their count also sets the per-prompt
            image limit passed to the engine.

    Returns:
        A ``ModelRequestData`` with engine arguments, the chat-templated
        prompt and the fetched images.
    """
    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=4,
        tensor_parallel_size=8,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # One image part per URL, followed by a single text part.
    image_parts = [{"type": "image", "image": url} for url in image_urls]
    text_part = {"type": "text", "text": question}
    messages = [{"role": "user", "content": image_parts + [text_part]}]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for the HF-format Mistral-3 example.

    Args:
        question: Text placed before the image placeholders.
        image_urls: URLs of the images; one ``[IMG]`` placeholder is emitted
            per URL and their count sets the per-prompt image limit.

    Returns:
        A ``ModelRequestData`` with engine arguments, the prompt and the
        fetched images.
    """
    image_count = len(image_urls)

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": image_count},
    )

    placeholders = "".join("[IMG]" for _ in range(image_count))
    prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData: def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
...@@ -229,8 +323,8 @@ def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -229,8 +323,8 @@ def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
) )
placeholders = "<|image|>" * len(image_urls) img_prompt = "Given the first image <|image|> and the second image<|image|>"
prompt = f"{placeholders}<|begin_of_text|>{question}" prompt = f"<|begin_of_text|>{img_prompt}, {question}?"
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, engine_args=engine_args,
prompt=prompt, prompt=prompt,
...@@ -504,11 +598,14 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -504,11 +598,14 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
model_example_map = { model_example_map = {
"aria": load_aria, "aria": load_aria,
"aya_vision": load_aya_vision,
"deepseek_vl_v2": load_deepseek_vl2, "deepseek_vl_v2": load_deepseek_vl2,
"gemma3": load_gemma3, "gemma3": load_gemma3,
"h2ovl_chat": load_h2ovl, "h2ovl_chat": load_h2ovl,
"idefics3": load_idefics3, "idefics3": load_idefics3,
"internvl_chat": load_internvl, "internvl_chat": load_internvl,
"llama4": load_llama4,
"mistral3": load_mistral3,
"mllama": load_mllama, "mllama": load_mllama,
"NVLM_D": load_nvlm_d, "NVLM_D": load_nvlm_d,
"phi3_v": load_phi3v, "phi3_v": load_phi3v,
......
# SPDX-License-Identifier: Apache-2.0
"""
This file provides a disaggregated prefilling proxy demo to demonstrate an
example usage of XpYd disaggregated prefilling.
We can launch multiple vllm instances (2 for prefill and 2 for decode), and
launch this proxy demo through:
python3 examples/online_serving/disagg_examples/disagg_proxy_demo.py \
--model $model_name \
--prefill localhost:8100 localhost:8101 \
--decode localhost:8200 localhost:8201 \
--port 8000
Note: This demo will be removed once the PDController implemented in PR 15343
(https://github.com/vllm-project/vllm/pull/15343) supports XpYd.
"""
import argparse
import ipaddress
import itertools
import json
import logging
import os
import sys
from abc import ABC, abstractmethod
from typing import Callable, Optional
import aiohttp
import requests
import uvicorn
from fastapi import (APIRouter, Depends, FastAPI, Header, HTTPException,
Request, status)
from fastapi.responses import JSONResponse, StreamingResponse
# Total time budget for every aiohttp session opened by the proxy:
# 6 * 60 * 60 seconds, i.e. six hours.
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
# getLogger() without a name returns the root logger; INFO and above is shown.
logger = logging.getLogger()
logging.basicConfig(level=logging.INFO)
class SchedulingPolicy(ABC):
    """Abstract strategy that picks the next instance from a cycler."""

    @abstractmethod
    def schedule(self, cycler: itertools.cycle):
        """Return the instance to use next; concrete policies must override.

        The base implementation always raises ``NotImplementedError``.
        """
        message = "Scheduling Proxy is not set."
        raise NotImplementedError(message)
class Proxy:
def __init__(
    self,
    prefill_instances: list[str],
    decode_instances: list[str],
    model: str,
    scheduling_policy: SchedulingPolicy,
    custom_create_completion: Optional[Callable[[Request],
                                                StreamingResponse]] = None,
    custom_create_chat_completion: Optional[Callable[
        [Request], StreamingResponse]] = None,
):
    """Store the instance pools and handlers, then register the routes.

    Args:
        prefill_instances: ``host:port`` strings of the prefill instances.
            The list is stored as given, not copied.
        decode_instances: ``host:port`` strings of the decode instances.
            The list is stored as given, not copied.
        model: Model name kept on ``self.model``.
        scheduling_policy: Policy that ``schedule`` delegates to.
        custom_create_completion: Optional replacement handler for
            ``/v1/completions``.
        custom_create_chat_completion: Optional replacement handler for
            ``/v1/chat/completions``.
    """
    self.model = model
    self.scheduling_policy = scheduling_policy

    # Optional handler overrides; setup_routes falls back to the built-ins.
    self.custom_create_completion = custom_create_completion
    self.custom_create_chat_completion = custom_create_chat_completion

    # Each pool is paired with an endless iterator over its members.
    self.prefill_instances = prefill_instances
    self.prefill_cycler = itertools.cycle(prefill_instances)
    self.decode_instances = decode_instances
    self.decode_cycler = itertools.cycle(decode_instances)

    self.router = APIRouter()
    self.setup_routes()
def setup_routes(self):
    """Register the completion, status and instance-management routes.

    A custom completion or chat-completion handler, when one was supplied,
    takes the place of the built-in handler.
    """
    completion_handler = (self.custom_create_completion
                          or self.create_completion)
    chat_handler = (self.custom_create_chat_completion
                    or self.create_chat_completion)

    # Both inference routes run the JSON content-type check first.
    register_completion = self.router.post(
        "/v1/completions",
        dependencies=[Depends(self.validate_json_request)],
    )
    register_completion(completion_handler)

    register_chat = self.router.post(
        "/v1/chat/completions",
        dependencies=[Depends(self.validate_json_request)],
    )
    register_chat(chat_handler)

    register_status = self.router.get("/status", response_class=JSONResponse)
    register_status(self.get_status)

    # Adding instances is gated by the API-key dependency.
    register_add = self.router.post(
        "/instances/add",
        dependencies=[Depends(self.api_key_authenticate)],
    )
    register_add(self.add_instance_endpoint)
async def validate_json_request(self, raw_request: Request):
    """Reject requests whose media type is not ``application/json``.

    Only the media type is compared, so headers that carry parameters
    (for example ``application/json; charset=utf-8``) are accepted.

    Args:
        raw_request: Incoming request whose ``content-type`` header is read.

    Raises:
        HTTPException: 415 when the media type is not ``application/json``.
    """
    content_type = raw_request.headers.get("content-type", "").lower()
    # Drop any ";param=value" suffix before comparing.
    media_type = content_type.split(";", maxsplit=1)[0].strip()
    if media_type != "application/json":
        raise HTTPException(
            status_code=415,
            detail=
            "Unsupported Media Type: Only 'application/json' is allowed",
        )
def api_key_authenticate(self, x_api_key: str = Header(...)):
    """Check the ``x-api-key`` header against the ``ADMIN_API_KEY`` variable.

    Args:
        x_api_key: Key supplied by the caller in the request header.

    Raises:
        HTTPException: 500 when ``ADMIN_API_KEY`` is unset or empty, 403 when
            the supplied key does not match it.
    """
    import secrets

    expected_api_key = os.environ.get("ADMIN_API_KEY")
    if not expected_api_key:
        logger.error("ADMIN_API_KEY is not set in the environment.")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Server configuration error.",
        )
    # Constant-time comparison; bytes are used because compare_digest only
    # accepts ASCII-only str values.
    keys_match = secrets.compare_digest(x_api_key.encode("utf-8"),
                                        expected_api_key.encode("utf-8"))
    if not keys_match:
        # The submitted key is deliberately kept out of the log.
        logger.warning("Unauthorized access attempt with an invalid API Key.")
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Forbidden: Invalid API Key.",
        )
async def validate_instance(self, instance: str) -> bool:
    """Check that ``instance`` answers ``/v1/models`` and serves ``self.model``.

    Args:
        instance: ``host:port`` of the instance to probe.

    Returns:
        True only when the endpoint replies 200, lists at least one model and
        the first model id equals ``self.model``. Any exception is logged and
        reported as False.
    """
    models_url = f"http://{instance}/v1/models"
    try:
        async with aiohttp.ClientSession(
                timeout=AIOHTTP_TIMEOUT) as client:
            logger.info("Verifying %s ...", instance)
            async with client.get(models_url) as response:
                if response.status != 200:
                    return False

                payload = await response.json()
                if "data" not in payload or len(payload["data"]) <= 0:
                    return False

                # Only the first listed model is compared.
                served_model = payload["data"][0].get("id", "")
                if served_model != self.model:
                    logger.warning("Mismatch model %s : %s != %s",
                                   instance, served_model, self.model)
                    return False

                logger.info("Instance: %s could be added.", instance)
                return True
    except Exception as e:
        # Covers aiohttp.ClientError as well as any other failure.
        logger.error(str(e))
        return False
async def add_instance_endpoint(self, request: Request):
    """Add a prefill or decode instance described by the JSON request body.

    The body must provide ``type`` (``"prefill"`` or ``"decode"``) and
    ``instance`` (``host:port``, where host is ``localhost`` or an IP
    address). The instance is probed with ``validate_instance`` before it is
    appended to its pool and the pool's cycler is rebuilt.

    Args:
        request: Incoming request carrying the JSON body.

    Returns:
        A ``JSONResponse`` confirming the addition.

    Raises:
        HTTPException: 400 for an invalid type, format, address or port, for
            a failed validation or for a duplicate instance; 500 for any
            other error.
    """
    try:
        data = await request.json()
        logger.warning(str(data))

        instance_type = data.get("type")
        instance = data.get("instance")
        if instance_type not in ["prefill", "decode"]:
            raise HTTPException(status_code=400,
                                detail="Invalid instance type.")
        if not instance or ":" not in instance:
            raise HTTPException(status_code=400,
                                detail="Invalid instance format.")

        # Exactly one colon is required; extra colons are a client error,
        # not a server error.
        parts = instance.split(":")
        if len(parts) != 2:
            raise HTTPException(status_code=400,
                                detail="Invalid instance format.")
        host, port_str = parts

        # Only parsing failures are mapped to "Invalid instance address.",
        # so the port-range error below keeps its own message.
        try:
            if host != "localhost":
                ipaddress.ip_address(host)
            port = int(port_str)
        except ValueError as e:
            raise HTTPException(status_code=400,
                                detail="Invalid instance address.") from e
        if not (0 < port < 65536):
            raise HTTPException(status_code=400,
                                detail="Invalid port number.")

        is_valid = await self.validate_instance(instance)
        if not is_valid:
            raise HTTPException(status_code=400,
                                detail="Instance validation failed.")

        if instance_type == "prefill":
            if instance not in self.prefill_instances:
                self.prefill_instances.append(instance)
                self.prefill_cycler = itertools.cycle(
                    self.prefill_instances)
            else:
                raise HTTPException(status_code=400,
                                    detail="Instance already exists.")
        else:
            if instance not in self.decode_instances:
                self.decode_instances.append(instance)
                self.decode_cycler = itertools.cycle(self.decode_instances)
            else:
                raise HTTPException(status_code=400,
                                    detail="Instance already exists.")

        return JSONResponse(content={
            "message":
            f"Added {instance} to {instance_type}_instances."
        })
    except HTTPException as http_exc:
        raise http_exc
    except Exception as e:
        logger.error("Error in add_instance_endpoint: %s", str(e))
        raise HTTPException(status_code=500, detail=str(e)) from e
async def forward_request(self, url, data, use_chunked=True):
    """POST ``data`` as JSON to ``url`` and yield the response body.

    This is an async generator: nothing is sent until it is iterated.

    Args:
        url: Target URL of the upstream instance.
        data: JSON-serialisable request body.
        use_chunked: When true the body is yielded in chunks of up to 1024
            bytes; otherwise it is read fully and yielded once.

    Yields:
        Bytes of the upstream response for 2xx and 4xx statuses.

    Raises:
        HTTPException: With the upstream status for any other status code,
            502 on an ``aiohttp.ClientError``, 500 on any other error.
    """
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }
        try:
            async with session.post(url=url, json=data,
                                    headers=headers) as response:
                if 200 <= response.status < 300 or 400 <= response.status < 500:  # noqa: E501
                    if use_chunked:
                        async for chunk_bytes in response.content.iter_chunked(  # noqa: E501
                                1024):
                            yield chunk_bytes
                    else:
                        content = await response.read()
                        yield content
                else:
                    error_content = await response.text()
                    try:
                        error_content = json.loads(error_content)
                    except json.JSONDecodeError:
                        # Not JSON: report the raw text unchanged.
                        pass
                    logger.error("Request failed with status %s: %s",
                                 response.status, error_content)
                    raise HTTPException(
                        status_code=response.status,
                        detail=
                        f"Request failed with status {response.status}: "
                        f"{error_content}",
                    )
        except HTTPException:
            # Keep the upstream status instead of letting the generic
            # handler below rewrap it as a 500.
            raise
        except aiohttp.ClientError as e:
            logger.error("ClientError occurred: %s", str(e))
            raise HTTPException(
                status_code=502,
                detail=
                "Bad Gateway: Error communicating with upstream server.",
            ) from e
        except Exception as e:
            logger.error("Unexpected error: %s", str(e))
            raise HTTPException(status_code=500, detail=str(e)) from e
def schedule(self, cycler: itertools.cycle) -> str:
    """Pick the next instance by delegating to the configured policy.

    Args:
        cycler: Iterator over the instance pool to choose from.

    Returns:
        Whatever ``self.scheduling_policy.schedule`` returns for ``cycler``.
    """
    policy = self.scheduling_policy
    return policy.schedule(cycler)
async def get_status(self):
    """Report the current prefill and decode pools with their sizes.

    Returns:
        A dict with the node counts and the pool lists themselves (the same
        list objects held by the proxy, not copies).
    """
    prefill_nodes = self.prefill_instances
    decode_nodes = self.decode_instances
    return {
        "prefill_node_count": len(prefill_nodes),
        "decode_node_count": len(decode_nodes),
        "prefill_nodes": prefill_nodes,
        "decode_nodes": decode_nodes,
    }
async def create_completion(self, raw_request: Request):
    """Handle ``/v1/completions`` with a prefill pass and a decode pass.

    The request is first sent to a prefill instance with ``max_tokens``
    forced to 1, then the unmodified request is streamed from a decode
    instance.

    Args:
        raw_request: Incoming request carrying the JSON body.

    Returns:
        A ``StreamingResponse`` over the decode instance's output. On any
        error a ``StreamingResponse`` carrying the error messages is returned
        instead, matching ``create_chat_completion``.
    """
    try:
        request = await raw_request.json()

        # Prefill stage: same request, capped at a single generated token.
        kv_prepare_request = request.copy()
        kv_prepare_request["max_tokens"] = 1

        prefill_instance = self.schedule(self.prefill_cycler)
        try:
            async for _ in self.forward_request(
                    f"http://{prefill_instance}/v1/completions",
                    kv_prepare_request):
                continue
        except HTTPException as http_exc:
            self.remove_instance_endpoint("prefill", prefill_instance)
            raise http_exc

        # Perform kv recv and decoding stage
        decode_instance = self.schedule(self.decode_cycler)
        try:
            # NOTE: forward_request is an async generator, so this call only
            # creates it; upstream errors surface later, while streaming.
            generator = self.forward_request(
                f"http://{decode_instance}/v1/completions", request)
        except HTTPException as http_exc:
            self.remove_instance_endpoint("decode", decode_instance)
            raise http_exc
        response = StreamingResponse(generator)
        return response
    except Exception:
        exc_info = sys.exc_info()
        error_messages = [str(e) for e in exc_info if e]
        print("Error occurred in disagg proxy server")
        print(error_messages)
        # Return the error text to the client instead of an implicit None.
        return StreamingResponse(content=iter(error_messages),
                                 media_type="text/event-stream")
async def create_chat_completion(self, raw_request: Request):
    """Handle ``/v1/chat/completions`` with a prefill pass and a decode pass.

    Args:
        raw_request: Incoming request carrying the JSON body.

    Returns:
        A ``StreamingResponse`` over the decode instance's output, or a
        ``text/event-stream`` response carrying the error messages when
        anything fails.
    """
    try:
        payload = await raw_request.json()

        # The prefill copy differs only in being capped at one token.
        prefill_payload = payload.copy()
        prefill_payload["max_tokens"] = 1

        # Prefill stage: drain the response, its content is not used.
        prefill_instance = self.schedule(self.prefill_cycler)
        prefill_url = f"http://{prefill_instance}/v1/chat/completions"
        try:
            async for _ in self.forward_request(prefill_url, prefill_payload):
                pass
        except HTTPException as http_exc:
            self.remove_instance_endpoint("prefill", prefill_instance)
            raise http_exc

        # Decode stage: stream the answer to the original request.
        decode_instance = self.schedule(self.decode_cycler)
        decode_url = f"http://{decode_instance}/v1/chat/completions"
        try:
            generator = self.forward_request(decode_url, payload)
        except HTTPException as http_exc:
            self.remove_instance_endpoint("decode", decode_instance)
            raise http_exc

        return StreamingResponse(content=generator)
    except Exception:
        error_messages = [str(part) for part in sys.exc_info() if part]
        print("Error occurred in disagg proxy server")
        print(error_messages)
        return StreamingResponse(content=iter(error_messages),
                                 media_type="text/event-stream")
def remove_instance_endpoint(self, instance_type, instance):
    """Drop ``instance`` from its pool and rebuild that pool's cycler.

    Nothing happens when ``instance_type`` is neither ``"decode"`` nor
    ``"prefill"``, or when the instance is not in the matching pool.

    Args:
        instance_type: ``"decode"`` or ``"prefill"``.
        instance: ``host:port`` string of the instance to remove.
    """
    if instance_type == "decode" and instance in self.decode_instances:
        self.decode_instances.remove(instance)
        self.decode_cycler = itertools.cycle(self.decode_instances)
    # The prefill branch must consult and rebuild from the prefill pool.
    if instance_type == "prefill" and instance in self.prefill_instances:
        self.prefill_instances.remove(instance)
        self.prefill_cycler = itertools.cycle(self.prefill_instances)
class RoundRobinSchedulingPolicy(SchedulingPolicy):
    """Policy that hands out instances in the order the cycler yields them."""

    def __init__(self):
        super().__init__()

    def schedule(self, cycler: itertools.cycle) -> str:
        """Advance ``cycler`` by one step and return the item it yields."""
        chosen = next(cycler)
        return chosen
class ProxyServer:
def __init__(
    self,
    args: argparse.Namespace,
    scheduling_policy: Optional[SchedulingPolicy] = None,
    create_completion: Optional[Callable[[Request],
                                         StreamingResponse]] = None,
    create_chat_completion: Optional[Callable[[Request],
                                              StreamingResponse]] = None,
):
    """Validate the parsed arguments and build the underlying ``Proxy``.

    Args:
        args: Parsed command-line arguments; ``prefill``, ``decode``,
            ``model`` and ``port`` are read.
        scheduling_policy: Policy for picking instances; a
            ``RoundRobinSchedulingPolicy`` is created when omitted.
        create_completion: Optional custom ``/v1/completions`` handler.
        create_chat_completion: Optional custom ``/v1/chat/completions``
            handler.
    """
    self.validate_parsed_serve_args(args)
    self.port = args.port

    prefill_nodes = args.prefill if args.prefill is not None else []
    decode_nodes = args.decode if args.decode is not None else []
    if scheduling_policy is None:
        scheduling_policy = RoundRobinSchedulingPolicy()

    self.proxy_instance = Proxy(
        prefill_instances=prefill_nodes,
        decode_instances=decode_nodes,
        model=args.model,
        scheduling_policy=scheduling_policy,
        custom_create_completion=create_completion,
        custom_create_chat_completion=create_chat_completion,
    )
def validate_parsed_serve_args(self, args: argparse.Namespace):
    """Check that both pools are given, well-formed and serve ``args.model``.

    Args:
        args: Parsed command-line arguments; ``prefill``, ``decode`` and
            ``model`` are read.

    Raises:
        ValueError: When the prefill or decode list is missing or empty, or
            when one of the delegated checks fails.
    """
    if not args.prefill:
        raise ValueError("Please specify at least one prefill node.")
    if not args.decode:
        raise ValueError("Please specify at least one decode node.")

    # Address checks run for both pools before any model check is made.
    node_groups = (args.prefill, args.decode)
    for group in node_groups:
        self.validate_instances(group)
    for group in node_groups:
        self.verify_model_config(group, args.model)
def validate_instances(self, instances: list):
    """Check that every entry is ``host:port`` with a usable host and port.

    The host must be ``localhost`` or an IP address, and the port an integer
    strictly between 0 and 65536.

    Args:
        instances: Instance strings to check.

    Raises:
        ValueError: On the first entry that is malformed.
    """
    for entry in instances:
        parts = entry.split(":")
        if len(parts) != 2:
            raise ValueError(f"Invalid instance format: {entry}")
        host, port_text = parts
        try:
            if host != "localhost":
                ipaddress.ip_address(host)
            port = int(port_text)
            if port <= 0 or port >= 65536:
                raise ValueError(
                    f"Invalid port number in instance: {entry}")
        except Exception as e:
            # Every failure, including the port-range error raised just
            # above, is reported under one common prefix.
            raise ValueError(
                f"Invalid instance {entry}: {str(e)}") from e
def verify_model_config(self,
                        instances: list,
                        model: str,
                        timeout: float = 10.0) -> None:
    """Check that every instance serves ``model``.

    Each instance's ``/v1/models`` endpoint is queried and the last path
    component of its first model id is compared with that of ``model``.

    Args:
        instances: ``host:port`` strings to query.
        model: Expected model name.
        timeout: Seconds to wait for each instance before giving up, so an
            unresponsive instance cannot block start-up indefinitely.

    Raises:
        ValueError: When an instance cannot be reached, does not reply 200,
            returns a payload without a model id, or serves another model.
    """
    model_suffix = model.split("/")[-1]
    for instance in instances:
        try:
            response = requests.get(f"http://{instance}/v1/models",
                                    timeout=timeout)
            if response.status_code == 200:
                model_cur = response.json()["data"][0]["id"]
                model_cur_suffix = model_cur.split("/")[-1]
                if model_cur_suffix != model_suffix:
                    raise ValueError(
                        f"{instance} serves a different model: "
                        f"{model_cur} != {model}")
            else:
                raise ValueError(f"Cannot get model id from {instance}!")
        except requests.RequestException as e:
            raise ValueError(
                f"Error communicating with {instance}: {str(e)}") from e
        except (KeyError, IndexError, TypeError) as e:
            # A 200 reply whose body lacks data[0]["id"].
            raise ValueError(
                f"Cannot get model id from {instance}!") from e
def run_server(self):
    """Serve the proxy router with uvicorn on ``self.port`` until it stops."""
    app = FastAPI()
    app.include_router(self.proxy_instance.router)
    uvicorn.Server(uvicorn.Config(app, port=self.port, loop="uvloop")).run()
if __name__ == "__main__":
    # Todo: allow more config
    # The text is a description; passed positionally it would become `prog`
    # and replace the program name in the usage line.
    parser = argparse.ArgumentParser(
        description="vLLM disaggregated proxy server.")
    parser.add_argument("--model",
                        "-m",
                        type=str,
                        required=True,
                        help="Model name")

    parser.add_argument(
        "--prefill",
        "-p",
        type=str,
        nargs="+",
        help="List of prefill node URLs (host:port)",
    )

    parser.add_argument(
        "--decode",
        "-d",
        type=str,
        nargs="+",
        help="List of decode node URLs (host:port)",
    )

    parser.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Server port number",
    )
    args = parser.parse_args()
    proxy_server = ProxyServer(args=args)
    proxy_server.run_server()
# SPDX-License-Identifier: Apache-2.0
"""
To run this example, start the vLLM server with the `outlines`
guided-decoding backend:
```bash
VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
--guided-decoding-backend outlines
```
This example demonstrates how to generate chat completions
using the OpenAI Python client library.
"""
from openai import OpenAI
# Point the OpenAI client at vLLM's API server rather than the OpenAI service.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

# Without an explicit api_key the client would fall back to
# os.environ.get("OPENAI_API_KEY").
client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

# Use the first model id the server reports.
models = client.models.list()
model = models.data[0].id
# Two function tools offered to the model: current weather and a forecast.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description": ("The city to find the weather for, "
                                        "e.g. 'San Francisco'"),
                    },
                    "state": {
                        "type": "string",
                        "description": ("the two-letter abbreviation for the "
                                        "state that the city is in, e.g. 'CA' "
                                        "which would mean 'California'"),
                    },
                    "unit": {
                        "type": "string",
                        "description": "The unit to fetch the temperature in",
                        "enum": ["celsius", "fahrenheit"],
                    },
                },
                "required": ["city", "state", "unit"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_forecast",
            "description": "Get the weather forecast for a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description": ("The city to get the forecast for, "
                                        "e.g. 'New York'"),
                    },
                    "state": {
                        "type": "string",
                        "description": ("The two-letter abbreviation for the "
                                        "state, e.g. 'NY'"),
                    },
                    "days": {
                        "type": "integer",
                        "description": ("Number of days to get the forecast "
                                        "for (1-7)"),
                    },
                    "unit": {
                        "type": "string",
                        "description": "The unit to fetch the temperature in",
                        "enum": ["celsius", "fahrenheit"],
                    },
                },
                "required": ["city", "state", "days", "unit"],
            },
        },
    },
]
# Conversation sent to the model; the last user turn needs both tools.
messages = [
    {
        "role": "user",
        "content": "Hi! How are you doing today?"
    },
    {
        "role": "assistant",
        "content": "I'm doing well! How can I help you?"
    },
    {
        "role": "user",
        # Adjacent literals are joined by the parser, so source indentation
        # can never leak into the prompt the way it does with a backslash
        # continuation inside a single string literal.
        "content": ("Can you tell me what the current weather is in Dallas "
                    "and the forecast for the next 5 days, in fahrenheit?"),
    },
]
# Streaming request: print the tool-call deltas as they arrive.
chat_completion = client.chat.completions.create(
    messages=messages,
    model=model,
    tools=tools,
    tool_choice="required",
    stream=True,
)

for chunk in chat_completion:
    # Skip chunks that carry no choices or no tool-call delta.
    if not chunk.choices:
        continue
    if chunk.choices[0].delta.tool_calls:
        print(chunk.choices[0].delta.tool_calls)

# Non-streaming request: the tool calls come back in a single message.
chat_completion = client.chat.completions.create(
    messages=messages,
    model=model,
    tools=tools,
    tool_choice="required",
)

print(chat_completion.choices[0].message.tool_calls)
...@@ -31,6 +31,7 @@ model = models.data[0].id ...@@ -31,6 +31,7 @@ model = models.data[0].id
# Round 1 # Round 1
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
response = client.chat.completions.create(model=model, messages=messages) response = client.chat.completions.create(model=model, messages=messages)
reasoning_content = response.choices[0].message.reasoning_content reasoning_content = response.choices[0].message.reasoning_content
......
...@@ -38,6 +38,7 @@ models = client.models.list() ...@@ -38,6 +38,7 @@ models = client.models.list()
model = models.data[0].id model = models.data[0].id
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
stream = client.chat.completions.create(model=model, stream = client.chat.completions.create(model=model,
messages=messages, messages=messages,
stream=True) stream=True)
......
{%- if messages %}
{%- if system_message or tools %}
<|system|>
{%- if system_message %}
{{ system_message }}
{%- endif %}
In addition to plain text responses, you can chose to call one or more of the provided functions.
Use the following rule to decide when to call a function:
* if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so
* if you need external information that can be obtained by calling one or more of the provided functions, generate a function calls
If you decide to call functions:
* prefix function calls with functools marker (no closing marker required)
* all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...]
* follow the provided JSON schema. Do not hallucinate arguments or values. Do to blindly copy values from the provided samples
* respect the argument type formatting. E.g., if the type if number and format is float, write value 7 as 7.0
* make sure you pick the right functions that match the user intent
{%- if tools %}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{%- endif %}<|end|>
{%- endif %}
{%- for message in messages %}
{%- if message.role != "system" %}
<|{{ message.role }}|>
{%- if message.content and message.role == "tools" %}
{"result": {{ message.content }}}
{%- elif message.content %}
{{ message.content }}
{%- elif message.tool_calls %}
{%- for call in message.tool_calls %}
{"name": "{{ call.function.name }}", "arguments": {{ call.function.arguments }}}
{%- if not loop.last %},{% endif %}
{%- endfor %}
{%- endif %}<|end|>
{%- endif %}
{%- endfor %}<|assistant|>
{%- else %}
{%- if system_message %}
<|system|>
{{ system_message }}<|end|>
{%- endif %}
{%- if prompt %}
<|user|>
{{ prompt }}<|end|>
{%- endif %}<|assistant|>
{%- endif %}
{{ response }}
{%- if response %}<|user|>{% endif %}
\ No newline at end of file
#!/bin/bash #!/bin/bash
echo "vLLM linting system has been moved from format.sh to pre-commit hook." echo "vLLM linting system has been moved from format.sh to pre-commit hooks."
echo "Please run 'pip install -r requirements/lint.txt', followed by" echo "Please run 'pip install -r requirements/lint.txt', followed by"
echo "'pre-commit install --hook-type pre-commit --hook-type commit-msg' to install the pre-commit hook." echo "'pre-commit install' to install the pre-commit hooks."
echo "Then linters will run automatically before each commit." echo "Then linters will run automatically before each commit."
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
# Deprecation notice: this script only prints where to find the new
# python-only build instructions, together with the short form of them.
msg = """Old style python only build (without compilation) is deprecated, please check https://docs.vllm.ai/en/latest/getting_started/installation.html#python-only-build-without-compilation for the new way to do python only build (without compilation).
TL;DR:
VLLM_USE_PRECOMPILED=1 pip install -e .
or
export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
pip install -e .
""" # noqa
print(msg)
cachetools cachetools
psutil psutil
sentencepiece # Required for LLaMA tokenizer. sentencepiece # Required for LLaMA tokenizer.
numpy < 2.0.0 numpy
requests >= 2.26.0 requests >= 2.26.0
tqdm tqdm
blake3 blake3
py-cpuinfo py-cpuinfo
transformers >= 4.49.0 # Required for Bamba model and Transformers backend. transformers >= 4.51.0
huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads.
tokenizers >= 0.19.1 # Required for Llama 3. tokenizers >= 0.19.1 # Required for Llama 3.
protobuf # Required by LlamaTokenizer. protobuf # Required by LlamaTokenizer.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
...@@ -21,7 +22,7 @@ lm-format-enforcer >= 0.10.11, < 0.11 ...@@ -21,7 +22,7 @@ lm-format-enforcer >= 0.10.11, < 0.11
llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
outlines == 0.1.11 outlines == 0.1.11
lark == 1.2.2 lark == 1.2.2
xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64" xgrammar == 0.1.17; platform_machine == "x86_64" or platform_machine == "aarch64"
typing_extensions >= 4.10 typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs partial-json-parser # used for parsing partial JSON outputs
...@@ -30,6 +31,7 @@ msgspec ...@@ -30,6 +31,7 @@ msgspec
gguf == 0.10.0 gguf == 0.10.0
importlib_metadata importlib_metadata
mistral_common[opencv] >= 1.5.4 mistral_common[opencv] >= 1.5.4
opencv-python-headless >= 4.11.0 # required for video IO
pyyaml pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
......
...@@ -4,14 +4,14 @@ ...@@ -4,14 +4,14 @@
# Dependencies for CPUs # Dependencies for CPUs
torch==2.6.0+cpu; platform_machine == "x86_64" torch==2.6.0+cpu; platform_machine == "x86_64"
torch==2.6.0; platform_system == "Darwin" torch==2.6.0; platform_system == "Darwin"
torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" torch==2.6.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
torch==2.7.0.dev20250304; platform_machine == "s390x" torch==2.7.0.dev20250304; platform_machine == "s390x"
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch # required for the image processor of minicpm-o-2_6, this must be updated alongside torch
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
torchaudio==2.5.1; platform_machine == "ppc64le" torchaudio==2.6.0; platform_machine == "ppc64le"
# required for the image processor of phi3v, this must be updated alongside torch # required for the image processor of phi3v, this must be updated alongside torch
torchvision; platform_machine != "ppc64le" and platform_machine != "s390x" torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
torchvision==0.20.1; platform_machine == "ppc64le" torchvision==0.21.0; platform_machine == "ppc64le"
datasets # for benchmark scripts datasets # for benchmark scripts
# Common dependencies # Common dependencies
-r common.txt -r common.txt
numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61; python_version > '3.9'
# Dependencies for NVIDIA GPUs # Dependencies for NVIDIA GPUs
ray[cgraph]>=2.43.0 # Ray Compiled Graph, required for pipeline parallelism in V1. ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1.
torch==2.6.0 torch==2.6.0
torchaudio==2.6.0 torchaudio==2.6.0
# These must be updated alongside torch # These must be updated alongside torch
......
...@@ -6,7 +6,7 @@ torch==2.6.0 ...@@ -6,7 +6,7 @@ torch==2.6.0
torchvision==0.21.0 torchvision==0.21.0
torchaudio==2.6.0 torchaudio==2.6.0
cmake>=3.26 cmake>=3.26,<4
packaging packaging
setuptools>=61 setuptools>=61
setuptools-scm>=8 setuptools-scm>=8
......
# Common dependencies # Common dependencies
-r common.txt -r common.txt
numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61; python_version > '3.9'
# Dependencies for hcus # Dependencies for hcus
awscli awscli
......
...@@ -9,7 +9,6 @@ pytest-shard ...@@ -9,7 +9,6 @@ pytest-shard
# testing utils # testing utils
awscli awscli
backoff # required for phi4mm test backoff # required for phi4mm test
decord # required for video tests
einops # required for MPT, qwen-vl and Mamba einops # required for MPT, qwen-vl and Mamba
httpx httpx
librosa # required for audio tests librosa # required for audio tests
...@@ -17,7 +16,7 @@ vector_quantize_pytorch # required for minicpmo_26 test ...@@ -17,7 +16,7 @@ vector_quantize_pytorch # required for minicpmo_26 test
vocos # required for minicpmo_26 test vocos # required for minicpmo_26 test
peft peft
pqdm pqdm
ray[cgraph]>=2.43.0 # Ray Compiled Graph, required by pipeline parallelism tests ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required by pipeline parallelism tests
sentence-transformers # required for embedding tests sentence-transformers # required for embedding tests
soundfile # required for audio tests soundfile # required for audio tests
jiwer # required for audio tests jiwer # required for audio tests
...@@ -28,9 +27,11 @@ torchvision==0.21.0 ...@@ -28,9 +27,11 @@ torchvision==0.21.0
transformers_stream_generator # required for qwen-vl test transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.5.4 # required for pixtral test mistral_common[opencv] >= 1.5.4 # required for pixtral test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.4 # required for model evaluation test lm-eval[api]==0.4.8 # required for model evaluation test
transformers==4.48.2 transformers==4.51.0
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
# quantization # quantization
bitsandbytes>=0.45.3 bitsandbytes>=0.45.3
buildkite-test-collector==0.1.9 buildkite-test-collector==0.1.9
...@@ -38,7 +39,9 @@ buildkite-test-collector==0.1.9 ...@@ -38,7 +39,9 @@ buildkite-test-collector==0.1.9
genai_perf==0.0.8 genai_perf==0.0.8
tritonclient==2.51.0 tritonclient==2.51.0
numpy < 2.0.0 numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61; python_version > '3.9'
numpy
runai-model-streamer==0.11.0 runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0 runai-model-streamer-s3==0.11.0
fastsafetensors>=0.1.10 fastsafetensors>=0.1.10
...@@ -93,8 +93,6 @@ datasets==3.0.2 ...@@ -93,8 +93,6 @@ datasets==3.0.2
# lm-eval # lm-eval
decorator==5.1.1 decorator==5.1.1
# via librosa # via librosa
decord==0.6.0
# via -r requirements/test.in
dill==0.3.8 dill==0.3.8
# via # via
# datasets # datasets
...@@ -154,14 +152,17 @@ genson==1.3.0 ...@@ -154,14 +152,17 @@ genson==1.3.0
# via datamodel-code-generator # via datamodel-code-generator
h11==0.14.0 h11==0.14.0
# via httpcore # via httpcore
hf-xet==0.1.4
# via huggingface-hub
hiredis==3.0.0 hiredis==3.0.0
# via tensorizer # via tensorizer
httpcore==1.0.6 httpcore==1.0.6
# via httpx # via httpx
httpx==0.27.2 httpx==0.27.2
# via -r requirements/test.in # via -r requirements/test.in
huggingface-hub==0.26.2 huggingface-hub==0.30.1
# via # via
# -r requirements/test.in
# accelerate # accelerate
# datasets # datasets
# evaluate # evaluate
...@@ -219,9 +220,9 @@ libnacl==2.1.0 ...@@ -219,9 +220,9 @@ libnacl==2.1.0
# via tensorizer # via tensorizer
librosa==0.10.2.post1 librosa==0.10.2.post1
# via -r requirements/test.in # via -r requirements/test.in
llvmlite==0.43.0 llvmlite==0.44.0
# via numba # via numba
lm-eval==0.4.4 lm-eval==0.4.8
# via -r requirements/test.in # via -r requirements/test.in
lxml==5.3.0 lxml==5.3.0
# via sacrebleu # via sacrebleu
...@@ -262,8 +263,10 @@ networkx==3.2.1 ...@@ -262,8 +263,10 @@ networkx==3.2.1
# via torch # via torch
nltk==3.9.1 nltk==3.9.1
# via rouge-score # via rouge-score
numba==0.60.0 numba==0.61.0
# via librosa # via
# -r requirements/test.in
# librosa
numexpr==2.10.1 numexpr==2.10.1
# via lm-eval # via lm-eval
numpy==1.26.4 numpy==1.26.4
...@@ -274,7 +277,6 @@ numpy==1.26.4 ...@@ -274,7 +277,6 @@ numpy==1.26.4
# contourpy # contourpy
# cupy-cuda12x # cupy-cuda12x
# datasets # datasets
# decord
# einx # einx
# encodec # encodec
# evaluate # evaluate
...@@ -335,8 +337,10 @@ nvidia-nvjitlink-cu12==12.4.127 ...@@ -335,8 +337,10 @@ nvidia-nvjitlink-cu12==12.4.127
# torch # torch
nvidia-nvtx-cu12==12.4.127 nvidia-nvtx-cu12==12.4.127
# via torch # via torch
opencv-python-headless==4.10.0.84 opencv-python-headless==4.11.0.86
# via mistral-common # via
# -r requirements/test.in
# mistral-common
packaging==24.1 packaging==24.1
# via # via
# accelerate # accelerate
...@@ -641,7 +645,7 @@ tqdm==4.66.6 ...@@ -641,7 +645,7 @@ tqdm==4.66.6
# transformers # transformers
tqdm-multiprocess==0.0.11 tqdm-multiprocess==0.0.11
# via lm-eval # via lm-eval
transformers==4.48.2 transformers==4.51.0
# via # via
# -r requirements/test.in # -r requirements/test.in
# genai-perf # genai-perf
......
...@@ -17,9 +17,10 @@ ray[data] ...@@ -17,9 +17,10 @@ ray[data]
--find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250403-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250403-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250403-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250403-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250403-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250403-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
...@@ -208,6 +208,9 @@ class cmake_build_ext(build_ext): ...@@ -208,6 +208,9 @@ class cmake_build_ext(build_ext):
else: else:
# Default build tool to whatever cmake picks. # Default build tool to whatever cmake picks.
build_tool = [] build_tool = []
# Make sure we use the nvcc from CUDA_HOME
if _is_cuda():
cmake_args += [f'-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc']
subprocess.check_call( subprocess.check_call(
['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args], ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args],
cwd=self.build_temp) cwd=self.build_temp)
...@@ -664,9 +667,8 @@ def get_requirements() -> list[str]: ...@@ -664,9 +667,8 @@ def get_requirements() -> list[str]:
for line in requirements: for line in requirements:
if line.startswith("-r "): if line.startswith("-r "):
resolved_requirements += _read_requirements(line.split()[1]) resolved_requirements += _read_requirements(line.split()[1])
elif line.startswith("--"): elif not line.startswith("--") and not line.startswith(
continue "#") and line.strip() != "":
else:
resolved_requirements.append(line) resolved_requirements.append(line)
return resolved_requirements return resolved_requirements
...@@ -712,11 +714,10 @@ if _is_cuda() or _is_hip(): ...@@ -712,11 +714,10 @@ if _is_cuda() or _is_hip():
if _is_cuda(): if _is_cuda():
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C")) ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.0"): if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
# FA3 requires CUDA 12.0 or later # FA3 requires CUDA 12.3 or later
ext_modules.append( ext_modules.append(
CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C")) CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
# Optional since this doesn't get built (produce an .so file) when # Optional since this doesn't get built (produce an .so file) when
# not targeting a hopper system # not targeting a hopper system
ext_modules.append( ext_modules.append(
...@@ -758,7 +759,7 @@ setup( ...@@ -758,7 +759,7 @@ setup(
"fastsafetensors": ["fastsafetensors >= 0.1.10"], "fastsafetensors": ["fastsafetensors >= 0.1.10"],
"runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"], "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
"audio": ["librosa", "soundfile"], # Required for audio processing "audio": ["librosa", "soundfile"], # Required for audio processing
"video": ["decord"] # Required for video processing "video": [] # Kept for backwards compatibility
}, },
cmdclass=cmdclass, cmdclass=cmdclass,
package_data=package_data, package_data=package_data,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os import os
import pytest
from ..utils import compare_two_settings, models_path_prefix from ..utils import compare_two_settings, models_path_prefix
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Set the ``VLLM_USE_V1`` environment variable to ``'0'`` for each test.

    Declared ``autouse`` with function scope, so pytest applies it to every
    test in this module without the test having to request it. The variable
    is set through ``monkeypatch``.
    """
    env_name = 'VLLM_USE_V1'
    env_value = '0'
    monkeypatch.setenv(env_name, env_value)
def test_cpu_offload(): def test_cpu_offload():
compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), [], compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), [],
["--cpu-offload-gb", "1"]) ["--cpu-offload-gb", "1"])
...@@ -155,6 +155,24 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool): ...@@ -155,6 +155,24 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
llm.wake_up() llm.wake_up()
output2 = llm.generate(prompt, sampling_params) output2 = llm.generate(prompt, sampling_params)
# cmp output # cmp output
assert output[0].outputs[0].text == output2[0].outputs[0].text assert output[0].outputs[0].text == output2[0].outputs[0].text
llm.sleep(level=1)
llm.wake_up(tags=["weights"])
free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
# should just reallocate memory for weights (1B model, ~2GiB weights)
if use_v1:
assert used_bytes < 10 * GiB_bytes
else:
assert used_bytes < 6 * GiB_bytes
# now allocate kv cache memory
llm.wake_up(tags=["kv_cache"])
output3 = llm.generate(prompt, sampling_params)
# cmp output
assert output[0].outputs[0].text == output3[0].outputs[0].text
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment