"deploy/vscode:/vscode.git/clone" did not exist on "5d90e530bc4ff683a779b2bc0b9237cfcc2504fd"
Unverified commit a1333a8d, authored by Alec, committed by GitHub
Browse files

fix: update vLLM to 0.13.0 with API compatibility fixes (#5222)


Signed-off-by: Vasilis Vagias <vvagias@nvidia.com>
Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Vasilis Vagias <vvagias@nvidia.com>
parent 110cd568
......@@ -484,7 +484,8 @@ def overwrite_args(config):
ensure_side_channel_host()
defaults = {
"task": "generate",
# vLLM 0.13+ renamed 'task' to 'runner'
"runner": "generate",
# As of vLLM >=0.10.0 the engine unconditionally calls
# `sampling_params.update_from_tokenizer(...)`, so we can no longer
# skip tokenizer initialisation. Setting this to **False** avoids
......
......@@ -85,7 +85,6 @@ class ProcessorHandler(ProcessMixIn):
(
request,
conversation,
prompt,
engine_prompt,
sampling_params,
) = await self._parse_raw_request(raw_request)
......
......@@ -27,7 +27,6 @@ from vllm.entrypoints.openai.protocol import (
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_engine import RequestPrompt
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.inputs.data import TokensPrompt
from vllm.sampling_params import SamplingParams
......@@ -100,7 +99,6 @@ class ProcessMixIn(ProcessMixInRequired):
return (
request,
preprocess_result.conversation,
preprocess_result.request_prompt,
preprocess_result.engine_prompt,
sampling_params,
)
......@@ -121,11 +119,9 @@ class PreprocessResult:
def __init__(
self,
conversation: Optional[ConversationMessage],
request_prompt: RequestPrompt,
engine_prompt: TokensPrompt,
):
self.conversation = conversation
self.request_prompt = request_prompt
self.engine_prompt = engine_prompt
......@@ -168,7 +164,6 @@ class ChatProcessor:
(
conversation,
request_prompts,
engine_prompts,
) = await self.openai_serving._preprocess_chat(
request,
......@@ -185,7 +180,12 @@ class ChatProcessor:
add_special_tokens=request.add_special_tokens,
)
return PreprocessResult(conversation[0], request_prompts[0], engine_prompts[0])
# In newer vLLM, _preprocess_chat returns (conversation, engine_prompts) - 2 values
if not conversation or not engine_prompts:
raise ValueError(
"Preprocessing returned empty conversation or engine_prompts"
)
return PreprocessResult(conversation[0], engine_prompts[0])
async def stream_response(
self,
......@@ -305,17 +305,20 @@ class CompletionsProcessor:
async def preprocess(self, raw_request: CompletionRequest) -> PreprocessResult:
request = self.parse_raw_request(raw_request)
(
request_prompts,
engine_prompts,
) = await self.openai_serving._preprocess_completion(
request,
self.tokenizer,
input_or_inputs=request.prompt,
add_special_tokens=request.add_special_tokens,
# In newer vLLM, _preprocess_completion was removed
# Use the renderer approach instead
renderer = self.openai_serving._get_renderer(self.tokenizer)
config = self.openai_serving._build_render_config(request)
engine_prompts = await renderer.render_prompt_and_embeds(
prompt_or_prompts=request.prompt,
prompt_embeds=getattr(request, "prompt_embeds", None),
config=config,
)
return PreprocessResult(None, request_prompts[0], engine_prompts[0])
# engine_prompts is now a list of TokensPrompt
if not engine_prompts:
raise ValueError("Renderer returned empty engine_prompts")
return PreprocessResult(None, engine_prompts[0])
async def stream_response(
self,
......
......@@ -74,13 +74,13 @@ ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04"
ARG CUDA_VERSION="12.9"
# Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_REF="v0.12.0"
ARG VLLM_REF="v0.13.0"
# FlashInfer Ref used to install flashinfer-cubin and flashinfer-jit-cache
ARG FLASHINF_REF="v0.5.3"
# If left blank, then we will fallback to vLLM defaults
ARG DEEPGEMM_REF=""
ARG LMCACHE_REF="0.3.10"
ARG LMCACHE_REF="0.3.12"
##################################
########## Base Image ############
......
......@@ -11,7 +11,7 @@
set -euo pipefail
VLLM_VER="0.12.0"
VLLM_VER="0.13.0"
VLLM_REF="v${VLLM_VER}"
# Basic Configurations
......@@ -24,8 +24,7 @@ TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- TODO: check if we need to ad
DEEPGEMM_REF=""
CUDA_VERSION="12.9"
FLASHINF_REF="v0.5.3"
# LMCache version - 0.3.9+ required for vLLM 0.11.2 compatibility
LMCACHE_REF="0.3.10"
LMCACHE_REF="0.3.12"
while [[ $# -gt 0 ]]; do
case $1 in
......
......@@ -64,7 +64,7 @@ The following table shows the dependency versions included with each Dynamo rele
| :------------- | :------------- | :---------------------- | :--------- | :--------------- | :--------- |
| SGLang | 0.5.6.post2 | 0.5.6.post2 | 0.5.3.post4| 0.5.3.post4 | 0.5.3.post4|
| TensorRT-LLM | 1.2.0rc6 | 1.2.0rc6 | 1.2.0rc3 | 1.2.0rc3 | 1.2.0rc2 |
| vLLM | 0.12.0 | 0.12.0 | 0.11.0 | 0.11.0 | 0.11.0 |
| vLLM | 0.13.0 | 0.12.0 | 0.11.0 | 0.11.0 | 0.11.0 |
| NIXL | 0.8.0 | 0.8.0 | 0.8.0 | 0.8.0 | 0.8.0 |
> [!Note]
......@@ -77,7 +77,7 @@ The following table shows the dependency versions included with each Dynamo rele
### CUDA Support by Framework
| **Dynamo Version** | **SGLang** | **TensorRT-LLM** | **vLLM** |
| :------------------- | :-----------------------| :-----------------------| :-----------------------|
| **Dynamo 0.7.1** | CUDA 12.8 | CUDA 13.0 | CUDA 12.8 |
| **Dynamo 0.7.1** | CUDA 12.8 | CUDA 13.0 | CUDA 12.9 |
## Cloud Service Provider Compatibility
......
......@@ -106,9 +106,9 @@ class ServiceAPI:
)
# Use vLLM's preprocessing to convert chat to prompt
# In newer vLLM, _preprocess_chat returns (conversation, engine_prompts) - 2 values
(
conversation,
request_prompts,
engine_prompts,
) = await self.openai_serving_chat._preprocess_chat(
request,
......
......@@ -134,7 +134,6 @@ class Processor(ProcessMixIn):
(
request,
conversation,
prompt,
engine_prompt,
sampling_params,
) = await self._parse_raw_request(raw_request)
......
......@@ -27,7 +27,6 @@ from vllm.entrypoints.openai.protocol import (
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_engine import RequestPrompt
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.inputs.data import TokensPrompt
from vllm.sampling_params import SamplingParams
......@@ -100,7 +99,6 @@ class ProcessMixIn(ProcessMixInRequired):
return (
request,
preprocess_result.conversation,
preprocess_result.request_prompt,
preprocess_result.engine_prompt,
sampling_params,
)
......@@ -121,11 +119,9 @@ class PreprocessResult:
def __init__(
self,
conversation: Optional[ConversationMessage],
request_prompt: RequestPrompt,
engine_prompt: TokensPrompt,
):
self.conversation = conversation
self.request_prompt = request_prompt
self.engine_prompt = engine_prompt
......@@ -168,7 +164,6 @@ class ChatProcessor:
(
conversation,
request_prompts,
engine_prompts,
) = await self.openai_serving._preprocess_chat(
request,
......@@ -185,7 +180,12 @@ class ChatProcessor:
add_special_tokens=request.add_special_tokens,
)
return PreprocessResult(conversation[0], request_prompts[0], engine_prompts[0])
# In newer vLLM, _preprocess_chat returns (conversation, engine_prompts) - 2 values
if not conversation or not engine_prompts:
raise ValueError(
"Preprocessing returned empty conversation or engine_prompts"
)
return PreprocessResult(conversation[0], engine_prompts[0])
async def stream_response(
self,
......@@ -305,17 +305,20 @@ class CompletionsProcessor:
async def preprocess(self, raw_request: CompletionRequest) -> PreprocessResult:
request = self.parse_raw_request(raw_request)
(
request_prompts,
engine_prompts,
) = await self.openai_serving._preprocess_completion(
request,
self.tokenizer,
input_or_inputs=request.prompt,
add_special_tokens=request.add_special_tokens,
# In newer vLLM, _preprocess_completion was removed
# Use the renderer approach instead
renderer = self.openai_serving._get_renderer(self.tokenizer)
config = self.openai_serving._build_render_config(request)
engine_prompts = await renderer.render_prompt_and_embeds(
prompt_or_prompts=request.prompt,
prompt_embeds=getattr(request, "prompt_embeds", None),
config=config,
)
return PreprocessResult(None, request_prompts[0], engine_prompts[0])
# engine_prompts is now a list of TokensPrompt
if not engine_prompts:
raise ValueError("Renderer returned empty engine_prompts")
return PreprocessResult(None, engine_prompts[0])
async def stream_response(
self,
......
......@@ -56,7 +56,7 @@ trtllm =[
vllm = [
"uvloop",
"nixl[cu12]<=0.8.0",
"vllm[flashinfer,runai]==0.12.0",
"vllm[flashinfer,runai]==0.13.0",
]
sglang = [
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment