Unverified Commit a1333a8d authored by Alec, committed by GitHub
Browse files

fix: update vLLM to 0.13.0 with API compatibility fixes (#5222)


Signed-off-by: Vasilis Vagias <vvagias@nvidia.com>
Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Vasilis Vagias <vvagias@nvidia.com>
parent 110cd568
...@@ -484,7 +484,8 @@ def overwrite_args(config): ...@@ -484,7 +484,8 @@ def overwrite_args(config):
ensure_side_channel_host() ensure_side_channel_host()
defaults = { defaults = {
"task": "generate", # vLLM 0.13+ renamed 'task' to 'runner'
"runner": "generate",
# As of vLLM >=0.10.0 the engine unconditionally calls # As of vLLM >=0.10.0 the engine unconditionally calls
# `sampling_params.update_from_tokenizer(...)`, so we can no longer # `sampling_params.update_from_tokenizer(...)`, so we can no longer
# skip tokenizer initialisation. Setting this to **False** avoids # skip tokenizer initialisation. Setting this to **False** avoids
......
...@@ -85,7 +85,6 @@ class ProcessorHandler(ProcessMixIn): ...@@ -85,7 +85,6 @@ class ProcessorHandler(ProcessMixIn):
( (
request, request,
conversation, conversation,
prompt,
engine_prompt, engine_prompt,
sampling_params, sampling_params,
) = await self._parse_raw_request(raw_request) ) = await self._parse_raw_request(raw_request)
......
...@@ -27,7 +27,6 @@ from vllm.entrypoints.openai.protocol import ( ...@@ -27,7 +27,6 @@ from vllm.entrypoints.openai.protocol import (
) )
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_engine import RequestPrompt
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.inputs.data import TokensPrompt from vllm.inputs.data import TokensPrompt
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
...@@ -100,7 +99,6 @@ class ProcessMixIn(ProcessMixInRequired): ...@@ -100,7 +99,6 @@ class ProcessMixIn(ProcessMixInRequired):
return ( return (
request, request,
preprocess_result.conversation, preprocess_result.conversation,
preprocess_result.request_prompt,
preprocess_result.engine_prompt, preprocess_result.engine_prompt,
sampling_params, sampling_params,
) )
...@@ -121,11 +119,9 @@ class PreprocessResult: ...@@ -121,11 +119,9 @@ class PreprocessResult:
def __init__( def __init__(
self, self,
conversation: Optional[ConversationMessage], conversation: Optional[ConversationMessage],
request_prompt: RequestPrompt,
engine_prompt: TokensPrompt, engine_prompt: TokensPrompt,
): ):
self.conversation = conversation self.conversation = conversation
self.request_prompt = request_prompt
self.engine_prompt = engine_prompt self.engine_prompt = engine_prompt
...@@ -168,7 +164,6 @@ class ChatProcessor: ...@@ -168,7 +164,6 @@ class ChatProcessor:
( (
conversation, conversation,
request_prompts,
engine_prompts, engine_prompts,
) = await self.openai_serving._preprocess_chat( ) = await self.openai_serving._preprocess_chat(
request, request,
...@@ -185,7 +180,12 @@ class ChatProcessor: ...@@ -185,7 +180,12 @@ class ChatProcessor:
add_special_tokens=request.add_special_tokens, add_special_tokens=request.add_special_tokens,
) )
return PreprocessResult(conversation[0], request_prompts[0], engine_prompts[0]) # In newer vLLM, _preprocess_chat returns (conversation, engine_prompts) - 2 values
if not conversation or not engine_prompts:
raise ValueError(
"Preprocessing returned empty conversation or engine_prompts"
)
return PreprocessResult(conversation[0], engine_prompts[0])
async def stream_response( async def stream_response(
self, self,
...@@ -305,17 +305,20 @@ class CompletionsProcessor: ...@@ -305,17 +305,20 @@ class CompletionsProcessor:
async def preprocess(self, raw_request: CompletionRequest) -> PreprocessResult: async def preprocess(self, raw_request: CompletionRequest) -> PreprocessResult:
request = self.parse_raw_request(raw_request) request = self.parse_raw_request(raw_request)
( # In newer vLLM, _preprocess_completion was removed
request_prompts, # Use the renderer approach instead
engine_prompts, renderer = self.openai_serving._get_renderer(self.tokenizer)
) = await self.openai_serving._preprocess_completion( config = self.openai_serving._build_render_config(request)
request, engine_prompts = await renderer.render_prompt_and_embeds(
self.tokenizer, prompt_or_prompts=request.prompt,
input_or_inputs=request.prompt, prompt_embeds=getattr(request, "prompt_embeds", None),
add_special_tokens=request.add_special_tokens, config=config,
) )
return PreprocessResult(None, request_prompts[0], engine_prompts[0]) # engine_prompts is now a list of TokensPrompt
if not engine_prompts:
raise ValueError("Renderer returned empty engine_prompts")
return PreprocessResult(None, engine_prompts[0])
async def stream_response( async def stream_response(
self, self,
......
...@@ -74,13 +74,13 @@ ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04" ...@@ -74,13 +74,13 @@ ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04"
ARG CUDA_VERSION="12.9" ARG CUDA_VERSION="12.9"
# Make sure to update the dependency version in pyproject.toml when updating this # Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_REF="v0.12.0" ARG VLLM_REF="v0.13.0"
# FlashInfer Ref used to install flashinfer-cubin and flashinfer-jit-cache # FlashInfer Ref used to install flashinfer-cubin and flashinfer-jit-cache
ARG FLASHINF_REF="v0.5.3" ARG FLASHINF_REF="v0.5.3"
# If left blank, then we will fallback to vLLM defaults # If left blank, then we will fallback to vLLM defaults
ARG DEEPGEMM_REF="" ARG DEEPGEMM_REF=""
ARG LMCACHE_REF="0.3.10" ARG LMCACHE_REF="0.3.12"
################################## ##################################
########## Base Image ############ ########## Base Image ############
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
set -euo pipefail set -euo pipefail
VLLM_VER="0.12.0" VLLM_VER="0.13.0"
VLLM_REF="v${VLLM_VER}" VLLM_REF="v${VLLM_VER}"
# Basic Configurations # Basic Configurations
...@@ -24,8 +24,7 @@ TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- TODO: check if we need to ad ...@@ -24,8 +24,7 @@ TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- TODO: check if we need to ad
DEEPGEMM_REF="" DEEPGEMM_REF=""
CUDA_VERSION="12.9" CUDA_VERSION="12.9"
FLASHINF_REF="v0.5.3" FLASHINF_REF="v0.5.3"
# LMCache version - 0.3.9+ required for vLLM 0.11.2 compatibility LMCACHE_REF="0.3.12"
LMCACHE_REF="0.3.10"
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case $1 in case $1 in
......
...@@ -64,7 +64,7 @@ The following table shows the dependency versions included with each Dynamo rele ...@@ -64,7 +64,7 @@ The following table shows the dependency versions included with each Dynamo rele
| :------------- | :------------- | :---------------------- | :--------- | :--------------- | :--------- | | :------------- | :------------- | :---------------------- | :--------- | :--------------- | :--------- |
| SGLang | 0.5.6.post2 | 0.5.6.post2 | 0.5.3.post4| 0.5.3.post4 | 0.5.3.post4| | SGLang | 0.5.6.post2 | 0.5.6.post2 | 0.5.3.post4| 0.5.3.post4 | 0.5.3.post4|
| TensorRT-LLM | 1.2.0rc6 | 1.2.0rc6 | 1.2.0rc3 | 1.2.0rc3 | 1.2.0rc2 | | TensorRT-LLM | 1.2.0rc6 | 1.2.0rc6 | 1.2.0rc3 | 1.2.0rc3 | 1.2.0rc2 |
| vLLM | 0.12.0 | 0.12.0 | 0.11.0 | 0.11.0 | 0.11.0 | | vLLM | 0.13.0 | 0.12.0 | 0.11.0 | 0.11.0 | 0.11.0 |
| NIXL | 0.8.0 | 0.8.0 | 0.8.0 | 0.8.0 | 0.8.0 | | NIXL | 0.8.0 | 0.8.0 | 0.8.0 | 0.8.0 | 0.8.0 |
> [!Note] > [!Note]
...@@ -77,7 +77,7 @@ The following table shows the dependency versions included with each Dynamo rele ...@@ -77,7 +77,7 @@ The following table shows the dependency versions included with each Dynamo rele
### CUDA Support by Framework ### CUDA Support by Framework
| **Dynamo Version** | **SGLang** | **TensorRT-LLM** | **vLLM** | | **Dynamo Version** | **SGLang** | **TensorRT-LLM** | **vLLM** |
| :------------------- | :-----------------------| :-----------------------| :-----------------------| | :------------------- | :-----------------------| :-----------------------| :-----------------------|
| **Dynamo 0.7.1** | CUDA 12.8 | CUDA 13.0 | CUDA 12.8 | | **Dynamo 0.7.1** | CUDA 12.8 | CUDA 13.0 | CUDA 12.9 |
## Cloud Service Provider Compatibility ## Cloud Service Provider Compatibility
......
...@@ -106,9 +106,9 @@ class ServiceAPI: ...@@ -106,9 +106,9 @@ class ServiceAPI:
) )
# Use vLLM's preprocessing to convert chat to prompt # Use vLLM's preprocessing to convert chat to prompt
# In newer vLLM, _preprocess_chat returns (conversation, engine_prompts) - 2 values
( (
conversation, conversation,
request_prompts,
engine_prompts, engine_prompts,
) = await self.openai_serving_chat._preprocess_chat( ) = await self.openai_serving_chat._preprocess_chat(
request, request,
......
...@@ -134,7 +134,6 @@ class Processor(ProcessMixIn): ...@@ -134,7 +134,6 @@ class Processor(ProcessMixIn):
( (
request, request,
conversation, conversation,
prompt,
engine_prompt, engine_prompt,
sampling_params, sampling_params,
) = await self._parse_raw_request(raw_request) ) = await self._parse_raw_request(raw_request)
......
...@@ -27,7 +27,6 @@ from vllm.entrypoints.openai.protocol import ( ...@@ -27,7 +27,6 @@ from vllm.entrypoints.openai.protocol import (
) )
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_engine import RequestPrompt
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.inputs.data import TokensPrompt from vllm.inputs.data import TokensPrompt
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
...@@ -100,7 +99,6 @@ class ProcessMixIn(ProcessMixInRequired): ...@@ -100,7 +99,6 @@ class ProcessMixIn(ProcessMixInRequired):
return ( return (
request, request,
preprocess_result.conversation, preprocess_result.conversation,
preprocess_result.request_prompt,
preprocess_result.engine_prompt, preprocess_result.engine_prompt,
sampling_params, sampling_params,
) )
...@@ -121,11 +119,9 @@ class PreprocessResult: ...@@ -121,11 +119,9 @@ class PreprocessResult:
def __init__( def __init__(
self, self,
conversation: Optional[ConversationMessage], conversation: Optional[ConversationMessage],
request_prompt: RequestPrompt,
engine_prompt: TokensPrompt, engine_prompt: TokensPrompt,
): ):
self.conversation = conversation self.conversation = conversation
self.request_prompt = request_prompt
self.engine_prompt = engine_prompt self.engine_prompt = engine_prompt
...@@ -168,7 +164,6 @@ class ChatProcessor: ...@@ -168,7 +164,6 @@ class ChatProcessor:
( (
conversation, conversation,
request_prompts,
engine_prompts, engine_prompts,
) = await self.openai_serving._preprocess_chat( ) = await self.openai_serving._preprocess_chat(
request, request,
...@@ -185,7 +180,12 @@ class ChatProcessor: ...@@ -185,7 +180,12 @@ class ChatProcessor:
add_special_tokens=request.add_special_tokens, add_special_tokens=request.add_special_tokens,
) )
return PreprocessResult(conversation[0], request_prompts[0], engine_prompts[0]) # In newer vLLM, _preprocess_chat returns (conversation, engine_prompts) - 2 values
if not conversation or not engine_prompts:
raise ValueError(
"Preprocessing returned empty conversation or engine_prompts"
)
return PreprocessResult(conversation[0], engine_prompts[0])
async def stream_response( async def stream_response(
self, self,
...@@ -305,17 +305,20 @@ class CompletionsProcessor: ...@@ -305,17 +305,20 @@ class CompletionsProcessor:
async def preprocess(self, raw_request: CompletionRequest) -> PreprocessResult: async def preprocess(self, raw_request: CompletionRequest) -> PreprocessResult:
request = self.parse_raw_request(raw_request) request = self.parse_raw_request(raw_request)
( # In newer vLLM, _preprocess_completion was removed
request_prompts, # Use the renderer approach instead
engine_prompts, renderer = self.openai_serving._get_renderer(self.tokenizer)
) = await self.openai_serving._preprocess_completion( config = self.openai_serving._build_render_config(request)
request, engine_prompts = await renderer.render_prompt_and_embeds(
self.tokenizer, prompt_or_prompts=request.prompt,
input_or_inputs=request.prompt, prompt_embeds=getattr(request, "prompt_embeds", None),
add_special_tokens=request.add_special_tokens, config=config,
) )
return PreprocessResult(None, request_prompts[0], engine_prompts[0]) # engine_prompts is now a list of TokensPrompt
if not engine_prompts:
raise ValueError("Renderer returned empty engine_prompts")
return PreprocessResult(None, engine_prompts[0])
async def stream_response( async def stream_response(
self, self,
......
...@@ -56,7 +56,7 @@ trtllm =[ ...@@ -56,7 +56,7 @@ trtllm =[
vllm = [ vllm = [
"uvloop", "uvloop",
"nixl[cu12]<=0.8.0", "nixl[cu12]<=0.8.0",
"vllm[flashinfer,runai]==0.12.0", "vllm[flashinfer,runai]==0.13.0",
] ]
sglang = [ sglang = [
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment