Unverified Commit d5803cbe authored by Alec's avatar Alec Committed by GitHub
Browse files

chore: bump vLLM to 0.19.0 (#7894)

parent ad3a46a6
......@@ -14,7 +14,7 @@ from collections.abc import AsyncGenerator
from typing import Any
from vllm.config import CacheConfig, LoadConfig, ModelConfig, VllmConfig
from vllm.inputs.data import TokensPrompt
from vllm.inputs import TokensPrompt
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.tasks import GENERATION_TASKS
......
......@@ -20,7 +20,7 @@ from typing import AsyncIterator, List, Optional, Protocol, Union, runtime_check
from vllm.config import ModelConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.chat_utils import ConversationMessage
from vllm.inputs.data import TokensPrompt
from vllm.inputs import TokensPrompt
from vllm.sampling_params import SamplingParams
from vllm.tokenizers import TokenizerLike as AnyTokenizer
......
......@@ -22,9 +22,8 @@ import torch
from pydantic import BaseModel, ConfigDict, Field, field_serializer, field_validator
from pydantic_core import core_schema
from typing_extensions import NotRequired
from vllm.inputs.data import TokensPrompt
from vllm.inputs import MultiModalUUIDDict, TokensPrompt # noqa: F401
from vllm.logprobs import PromptLogprobs
from vllm.multimodal.inputs import MultiModalUUIDDict # noqa: F401
from vllm.outputs import CompletionOutput
from vllm.sampling_params import SamplingParams
from vllm.v1.metrics.stats import RequestStateStats
......@@ -55,7 +54,7 @@ class PrefillResponse(BaseModel):
# Hack to override the type of multi_modal_data in TokensPrompt
# as pydantic doesn't understand generic types
# TokensPrompt is defined here: https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/inputs/data.py#L38
# TokensPrompt is exported from vllm.inputs and implemented in vllm/inputs/llm.py.
# multi_modal_data is defined here: https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/inputs.py#L103
# ModalityData is defined here: https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/inputs.py#L80
class PatchedTokensPrompt(TokensPrompt):
......
......@@ -194,3 +194,27 @@ class TestVllmKvEventsApi:
assert decoded[6] == "GPU", f"medium at wrong position: {decoded[6]}"
assert decoded[7] is None, f"lora_name at wrong position: {decoded[7]}"
assert decoded[8] is None, f"extra_keys at wrong position: {decoded[8]}"
def test_block_stored_tuple_extra_keys_serialization_format(self):
    """Check the msgpack wire shape of tuple-valued multimodal extra_keys.

    Builds a ``BlockStored`` event whose only block carries a single
    ``(mm_hash, 7)`` pair, round-trips it through msgpack, and expects the
    event tag in slot 0 and the pair, decoded as nested lists, in slot 8
    (the vLLM 0.19 layout this test is pinned to).
    """
    import msgspec

    mm_hash = "0123456789abcdef00112233445566778899aabbccddeefffedcba9876543210"
    stored_event = BlockStored(
        block_hashes=[123],
        parent_block_hash=None,
        token_ids=[1, 2, 3, 4],
        block_size=16,
        lora_id=None,
        medium="GPU",
        lora_name=None,
        # One block, holding one (hash, offset) tuple.
        extra_keys=[((mm_hash, 7),)],
    )

    # Encode and decode as separate steps so a failure points at one of them.
    wire_bytes = msgspec.msgpack.encode(stored_event)
    decoded = msgspec.msgpack.decode(wire_bytes)

    assert decoded[0] == "BlockStored"

    # Tuples come back from msgpack as lists, hence the triple nesting.
    expected_extra_keys = [[[mm_hash, 7]]]
    assert decoded[8] == expected_extra_keys, (
        "vLLM multimodal extra_keys no longer serialize as nested tuple/list "
        f"payloads. Decoded: {decoded[8]!r}"
    )
......@@ -23,7 +23,7 @@ _chat_protocol = importlib.import_module(
"vllm.entrypoints.openai.chat_completion.protocol"
)
_engine_protocol = importlib.import_module("vllm.entrypoints.openai.engine.protocol")
_inputs_data = importlib.import_module("vllm.inputs.data")
_inputs_data = importlib.import_module("vllm.inputs")
_reasoning = importlib.import_module("vllm.reasoning")
_sampling_params = importlib.import_module("vllm.sampling_params")
_tool_parsers = importlib.import_module("vllm.tool_parsers")
......
......@@ -44,13 +44,13 @@ vllm:
runtime_image: nvcr.io/nvidia/cuda
base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04
runtime_image_tag: 12.9.1-runtime-ubuntu24.04
vllm_ref: v0.18.0
vllm_ref: v0.19.0
cuda13.0:
base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: nvcr.io/nvidia/cuda
base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
runtime_image_tag: 13.0.2-runtime-ubuntu24.04
vllm_ref: v0.18.0
vllm_ref: v0.19.0
xpu:
base_image: intel/deep-learning-essentials
runtime_image: intel/deep-learning-essentials
......@@ -65,7 +65,7 @@ vllm:
vllm_ref: v0.16.0
flashinf_ref: v0.6.6
lmcache_ref: 0.4.2
vllm_omni_ref: "v0.18.0"
vllm_omni_ref: "release/v0.19.0rc1"
nixl_ref: 0.10.1
max_jobs: "10"
enable_media_ffmpeg: "false"
......
......@@ -12,7 +12,7 @@
set -euo pipefail
VLLM_VER="0.18.0"
VLLM_VER="0.19.0"
VLLM_REF="v${VLLM_VER}"
DEVICE="cuda"
......@@ -141,6 +141,25 @@ cd vllm
git checkout $VLLM_REF
echo "✓ vLLM repository cloned"
echo "\n=== Installing vLLM-Omni ==="
# Install omni BEFORE vLLM. Its transitive dependencies can otherwise upgrade the
# torch/transformers stack after vLLM is installed, which can leave vllm._C ABI-mismatched.
# vLLM should remain the final owner of the runtime stack in this environment.
if [ -n "$VLLM_OMNI_REF" ] && [ "$ARCH" = "amd64" ]; then
    # Try PyPI first, fall back to building from source.
    # ${VLLM_OMNI_REF#v} drops one leading "v" so a tag like v1.2.3 becomes the
    # PyPI version 1.2.3. A ref that is not a plain version tag (for example a
    # branch name) is expected to fail this step and take the source path below.
    # All expansions are quoted so refs/paths are never word-split or globbed.
    if uv pip install "vllm-omni==${VLLM_OMNI_REF#v}" 2>&1; then
        echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed from PyPI"
    else
        echo "⚠ PyPI install failed, building from source..."
        git clone --depth 1 --branch "${VLLM_OMNI_REF}" https://github.com/vllm-project/vllm-omni.git "$INSTALLATION_DIR/vllm-omni"
        uv pip install "$INSTALLATION_DIR/vllm-omni"
        # The checkout is only needed for the install itself.
        rm -rf "$INSTALLATION_DIR/vllm-omni"
        echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed from source"
    fi
else
    echo "⚠ Skipping vLLM-Omni (no ref provided or ARM64 not supported)"
fi
if [ "$DEVICE" = "xpu" ]; then
echo "\n=== Installing vLLM ==="
uv pip install -r requirements/xpu.txt --index-strategy unsafe-best-match
......@@ -240,29 +259,6 @@ else
echo "⚠ Skipping LMCache (ARM64 or CUDA 13 not supported)"
fi
echo "\n=== Installing vLLM-Omni ==="
if [ -n "$VLLM_OMNI_REF" ] && [ "$ARCH" = "amd64" ]; then
# Save original vllm entrypoint before vllm-omni overwrites it
VLLM_BIN=$(which vllm)
cp "$VLLM_BIN" /tmp/vllm-entrypoint-backup
# Try PyPI first, fall back to building from source
if uv pip install vllm-omni==${VLLM_OMNI_REF#v} 2>&1; then
echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed from PyPI"
else
echo "⚠ PyPI install failed, building from source..."
git clone --depth 1 --branch ${VLLM_OMNI_REF} https://github.com/vllm-project/vllm-omni.git $INSTALLATION_DIR/vllm-omni
uv pip install $INSTALLATION_DIR/vllm-omni
rm -rf $INSTALLATION_DIR/vllm-omni
echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed from source"
fi
# Restore original vllm CLI entrypoint (vllm-omni replaces it with its own)
cp /tmp/vllm-entrypoint-backup "$VLLM_BIN"
echo "✓ Original vllm entrypoint preserved"
else
echo "⚠ Skipping vLLM-Omni (no ref provided or ARM64 not supported)"
fi
if [ "$DEVICE" = "cuda" ]; then
echo "\n=== Installing DeepGEMM ==="
cd $INSTALLATION_DIR/vllm/tools
......
......@@ -29,7 +29,7 @@ The following table shows the backend framework versions included with each Dyna
| **Dynamo** | **SGLang** | **TensorRT-LLM** | **vLLM** | **NIXL** |
| :--- | :--- | :--- | :--- | :--- |
| **main (ToT)** | `0.5.9` | `1.3.0rc9` | `0.18.0` | `0.10.1` |
| **main (ToT)** | `0.5.9` | `1.3.0rc9` | `0.19.0` | `0.10.1` |
| **v1.1.0-dev.1** *(experimental)* | `0.5.9` | `1.3.0rc5.post1` | `0.17.1` | `0.10.1` |
| **v1.0.1** | `0.5.9` | `1.3.0rc5.post1` | `0.16.0` | `0.10.1` |
| **v1.0.0** | `0.5.9` | `1.3.0rc5.post1` | `0.16.0` | `0.10.1` |
......
......@@ -46,6 +46,9 @@ FRONTEND_PID=$!
sleep 2
echo "Starting Omni Audio worker..."
# Upstream qwen3_tts stage configs still set the stage-1 max_model_len to 65536.
# vLLM 0.19 validates that value against the model config unless
# VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 is set, so we opt in for this worker process.
VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm.omni \
--model "$MODEL" \
......
......@@ -113,12 +113,27 @@ pub fn parse_mm_hash_from_extra_key(s: &str) -> Option<u64> {
None
}
/// One element of a per-block `extra_keys` list in a vLLM `BlockStored` event.
///
/// The enum is `#[serde(untagged)]`, so serde tries the variants in declaration
/// order and keeps the first one that deserializes; the order below is therefore
/// part of the behavior. Only the hash-carrying variants are consumed by
/// `extra_keys_to_block_mm_infos`; every other variant is accepted so the event
/// still deserializes, and is then ignored.
#[derive(Debug, Deserialize, Clone)]
#[serde(untagged)]
pub enum ExtraKeyItem {
/// Bare string key; treated as a candidate MM hash.
Hash(String),
/// `[hash, start_offset]` pair with an offset that fits in `i64`.
/// The offset is parsed but not used downstream.
HashWithSignedOffset((String, i64)),
/// Same pair shape, tried when the offset did not fit the signed variant
/// (presumably values above `i64::MAX` — TODO confirm this occurs in practice).
HashWithUnsignedOffset((String, u64)),
/// Byte-sequence item; ignored when building MM infos.
Bytes(Vec<u8>),
/// Signed integer item; ignored when building MM infos.
Signed(i64),
/// Unsigned integer item that did not match `Signed`; ignored.
Unsigned(u64),
/// Floating-point item; ignored when building MM infos.
Float(f64),
/// Boolean item; ignored when building MM infos.
Bool(bool),
}
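/// Convert vLLM `BlockStored` `extra_keys` into block-level MM infos.
///
/// `extra_keys` is a list aligned with the event's blocks. Each entry is one of:
/// - `None` => no MM content in that block
/// - `["hash1", "hash2", ...]` => one or more MM objects in that block
/// - `[[hash, start_offset], ...]` => one or more MM objects with block-relative
///   start offsets (vLLM 0.19+)
///
/// Only the hash of each item is used: start offsets are discarded, and items
/// that are not a hash (or whose hash string does not parse) are skipped.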
pub fn extra_keys_to_block_mm_infos(
extra_keys: Option<Vec<Option<Vec<String>>>>,
extra_keys: Option<Vec<Option<Vec<ExtraKeyItem>>>>,
) -> Option<Vec<Option<BlockExtraInfo>>> {
let extra_keys = extra_keys?;
if extra_keys.is_empty() {
......@@ -131,10 +146,24 @@ pub fn extra_keys_to_block_mm_infos(
let mm_objects: Vec<BlockMmObjectInfo> = block_keys
.unwrap_or_default()
.iter()
.filter_map(|key| parse_mm_hash_from_extra_key(key))
.filter_map(|key| match key {
ExtraKeyItem::Hash(hash)
| ExtraKeyItem::HashWithSignedOffset((hash, _))
| ExtraKeyItem::HashWithUnsignedOffset((hash, _)) => {
parse_mm_hash_from_extra_key(hash)
}
ExtraKeyItem::Bytes(_)
| ExtraKeyItem::Signed(_)
| ExtraKeyItem::Unsigned(_)
| ExtraKeyItem::Float(_)
| ExtraKeyItem::Bool(_) => None,
})
.map(|mm_hash| BlockMmObjectInfo {
mm_hash,
offsets: vec![], // extra_keys does not carry offsets today
// vLLM extra_keys exposes MM start offsets but not MM lengths.
// Dynamo's block hash only depends on mm_hash today, so keep
// offsets empty rather than inventing a synthetic range.
offsets: vec![],
})
.collect();
......@@ -193,7 +222,7 @@ impl<'de> Visitor<'de> for RawKvEventVisitor {
let mut block_size: Option<usize> = None;
let mut medium: Option<Option<String>> = None;
let mut lora_name: Option<Option<String>> = None;
let mut extra_keys: Option<Option<Vec<Option<Vec<String>>>>> = None;
let mut extra_keys: Option<Option<Vec<Option<Vec<ExtraKeyItem>>>>> = None;
let mut block_mm_infos: Option<Option<Vec<Option<BlockExtraInfo>>>> = None;
while let Some(key) = map.next_key::<String>()? {
......@@ -308,7 +337,7 @@ impl<'de> Visitor<'de> for RawKvEventVisitor {
let _lora_id: Option<u64> = seq.next_element()?.unwrap_or(None);
let medium: Option<String> = seq.next_element()?.unwrap_or(None);
let lora_name: Option<String> = seq.next_element()?.unwrap_or(None);
let extra_keys: Option<Vec<Option<Vec<String>>>> =
let extra_keys: Option<Vec<Option<Vec<ExtraKeyItem>>>> =
seq.next_element()?.unwrap_or(None);
let block_mm_infos: Option<Vec<Option<BlockExtraInfo>>> =
seq.next_element()?.unwrap_or(None);
......
......@@ -333,9 +333,12 @@ mod test_event_processing {
let mm_hash =
"0123456789abcdef00112233445566778899aabbccddeefffedcba9876543210".to_string();
let infos = extra_keys_to_block_mm_infos(Some(vec![
Some(vec![mm_hash.clone()]),
Some(vec![ExtraKeyItem::Hash(mm_hash.clone())]),
None,
Some(vec!["invalid".to_string(), mm_hash]),
Some(vec![
ExtraKeyItem::Hash("invalid".to_string()),
ExtraKeyItem::Hash(mm_hash),
]),
]))
.expect("expected parsed MM infos");
......@@ -383,6 +386,32 @@ mod test_event_processing {
);
}
#[test]
fn test_seq_block_stored_field8_supports_tuple_extra_keys() {
    // Positional (array-encoded) BlockStored event. Slot 8 is `extra_keys`,
    // here one block holding a single (hash, start_offset) pair.
    let hash_hex =
        String::from("0123456789abcdef00112233445566778899aabbccddeefffedcba9876543210");
    let positional_event = (
        "BlockStored",
        vec![10_u64],
        None::<u64>,
        vec![1_u32, 2, 3, 4],
        4_usize,
        None::<u64>,
        None::<String>,
        None::<String>,
        vec![Some(vec![(hash_hex, 7_i64)])],
    );
    let encoded = rmps::to_vec(&positional_event).unwrap();

    let decoded: RawKvEvent = rmps::from_slice(&encoded).unwrap();
    let block_mm_infos = match decoded {
        RawKvEvent::BlockStored { block_mm_infos, .. } => block_mm_infos,
        _ => panic!("expected BlockStored"),
    };

    // The tuple's hash must come through as the MM hash of the first object
    // in the first block; the offset plays no part in the expectation.
    let per_block = block_mm_infos.unwrap();
    let first_block = per_block[0].as_ref().unwrap();
    assert_eq!(first_block.mm_objects[0].mm_hash, 0x0123_4567_89ab_cdef);
}
#[test]
fn test_map_block_stored_supports_extra_keys() {
#[derive(serde::Serialize)]
......@@ -423,6 +452,49 @@ mod test_event_processing {
0x0123_4567_89ab_cdef
);
}
#[test]
fn test_map_block_stored_supports_tuple_extra_keys() {
    // BlockStored event built from a named-field struct whose `extra_keys`
    // holds (hash, start_offset) pairs instead of bare hash strings.
    //
    // NOTE(review): if `rmps` is `rmp_serde`, `to_vec` encodes structs as
    // positional arrays and only `to_vec_named` emits maps. In that case this
    // test goes through the sequence path, not the map path its name suggests.
    // Confirm which encoding was intended.
    type PerBlockTupleKeys = Option<Vec<Option<Vec<(String, i64)>>>>;

    #[derive(serde::Serialize)]
    struct MapBlockStoredEvent {
        #[serde(rename = "type")]
        event_type: &'static str,
        block_hashes: Vec<u64>,
        parent_block_hash: Option<u64>,
        token_ids: Vec<u32>,
        block_size: usize,
        lora_id: Option<u64>,
        medium: Option<String>,
        lora_name: Option<String>,
        extra_keys: PerBlockTupleKeys,
    }

    let hash_hex =
        String::from("0123456789abcdef00112233445566778899aabbccddeefffedcba9876543210");
    let source_event = MapBlockStoredEvent {
        event_type: "BlockStored",
        block_hashes: vec![10],
        parent_block_hash: None,
        token_ids: vec![1, 2, 3, 4],
        block_size: 4,
        lora_id: None,
        medium: Some("GPU".to_string()),
        lora_name: None,
        extra_keys: Some(vec![Some(vec![(hash_hex, 3)])]),
    };
    let encoded = rmps::to_vec(&source_event).unwrap();

    let decoded: RawKvEvent = rmps::from_slice(&encoded).unwrap();
    let block_mm_infos = match decoded {
        RawKvEvent::BlockStored { block_mm_infos, .. } => block_mm_infos,
        _ => panic!("expected BlockStored"),
    };

    // The hash from the pair must surface as the first MM object's hash.
    let per_block = block_mm_infos.unwrap();
    let first_block = per_block[0].as_ref().unwrap();
    assert_eq!(first_block.mm_objects[0].mm_hash, 0x0123_4567_89ab_cdef);
}
}
#[cfg(test)]
......
......@@ -50,11 +50,11 @@ trtllm =[
vllm = [
"uvloop",
"nixl[cu12]<=0.10.1",
"vllm[flashinfer,runai,otel]==0.18.0",
# vllm-omni 0.18.0 is now on PyPI; install only future rc builds from source in container builds
# (see container/deps/vllm/install_vllm.sh). pip install ai-dynamo[vllm] will
# not include vllm-omni — install it separately from source if needed.
"vllm-omni==0.18.0",
"vllm[flashinfer,runai,otel]==0.19.0",
# vllm-omni is installed separately in container builds (see
# container/deps/vllm/install_vllm.sh). Do not add it to ai-dynamo[vllm]:
# pip/uv dependency resolution for omni can override the vLLM torch stack.
# "vllm-omni==...",
"blake3>=1.0.0,<2.0.0",
"soundfile>=0.13.1",
"librosa>=0.10.0",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment