"components/vscode:/vscode.git/clone" did not exist on "d0a63635849ab1c29f4b3cbe419a19730a575da1"
Unverified Commit 3ad7b7c8 authored by Keiven C's avatar Keiven C Committed by GitHub
Browse files

fix: broken vLLM nightly gpu_2 test and vLLM API compatibility errors (#7865)


Signed-off-by: default avatarKeiven Chang <keivenchang@users.noreply.github.com>
parent 0a4b5d42
......@@ -63,6 +63,9 @@ python3 -m dynamo.frontend &
#AssertionError: Prefill round robin balance is required when dp size > 1. Please make sure that the prefill instance is launched with `--load-balance-method round_robin` and `--prefill-round-robin-balance` is set for decode server.
# run prefill worker
# NOTE: Each worker picks a random NCCL port (get_free_port) for torch.distributed.
# This has a TOCTOU race — the port can be grabbed before init_process_group binds it,
# causing sporadic EADDRINUSE. Pass --nccl-port <unique_port> per worker to avoid this.
# Use DYN_SYSTEM_PORT1/2 instead of *_PREFILL/*_DECODE env names so test
# harnesses can set one simple pair for disaggregated deployments.
OTEL_SERVICE_NAME=dynamo-worker-prefill DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
......
......@@ -59,6 +59,10 @@ python3 -m dynamo.frontend \
--router-mode kv \
--router-reset-states &
# NOTE: Each worker picks a random NCCL port (get_free_port) for torch.distributed.
# This has a TOCTOU race — the port can be grabbed before init_process_group binds it,
# causing sporadic EADDRINUSE. Pass --nccl-port <unique_port> per worker to avoid this.
# run prefill worker
OTEL_SERVICE_NAME=dynamo-worker-prefill-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
python3 -m dynamo.sglang \
......
......@@ -39,6 +39,9 @@ print_launch_banner "Launching Disaggregated (same GPU)" "$MODEL" "$HTTP_PORT" \
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
# NOTE: Each worker picks a random NCCL port (get_free_port) for torch.distributed.
# This has a TOCTOU race — the port can be grabbed before init_process_group binds it,
# causing sporadic EADDRINUSE. Pass --nccl-port <unique_port> per worker to avoid this.
# run prefill worker with metrics on port 8081
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
python3 -m dynamo.sglang \
......
......@@ -140,6 +140,9 @@ if [[ "$SINGLE_GPU" == "true" ]]; then
fi
# run SGLang multimodal prefill worker
# NOTE: Each worker picks a random NCCL port (get_free_port) for torch.distributed.
# This has a TOCTOU race — the port can be grabbed before init_process_group binds it,
# causing sporadic EADDRINUSE. Pass --nccl-port <unique_port> per worker to avoid this.
# TODO: Remove disable-radix-cache once the issue is fixed.
# See https://github.com/sgl-project/sglang/pull/11203.
echo "Starting prefill worker on GPU $DYN_PREFILL_WORKER_GPU (GPU mem: $DYN_PREFILL_GPU_MEM)..."
......
......@@ -133,6 +133,9 @@ if [[ "$SINGLE_GPU" == "true" ]]; then
fi
# run SGLang multimodal inference worker
# NOTE: Each worker picks a random NCCL port (get_free_port) for torch.distributed.
# This has a TOCTOU race — the port can be grabbed before init_process_group binds it,
# causing sporadic EADDRINUSE. Pass --nccl-port <unique_port> per worker to avoid this.
# TODO: Remove disable-radix-cache once the issue is fixed.
# See https://github.com/sgl-project/sglang/pull/11203.
echo "Starting PD worker on GPU $DYN_WORKER_GPU (GPU mem: $DYN_WORKER_GPU_MEM)..."
......
......@@ -5,6 +5,7 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../common/launch_utils.sh"
source "$SCRIPT_DIR/../../common/gpu_utils.sh"
# Default values
......@@ -87,7 +88,8 @@ else
fi
# run ingress
python -m dynamo.frontend --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# run processor
python3 components/processor.py --model $MODEL_NAME --prompt-template "$PROMPT_TEMPLATE" &
......@@ -95,8 +97,13 @@ python3 components/processor.py --model $MODEL_NAME --prompt-template "$PROMPT_T
# run E/P/D workers
GPU_MEM_ARGS=$(build_gpu_mem_args vllm)
CUDA_VISIBLE_DEVICES=0 python3 components/audio_encode_worker.py --model $MODEL_NAME &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=1 python3 components/worker.py --model $MODEL_NAME --worker-type prefill $GPU_MEM_ARGS &
CUDA_VISIBLE_DEVICES=0 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
python3 components/audio_encode_worker.py --model $MODEL_NAME &
CUDA_VISIBLE_DEVICES=1 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
python3 components/worker.py --model $MODEL_NAME --worker-type prefill $GPU_MEM_ARGS &
# Wait for all background processes to complete
wait
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -5,6 +5,7 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../common/launch_utils.sh"
source "$SCRIPT_DIR/../../common/gpu_utils.sh"
# Default values
......@@ -87,17 +88,29 @@ else
fi
# run ingress
python -m dynamo.frontend --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# run processor
python3 components/processor.py --model $MODEL_NAME --prompt-template "$PROMPT_TEMPLATE" &
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT4:-8084} \
python3 components/processor.py --model $MODEL_NAME --prompt-template "$PROMPT_TEMPLATE" &
# run E/P/D workers
GPU_MEM_ARGS=$(build_gpu_mem_args vllm)
CUDA_VISIBLE_DEVICES=0 python3 components/audio_encode_worker.py --model $MODEL_NAME &
DYN_VLLM_KV_EVENT_PORT=20081 VLLM_NIXL_SIDE_CHANNEL_PORT=20098 CUDA_VISIBLE_DEVICES=1 python3 components/worker.py --model $MODEL_NAME --worker-type prefill --enable-disagg $GPU_MEM_ARGS &
DYN_VLLM_KV_EVENT_PORT=20082 VLLM_NIXL_SIDE_CHANNEL_PORT=20099 CUDA_VISIBLE_DEVICES=2 python3 components/worker.py --model $MODEL_NAME --worker-type decode --enable-disagg $GPU_MEM_ARGS &
CUDA_VISIBLE_DEVICES=0 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT3:-8083} \
python3 components/audio_encode_worker.py --model $MODEL_NAME &
CUDA_VISIBLE_DEVICES=1 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
DYN_VLLM_KV_EVENT_PORT=20081 \
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
python3 components/worker.py --model $MODEL_NAME --worker-type prefill --enable-disagg $GPU_MEM_ARGS &
CUDA_VISIBLE_DEVICES=2 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
DYN_VLLM_KV_EVENT_PORT=20082 \
VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
python3 components/worker.py --model $MODEL_NAME --worker-type decode --enable-disagg $GPU_MEM_ARGS &
# Wait for all background processes to complete
wait
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -17,7 +17,7 @@ import json
import time
from typing import AsyncIterator, List, Optional, Protocol, Union, runtime_checkable
from vllm.config import ModelConfig
from vllm.config import ModelConfig, VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.chat_utils import ConversationMessage
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
......@@ -27,6 +27,7 @@ from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
from vllm.entrypoints.openai.engine.protocol import RequestResponseMetadata
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.inputs.data import TokensPrompt
from vllm.renderers.registry import renderer_from_config
from vllm.sampling_params import SamplingParams
......@@ -41,7 +42,7 @@ class StubEngineClient:
def __init__(self, model_config: ModelConfig):
self.model_config = model_config
self.renderer = renderer_from_config(model_config)
self.renderer = renderer_from_config(VllmConfig(model_config=model_config))
self.input_processor = None
self.io_processor = None
......@@ -94,7 +95,6 @@ class ProcessMixIn(ProcessMixInRequired):
sampling_params = request.to_sampling_params(
default_max_tokens,
self.model_config.logits_processor_pattern,
self.default_sampling_params,
)
return (
......@@ -138,10 +138,20 @@ class ChatProcessor:
BaseModelPath(name=model_config.model, model_path=model_config.model)
],
)
serving_render = OpenAIServingRender(
model_config=model_config,
renderer=stub_engine.renderer,
io_processor=None,
model_registry=serving_models.registry,
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
)
self.openai_serving = OpenAIServingChat(
engine_client=stub_engine,
models=serving_models,
response_role="assistant",
openai_serving_render=serving_render,
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
......@@ -285,9 +295,19 @@ class CompletionsProcessor:
BaseModelPath(name=model_config.model, model_path=model_config.model)
],
)
serving_render = OpenAIServingRender(
model_config=model_config,
renderer=stub_engine.renderer,
io_processor=None,
model_registry=serving_models.registry,
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
)
self.openai_serving = OpenAIServingCompletion(
engine_client=stub_engine,
models=serving_models,
openai_serving_render=serving_render,
request_logger=None,
)
......
......@@ -27,6 +27,7 @@ class SupportedModels:
LLAVA_1_5_7B = "llava-hf/llava-1.5-7b-hf"
QWEN_2_5_VL_7B = "Qwen/Qwen2.5-VL-7B-Instruct"
LLAVA_NEXT_VIDEO_7B = "llava-hf/LLaVA-NeXT-Video-7B-hf"
QWEN_2_AUDIO_7B = "Qwen/Qwen2-Audio-7B-Instruct"
......@@ -44,26 +45,31 @@ def construct_mm_data(
model: str,
embeddings_dtype: torch.dtype,
image_embeds: Optional[torch.Tensor] = None,
video_numpy: Optional[Any] = None,
image_grid_thw: Optional[List[Any]] = None,
audio_embeds: Optional[torch.Tensor] = None,
) -> Dict[str, torch.Tensor | Dict[str, Any]]:
"""Construct multimodal data for a vLLM request for models that require additional parameters alongside the embeddings"""
if model == SupportedModels.QWEN_2_AUDIO_7B:
model_lower = model.lower()
if "audio" in model_lower:
audio_embeds = audio_embeds.to(torch.bfloat16)
assert audio_embeds.ndim == 2, "Audio embeddings must be 2D"
return {"audio": [audio_embeds]}
# Handle image models - validate image embeddings first
elif "video" in model_lower:
if video_numpy is None:
raise ValueError("No video frames provided.")
return {"video": video_numpy}
elif "qwen" in model_lower and "vl" in model_lower:
if image_embeds is None:
raise ValueError("No image embeddings provided.")
image_embeds = image_embeds.to(embeddings_dtype)
# Model-specific image handling
if model == SupportedModels.QWEN_2_5_VL_7B:
return _construct_qwen_image_data(image_embeds, image_grid_thw)
else:
# Default image handling for other models (e.g., LLAVA_1_5_7B)
if image_embeds is None:
raise ValueError("No image embeddings provided.")
image_embeds = image_embeds.to(embeddings_dtype)
return {"image": image_embeds}
......
......@@ -592,13 +592,13 @@ vllm_configs = {
directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
script_name="audio_agg.sh",
marks=[
pytest.mark.gpu_2,
pytest.mark.gpu_2, # encode worker loads Qwen2Audio on GPU (~19 GiB)
pytest.mark.nightly,
], # TODO: profile to get max_vram and timeout
pytest.mark.timeout(600),
],
model="Qwen/Qwen2-Audio-7B-Instruct",
delayed_start=60, # Audio models require longer loading time
delayed_start=0,
script_args=["--model", "Qwen/Qwen2-Audio-7B-Instruct"],
timeout=600, # 10 minutes for audio processing overhead
request_payloads=[
chat_payload(
[
......@@ -622,13 +622,13 @@ vllm_configs = {
directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
script_name="audio_disagg.sh",
marks=[
pytest.mark.gpu_2,
pytest.mark.gpu_4, # needs 3 GPUs (encode loads Qwen2Audio ~19 GiB + prefill + decode)
pytest.mark.nightly,
], # TODO: profile to get max_vram and timeout
pytest.mark.timeout(600),
],
model="Qwen/Qwen2-Audio-7B-Instruct",
delayed_start=60, # Audio models require longer loading time
delayed_start=0,
script_args=["--model", "Qwen/Qwen2-Audio-7B-Instruct"],
timeout=600, # 10 minutes for audio processing overhead
request_payloads=[
chat_payload(
[
......@@ -652,10 +652,10 @@ vllm_configs = {
directory=vllm_dir,
script_name="agg_multimodal.sh",
marks=[
pytest.mark.gpu_2,
pytest.mark.gpu_1, # agg_multimodal.sh uses single GPU
pytest.mark.multimodal,
pytest.mark.nightly,
], # TODO: profile to get max_vram and timeout
],
model="Qwen/Qwen3-VL-30B-A3B-Instruct-FP8",
script_args=[
"--model",
......@@ -713,7 +713,13 @@ vllm_configs = {
"max_tokens": 1024,
},
repeat_count=1,
expected_response=["purple"], # Validate image understanding
expected_response=[
"green",
"purple",
"llm",
"optimize",
"deploy",
], # OR: pass if any keyword found in tool args
expected_log=[],
expected_tool_name="describe_image", # Validate tool call happened
)
......
......@@ -394,7 +394,13 @@ vllm_configs = {
"max_tokens": 1024,
},
repeat_count=1,
expected_response=["green"], # Validate image understanding
expected_response=[
"green",
"purple",
"llm",
"optimize",
"deploy",
], # OR: pass if any keyword found in tool args
expected_log=[],
expected_tool_name="describe_image", # Validate tool call happened
)
......
......@@ -241,11 +241,14 @@ class ToolCallingChatPayload(ChatPayload):
self.expected_tool_name = expected_tool_name
def validate(self, response, content: str) -> None:
"""Validate that tool calls exist in the response."""
# First run the standard validation
super().validate(response, content)
"""Validate that tool calls exist in the response.
# Then validate tool calls specifically
Skips the parent's expected_response substring check because tool call
responses produce structured JSON arguments, not natural-language text.
The expected_response keywords are instead matched against the
concatenated tool call arguments so callers can still assert that the
model "understood" the input (e.g. expected_response=["purple"]).
"""
response_data = response.json()
choices = response_data.get("choices", [])
assert choices, "Response missing choices"
......@@ -257,13 +260,16 @@ class ToolCallingChatPayload(ChatPayload):
logger.info(f"Tool calls detected: {len(tool_calls)} call(s)")
# Validate tool call structure
all_args = []
for i, tc in enumerate(tool_calls):
assert "function" in tc, f"Tool call {i} missing 'function' field"
function = tc.get("function", {})
assert "name" in function, f"Tool call {i} missing function name"
assert "arguments" in function, f"Tool call {i} missing function arguments"
args_str = function.get("arguments", "")
all_args.append(args_str)
logger.info(
f" [{i}] Function: {function.get('name')}, Args: {function.get('arguments')[:100]}..."
f" [{i}] Function: {function.get('name')}, Args: {args_str[:100]}..."
)
# If expected tool name is provided, validate it
......@@ -274,6 +280,24 @@ class ToolCallingChatPayload(ChatPayload):
), f"Expected tool '{self.expected_tool_name}' not found. Available tools: {tool_names}"
logger.info(f"Expected tool '{self.expected_tool_name}' was called")
# Check expected_response keywords against tool call arguments (OR logic)
if self.expected_response:
combined_args = " ".join(all_args).lower()
found = [kw for kw in self.expected_response if kw.lower() in combined_args]
if not found:
logger.error(
f"VALIDATION FAILED - Expected to find at least one of "
f"{self.expected_response} in tool call arguments"
)
logger.error(f"Tool call arguments: {combined_args}")
raise AssertionError(
f"Expected content not found in tool call arguments. "
f"Expected at least one of: {self.expected_response}. "
f"Tool call arguments: {combined_args}"
)
else:
logger.info(f"Found expected keywords in tool args: {found}")
@dataclass
class CachedTokensChatPayload(ChatPayload):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment