Commit 3fb4b5fa authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.18.0' into v0.18.0-ori

parents bcf25339 89138b21
......@@ -88,7 +88,7 @@ class RayTrainingActor:
# Zero out all the parameters.
for name, p in self.model.named_parameters():
p.data.zero_()
torch.cuda.synchronize()
torch.accelerator.synchronize()
# The argument for `get_device_uuid` is the index of the GPU in the
# list of visible devices.
from vllm.platforms import current_platform
......@@ -151,7 +151,7 @@ class RayTrainingActor:
p.data.view(-1).view(dtype=torch.uint8), non_blocking=True
)
offset += get_size(p)
torch.cuda.synchronize()
torch.accelerator.synchronize()
s.send_pyobj(named_tensors)
s.recv()
s.send_pyobj(None)
......@@ -159,7 +159,7 @@ class RayTrainingActor:
s.close()
del buffer
gc.collect()
torch.cuda.empty_cache()
torch.accelerator.empty_cache()
# Ray manages four GPUs.
......
......@@ -120,7 +120,7 @@ class ColocateWorkerExtension:
process_weights_after_loading(
self.model_runner.model, self.model_config, self.device
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
socket.send(b"")
break
if isinstance(payload, tuple):
......@@ -144,13 +144,13 @@ class ColocateWorkerExtension:
weights.append((item["name"], tensor))
self.model_runner.model.load_weights(weights=weights)
del weights
torch.cuda.synchronize()
torch.accelerator.synchronize()
socket.send(b"")
socket.close()
del buffer
gc.collect()
torch.cuda.empty_cache()
torch.accelerator.empty_cache()
def report_device_id(self) -> str:
from vllm.platforms import current_platform
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
End-to-end example for routed experts capture with hybrid models.
Validates that:
1. routed_experts is returned in CompletionOutput for MoE models.
2. Expert IDs are within valid range.
3. Results are deterministic across runs (baseline vs reference); this check
only runs when --deterministic is passed.
Usage:
python examples/offline_inference/routed_experts_e2e.py \
--model Qwen/Qwen3-30B-A3B \
--tp 4 \
--max-model-len 4096 \
--num-prompts 20 \
--max-new-tokens 50
"""
from __future__ import annotations
import argparse
import asyncio
import logging
import os
import uuid
from dataclasses import dataclass, field
import numpy as np
from vllm.engine.arg_utils import AsyncEngineArgs
# Module-level logger; validate_shapes() reports each prompt's array shape
# through it.
logger = logging.getLogger(__name__)
# Model used when --model is not given on the command line.
DEFAULT_MODEL = "Qwen/Qwen3-30B-A3B"
# Fixed prompt pool. main() slices the first --num-prompts entries from it, so
# a run never uses more prompts than are listed here.
TEST_PROMPTS = [
    "Hello, my name is",
    "The capital of France is",
    "Explain quantum computing in simple terms:",
    "Write a Python function that sorts a list:",
    "The meaning of life is",
    "In a distant galaxy, there was a",
    "The best way to learn programming is",
    "Once upon a time in a land far away,",
    "The theory of relativity states that",
    "How does photosynthesis work?",
    "Describe the process of machine learning:",
    "What are the benefits of exercise?",
    "The history of artificial intelligence began",
    "Translate the following to French: Hello world",
    "Summarize the plot of Romeo and Juliet:",
    "What is the difference between TCP and UDP?",
    "The water cycle consists of",
    "Explain how a neural network learns:",
    "The periodic table organizes elements by",
    "Write a haiku about the ocean:",
]
@dataclass
class InferenceResult:
    """Result from a single inference run.

    The two lists are parallel: entry ``i`` of each belongs to the same
    prompt. They are filled by ``_run_async_inference``.
    """

    # One routed-experts array per prompt.
    experts_list: list[np.ndarray] = field(default_factory=list)
    # Generated token IDs, one list per prompt.
    token_ids_list: list[list[int]] = field(default_factory=list)
    # Expert count read from the model config; 0 until a run sets it.
    num_experts: int = 0
# ---------------------------------------------------------------------------
# Inference helpers
# ---------------------------------------------------------------------------
async def _run_async_inference(
    engine_args: AsyncEngineArgs,
    prompts: list[str],
    max_new_tokens: int,
) -> InferenceResult:
    """Generate completions with AsyncLLM and collect routed experts.

    All prompts are submitted concurrently with ``temperature=0`` and at most
    ``max_new_tokens`` generated tokens each.

    Args:
        engine_args: Arguments used to build the ``AsyncLLM`` engine.
        prompts: Prompts to generate from.
        max_new_tokens: Value passed as ``SamplingParams.max_tokens``.

    Returns:
        An ``InferenceResult`` whose lists are in the same order as
        ``prompts`` and whose ``num_experts`` comes from the model config.

    Raises:
        AssertionError: If the expert count cannot be read from the model
            config, a request yields no output, ``routed_experts`` is missing,
            or its first dimension is not prompt + generated - 1 tokens.
    """
    from vllm.sampling_params import SamplingParams
    from vllm.v1.engine.async_llm import AsyncLLM

    engine = AsyncLLM.from_engine_args(engine_args)
    # Shut the engine down on every exit path. Previously a failed assertion
    # or generation error skipped shutdown() and left the engine running.
    try:
        hf_config = engine.model_config.hf_text_config
        # Try both attribute names; fall through to the second when the first
        # is missing or falsy.
        num_experts: int = getattr(hf_config, "num_experts", 0) or getattr(
            hf_config, "num_local_experts", 0
        )
        assert num_experts > 0, "Could not determine num_experts from model config"

        sampling_params = SamplingParams(
            temperature=0,
            max_tokens=max_new_tokens,
        )

        async def _generate_one(prompt: str, idx: int):
            """Run one request to completion and validate its routed experts."""
            request_id = str(uuid.uuid4())
            final_output = None
            # Only the last streamed output is kept.
            async for output in engine.generate(prompt, sampling_params, request_id):
                final_output = output
            assert final_output is not None

            completion = final_output.outputs[0]
            routed = completion.routed_experts
            num_prompt_tokens = len(final_output.prompt_token_ids)
            num_generated_tokens = len(completion.token_ids)
            expected_len = num_prompt_tokens + num_generated_tokens - 1
            assert routed is not None, f"Prompt {idx}: routed_experts is None"
            assert routed.shape[0] == expected_len, (
                f"Prompt {idx}: routed_experts length {routed.shape[0]} != "
                f"prompt ({num_prompt_tokens}) + generated ({num_generated_tokens})"
                f" - 1 = {expected_len}"
            )
            return idx, routed, list(completion.token_ids)

        tasks = [_generate_one(p, i) for i, p in enumerate(prompts)]
        outputs = await asyncio.gather(*tasks)
        # Sort by original index to maintain prompt order
        outputs.sort(key=lambda x: x[0])

        result = InferenceResult(num_experts=num_experts)
        for _, routed, token_ids in outputs:
            result.experts_list.append(routed)
            result.token_ids_list.append(token_ids)
        return result
    finally:
        engine.shutdown()
def run_inference(
    model: str,
    prompts: list[str],
    max_new_tokens: int = 50,
    tp: int = 1,
    max_model_len: int = 4096,
) -> InferenceResult:
    """Run one AsyncLLM inference pass with routed-experts capture enabled.

    Args:
        model: Model name or path passed to the engine.
        prompts: Prompts to generate from.
        max_new_tokens: Maximum number of generated tokens per prompt.
        tp: Tensor parallel size.
        max_model_len: Maximum model context length.

    Returns:
        The ``InferenceResult`` produced by ``_run_async_inference``.
    """
    engine_kwargs = {
        "model": model,
        "enable_return_routed_experts": True,
        "tensor_parallel_size": tp,
        "max_model_len": max_model_len,
        "disable_log_stats": True,
        "attention_backend": "FLASH_ATTN",
    }
    engine_args = AsyncEngineArgs(**engine_kwargs)

    # The whole pass runs inside its own event loop.
    inference = _run_async_inference(engine_args, prompts, max_new_tokens)
    result = asyncio.run(inference)

    from vllm.platforms import current_platform

    # After the run, ask CUDA-like platforms to empty their cache.
    if current_platform.is_cuda_alike():
        current_platform.empty_cache()
    return result
# ---------------------------------------------------------------------------
# Validation helpers
# ---------------------------------------------------------------------------
def validate_expert_ids(
    experts_list: list[np.ndarray],
    num_experts: int,
) -> None:
    """Assert that every expert ID lies in the half-open range [0, num_experts).

    Args:
        experts_list: One array of expert IDs per prompt.
        num_experts: Exclusive upper bound for a valid expert ID.

    Raises:
        AssertionError: On the first prompt holding a negative ID or an ID
            that is not below ``num_experts``.
    """
    for prompt_idx, ids in enumerate(experts_list):
        # Lower bound is checked first so its message takes precedence.
        no_negatives = np.all(ids >= 0)
        assert no_negatives, (
            f"Prompt {prompt_idx}: negative expert IDs found, min={ids.min()}"
        )
        under_limit = np.all(ids < num_experts)
        assert under_limit, (
            f"Prompt {prompt_idx}: expert ID out of range [0, {num_experts}), "
            f"max={ids.max()}"
        )
def validate_shapes(experts_list: list[np.ndarray]) -> None:
    """Assert each routed-experts array is at least 2-D and log its shape.

    Raises:
        AssertionError: On the first array with fewer than two dimensions.
    """
    for prompt_idx, routed in enumerate(experts_list):
        shape = routed.shape
        assert routed.ndim >= 2, (
            f"Prompt {prompt_idx}: expected at least 2D array, got shape {shape}"
        )
        logger.info("Prompt %d: routed_experts shape = %s", prompt_idx, shape)
# ---------------------------------------------------------------------------
# Comparison helpers
# ---------------------------------------------------------------------------
def compare_token_ids(
    baseline: list[list[int]],
    reference: list[list[int]],
) -> float:
    """Compare generated token IDs of two runs and return the mismatch ratio.

    For each prompt only the common leading run of equal tokens counts as
    matching; everything after the first difference, up to the longer of the
    two sequences, counts as mismatched.

    Args:
        baseline: Token IDs per prompt from the first run.
        reference: Token IDs per prompt from the second run.

    Returns:
        Mismatched tokens divided by total tokens, where each prompt
        contributes the length of its longer sequence to the total.

    Raises:
        AssertionError: If the two runs cover a different number of prompts.
        ValueError: If there are no tokens at all to compare.
    """
    assert len(baseline) == len(reference), (
        f"Length mismatch: {len(baseline)} vs {len(reference)}"
    )

    mismatched_total = 0
    token_total = 0
    for prompt_idx, (first, second) in enumerate(zip(baseline, reference)):
        shorter = min(len(first), len(second))
        longer = max(len(first), len(second))
        # Position of the first differing token, or the shorter length when
        # the common prefix agrees entirely.
        agreed = next(
            (pos for pos, (x, y) in enumerate(zip(first, second)) if x != y),
            shorter,
        )
        missed = longer - agreed
        mismatched_total += missed
        token_total += longer
        # Non-zero exactly when the prefix diverged or the lengths differ.
        if missed:
            print(
                f" Prompt {prompt_idx}: token_ids len={len(first)} vs {len(second)}, "
                f"mismatches={missed}/{longer}"
            )

    if token_total == 0:
        raise ValueError("No tokens to compare")

    mismatch_ratio = mismatched_total / token_total
    print(
        f"Token ID mismatches: {mismatched_total}/{token_total} ({mismatch_ratio:.4%})"
    )
    return mismatch_ratio
def _same_expert_selection(a, b) -> bool:
    """Return True when two per-token entries select the same experts.

    The comparison ignores ordering along the last axis only.
    NOTE(review): assumes the last axis enumerates the experts chosen for one
    token/layer, so their order is not significant - confirm against the
    producer of ``routed_experts``.
    """
    # atleast_1d lets scalar entries (from 1-D inputs) go through np.sort.
    first = np.atleast_1d(a)
    second = np.atleast_1d(b)
    if first.shape != second.shape:
        return False
    return bool(np.array_equal(np.sort(first, axis=-1), np.sort(second, axis=-1)))


def compare_routed_experts(
    baseline: list[np.ndarray],
    reference: list[np.ndarray],
    threshold: float = 0.05,
) -> float:
    """Compare routed experts of two runs and return the mismatch ratio.

    For each prompt, entries along the first axis are compared in order. Only
    the leading run of equal entries counts as matching; everything after the
    first difference, up to the longer of the two arrays, counts as
    mismatched. Entries are compared element-wise rather than by their sum,
    so different selections that happen to add up to the same value are
    reported. A prompt where only one run produced entries is counted as
    fully mismatched instead of being skipped.

    Args:
        baseline: One routed-experts array per prompt from the first run.
        reference: One routed-experts array per prompt from the second run.
        threshold: The mismatch ratio must be strictly below this value.

    Returns:
        Mismatched entries divided by total entries, where each prompt
        contributes the length of its longer array to the total.

    Raises:
        AssertionError: If the runs cover a different number of prompts, or
            the mismatch ratio is not below ``threshold``.
        ValueError: If there are no entries at all to compare.
    """
    # Raised explicitly so the checks still run under ``python -O``.
    if len(baseline) != len(reference):
        raise AssertionError(f"Length mismatch: {len(baseline)} vs {len(reference)}")

    total_elements = 0
    total_mismatches = 0
    for i, (base, ref) in enumerate(zip(baseline, reference)):
        min_len = min(len(base), len(ref))
        max_len = max(len(base), len(ref))

        matches = 0
        for a, b in zip(base[:min_len], ref[:min_len]):
            if not _same_expert_selection(a, b):
                break
            matches += 1

        total_mismatches += max_len - matches
        total_elements += max_len
        if matches < min_len or len(base) != len(ref):
            print(
                f" Prompt {i}: routed_experts len={len(base)} vs {len(ref)}, "
                f"mismatches={max_len - matches}/{max_len}"
            )

    if total_elements == 0:
        raise ValueError("No elements to compare")

    mismatch_ratio = total_mismatches / total_elements
    print(
        f"Routed experts mismatches: {total_mismatches}/{total_elements} "
        f"({mismatch_ratio:.4%})"
    )
    if mismatch_ratio >= threshold:
        raise AssertionError(
            f"Too many mismatches: {total_mismatches}/{total_elements} "
            f"({mismatch_ratio:.4%}) exceeds threshold {threshold:.4%}"
        )
    return mismatch_ratio
# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------
def main():
    """CLI entry point: run inference, validate it, optionally re-run and compare.

    The baseline run is always validated for array shape and expert ID range.
    With ``--deterministic`` a second run is made and its token IDs and routed
    experts are compared against the baseline.
    """
    # Only applied when the variable is not already set in the environment.
    os.environ.setdefault("VLLM_BATCH_INVARIANT", "1")

    cli = argparse.ArgumentParser(
        description="Test routed experts capture for MoE models"
    )
    plain_options = (
        ("--model", str, DEFAULT_MODEL),
        ("--tp", int, 1),
        ("--max-model-len", int, 4096),
        ("--num-prompts", int, 20),
        ("--max-new-tokens", int, 50),
    )
    for flag, value_type, default in plain_options:
        cli.add_argument(flag, type=value_type, default=default)
    cli.add_argument(
        "--deterministic",
        action="store_true",
        help="Run twice and compare results for determinism check",
    )
    cli.add_argument(
        "--threshold",
        type=float,
        default=0.05,
        help="Maximum allowed mismatch ratio for determinism check",
    )
    args = cli.parse_args()
    logging.basicConfig(level=logging.INFO)

    prompts = TEST_PROMPTS[: args.num_prompts]
    summary = (
        f"Model: {args.model}",
        f"TP: {args.tp}",
        f"Prompts: {len(prompts)}",
        f"Max new tokens: {args.max_new_tokens}",
    )
    for line in summary:
        print(line)
    print()

    def _infer():
        """Run one inference pass with the parsed CLI settings."""
        return run_inference(
            model=args.model,
            prompts=prompts,
            max_new_tokens=args.max_new_tokens,
            tp=args.tp,
            max_model_len=args.max_model_len,
        )

    print("=== Run 1 (baseline) ===")
    baseline = _infer()
    print(f"num_experts (from model config): {baseline.num_experts}")

    print("\n=== Validation ===")
    validate_shapes(baseline.experts_list)
    validate_expert_ids(baseline.experts_list, num_experts=baseline.num_experts)
    print(f"All {len(baseline.experts_list)} results passed validation.")
    for idx, routed in enumerate(baseline.experts_list):
        print(
            f" Prompt {idx}: shape={routed.shape}, "
            f"min={routed.min()}, max={routed.max()}"
        )

    if args.deterministic:
        print("\n=== Run 2 (reference) ===")
        reference = _infer()

        print("\n=== Determinism Check ===")
        # The reference run is range-checked against the baseline's expert count.
        validate_expert_ids(reference.experts_list, num_experts=baseline.num_experts)

        print("\n--- Token IDs ---")
        token_mismatch = compare_token_ids(
            baseline.token_ids_list, reference.token_ids_list
        )

        print("\n--- Routed Experts ---")
        expert_mismatch = compare_routed_experts(
            baseline.experts_list,
            reference.experts_list,
            threshold=args.threshold,
        )
        print(
            f"\nDeterminism check passed. "
            f"Token mismatch: {token_mismatch:.4%}, "
            f"Expert mismatch: {expert_mismatch:.4%}"
        )

    print("\nAll tests passed!")
# Run the CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
from vllm import LLM, EngineArgs
from vllm.config import ProfilerConfig
from vllm.utils.argparse_utils import FlexibleArgumentParser
# Number of tokens main() generates per prompt; it is used as both the minimum
# and the maximum, and sizes the profiling window in "decode" mode.
DEFAULT_MAX_TOKENS = 16
def create_parser() -> FlexibleArgumentParser:
    """Build the CLI parser: engine arguments plus batch and profiling options.

    Returns:
        A parser carrying every ``EngineArgs`` option, with the model
        defaulting to ``meta-llama/Llama-3.2-1B-Instruct``, plus the
        ``--batch-size``, ``--prompt-size``, ``--prompt-prefix``,
        ``--profile`` and ``--profile-dir`` options.
    """
    parser = FlexibleArgumentParser()
    EngineArgs.add_cli_args(parser)
    parser.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")

    batch_options = (
        ("--batch-size", int, 1),
        ("--prompt-size", int, 128),
        ("--prompt-prefix", str, "Hello, my name is"),
    )
    batching = parser.add_argument_group("Batch parameters")
    for flag, value_type, default in batch_options:
        batching.add_argument(flag, type=value_type, default=default)

    profiling = parser.add_argument_group("Profiling parameters")
    profiling.add_argument(
        "--profile",
        choices=["none", "prefill", "decode", "both"],
        default="none",
    )
    profiling.add_argument(
        "--profile-dir",
        type=str,
        default="",
        help="Required when --profile is not 'none'.",
    )
    return parser
def _build_prompt(prefix: str, prompt_size: int) -> str:
if prompt_size <= 0:
return ""
if not prefix:
prefix = " "
if len(prefix) >= prompt_size:
return prefix[:prompt_size]
repeat_count = (prompt_size + len(prefix) - 1) // len(prefix)
return (prefix * repeat_count)[:prompt_size]
def _build_profiler_config(
profile: str, profile_dir: str, max_tokens: int
) -> ProfilerConfig | None:
if profile == "none":
return None
if not profile_dir:
raise ValueError("--profile-dir must be set when profiling is enabled.")
if profile == "prefill":
delay_iterations = 0
max_iterations = 1
elif profile == "decode":
delay_iterations = 1
max_iterations = max(1, max_tokens)
else:
delay_iterations = 0
max_iterations = 0
return ProfilerConfig(
profiler="torch",
torch_profiler_dir=profile_dir,
delay_iterations=delay_iterations,
max_iterations=max_iterations,
)
def main(args: dict) -> None:
    """Generate a fixed-size batch with ``LLM`` and optionally profile it.

    Args:
        args: Parsed CLI options as a dict. The script-specific keys
            (``batch_size``, ``prompt_size``, ``prompt_prefix``, ``profile``,
            ``profile_dir``) are consumed here; every remaining entry is
            forwarded to ``LLM``. The dict passed in is not modified.

    Raises:
        KeyError: If one of the script-specific keys is missing.
        ValueError: If profiling is requested without a profile directory.
    """
    # Work on a copy so popping the script-only keys does not alter the
    # caller's dict.
    engine_kwargs = dict(args)
    max_tokens = DEFAULT_MAX_TOKENS
    batch_size = engine_kwargs.pop("batch_size")
    prompt_size = engine_kwargs.pop("prompt_size")
    prompt_prefix = engine_kwargs.pop("prompt_prefix")
    profile = engine_kwargs.pop("profile")
    profile_dir = engine_kwargs.pop("profile_dir")

    profiler_config = _build_profiler_config(profile, profile_dir, max_tokens)
    if profiler_config is not None:
        engine_kwargs["profiler_config"] = profiler_config

    llm = LLM(**engine_kwargs)

    # Force exactly max_tokens generated tokens per prompt.
    sampling_params = llm.get_default_sampling_params()
    sampling_params.max_tokens = max_tokens
    sampling_params.min_tokens = max_tokens
    sampling_params.ignore_eos = True

    prompt = _build_prompt(prompt_prefix, prompt_size)
    prompts = [prompt] * batch_size

    profiling = profile != "none"
    if profiling:
        llm.start_profile()
    try:
        outputs = llm.generate(prompts, sampling_params)
    finally:
        # Stop the profiler even when generation fails, so a started
        # profiling session is always closed.
        if profiling:
            llm.stop_profile()

    print("-" * 50)
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"Prompt: {output.prompt!r}\nGenerated text: {generated_text!r}")
        print("-" * 50)
# Script entry point: parse the command line and hand the parsed options to
# main() as a plain dict.
if __name__ == "__main__":
    parser = create_parser()
    main(vars(parser.parse_args()))
......@@ -5,14 +5,9 @@ from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.benchmarks.datasets import add_dataset_parser, get_samples
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.metrics.reader import Counter, Vector
try:
from vllm.utils.argparse_utils import FlexibleArgumentParser
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
QUESTION = "What is the content of each image?"
IMAGE_URLS = [
"https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg",
......
......@@ -34,7 +34,7 @@ deployment methods:
Both platforms provide equivalent monitoring capabilities:
| Dashboard | Description |
|-----------|-------------|
| --------- | ----------- |
| **Performance Statistics** | Tracks latency, throughput, and performance metrics |
| **Query Statistics** | Monitors request volume, query performance, and KPIs |
......
......@@ -349,7 +349,7 @@
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calcultion": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
{ "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test pause/resume with Data Parallel (DP) via HTTP API.
This example demonstrates coordinated pause/resume across multiple DP ranks.
The pause synchronizes across all DP engines via all-reduce.
Prerequisites:
Start a vLLM server with data parallelism:
$ VLLM_SERVER_DEV_MODE=1 vllm serve facebook/opt-125m \
--enforce-eager \
--data-parallel-size 4 \
--tensor-parallel-size 1
Then run this script:
$ python data_parallel_pause_resume.py
The test verifies pause works by:
1. Starting a streaming generation request
2. Pausing the server mid-generation
3. Sleeping for PAUSE_DURATION seconds
4. Resuming the server
5. Verifying that the gap in token generation at the pause point is at least
90% of PAUSE_DURATION
"""
import argparse
import threading
import time
import requests
from openai import OpenAI
# Default server address; override with --base-url.
BASE_URL = "http://localhost:8000"
# Default model name sent with the completion request; override with --model.
MODEL_NAME = "facebook/opt-125m"
# Seconds the controller sleeps between pausing and resuming the server.
PAUSE_DURATION = 3.0
def pause_generation(base_url: str, mode: str = "keep") -> None:
    """POST to ``{base_url}/pause`` with the given mode.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    reply = requests.post(f"{base_url}/pause", params={"mode": mode}, timeout=60)
    reply.raise_for_status()
    print("Server paused")
def resume_generation(base_url: str) -> None:
    """POST to ``{base_url}/resume``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    reply = requests.post(f"{base_url}/resume", timeout=60)
    reply.raise_for_status()
    print("Server resumed")
def main():
    """Stream a completion, pause and resume the server mid-stream, check the gap.

    One thread streams tokens and timestamps each of them. A second thread
    waits until five tokens have arrived, pauses the server, sleeps for
    ``PAUSE_DURATION`` seconds and resumes it. The test passes when the gap
    between the last token seen before the pause and the next one is at least
    90% of ``PAUSE_DURATION``. The outcome is only printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-url", default=BASE_URL)
    parser.add_argument("--model", default=MODEL_NAME)
    args = parser.parse_args()

    client = OpenAI(
        base_url=f"{args.base_url}/v1",
        api_key="EMPTY",
    )
    prompt = "Write a long story about a dragon. Once upon a time"

    token_times: list[float] = []
    # Stays 0 until the controller has actually paused the server.
    pause_token_idx = 0
    pause_triggered = threading.Event()
    generation_done = threading.Event()

    def generator_thread():
        """Stream tokens and record timestamps."""
        try:
            stream = client.completions.create(
                model=args.model,
                prompt=prompt,
                max_tokens=50,
                stream=True,
            )
            for chunk in stream:
                if chunk.choices[0].text:
                    token_times.append(time.monotonic())
                    token_count = len(token_times)
                    print(f"Token {token_count}: {chunk.choices[0].text!r}")
                    # Signal controller after some tokens
                    if token_count >= 5 and not pause_triggered.is_set():
                        pause_triggered.set()
        finally:
            # Always release the controller. Without this it would wait
            # forever when the request fails or ends with fewer than five
            # tokens, and main() would hang in join().
            generation_done.set()
            pause_triggered.set()

    def controller_thread():
        """Pause and resume the server."""
        nonlocal pause_token_idx
        # Wait for some tokens
        pause_triggered.wait()
        if generation_done.is_set():
            print("\nGeneration ended before the pause could be issued; skipping pause.")
            return
        print(f"\nPausing server (keep mode) at token {len(token_times)}...")
        pause_generation(args.base_url, mode="keep")
        pause_token_idx = len(token_times)
        print(f"Sleeping for {PAUSE_DURATION}s...")
        time.sleep(PAUSE_DURATION)
        print("Resuming server...")
        resume_generation(args.base_url)
        print("Resumed!\n")

    # Run both threads
    gen_thread = threading.Thread(target=generator_thread)
    ctrl_thread = threading.Thread(target=controller_thread)
    gen_thread.start()
    ctrl_thread.start()
    gen_thread.join()
    ctrl_thread.join()

    # Check gap at the pause point
    if pause_token_idx == 0:
        # No pause took place, so there is no gap to measure; indexing with
        # pause_token_idx - 1 would wrap around to the last token.
        print("Test failed! The server was not paused during generation.")
    elif pause_token_idx < len(token_times):
        pause_gap = token_times[pause_token_idx] - token_times[pause_token_idx - 1]
        print(
            f"\nGap after pause (token {pause_token_idx} -> "
            f"{pause_token_idx + 1}): {pause_gap:.3f}s"
        )
        # Allow the measured gap to fall up to 10% short of the sleep time.
        if pause_gap >= PAUSE_DURATION * 0.9:
            print("Test passed! Pause synchronized across DP ranks.")
        else:
            print(f"Test failed! Expected ~{PAUSE_DURATION}s gap, got {pause_gap:.3f}s")
    else:
        print("Test failed! No tokens were generated after resuming.")
# Run the pause/resume check only when this file is executed as a script.
if __name__ == "__main__":
    main()
......@@ -95,7 +95,7 @@ If you enable prefill instance (`--prefill-servers-urls` not disabled), you will
## Proxy Instance Flags (`disagg_epd_proxy.py`)
| Flag | Description |
|------|-------------|
| ---- | ----------- |
| `--encode-servers-urls` | Comma-separated list of encoder endpoints. Every multimodal item extracted from the request is fanned out to one of these URLs in a round-robin fashion. |
| `--prefill-servers-urls` | Comma-separated list of prefill endpoints. Set to `disable`, `none`, or `""` to skip the dedicated prefill phase and run E+PD (encoder + combined prefill/decode). |
| `--decode-servers-urls` | Comma-separated list of decode endpoints. Non-stream and stream paths both round-robin over this list. |
......
......@@ -8,7 +8,7 @@ declare -a PIDS=()
###############################################################################
MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}"
LOG_PATH="${LOG_PATH:-./logs}"
mkdir -p $LOG_PATH
mkdir -p "$LOG_PATH"
ENCODE_PORT="${ENCODE_PORT:-19534}"
PREFILL_PORT="${PREFILL_PORT:-19535}"
......@@ -84,10 +84,10 @@ trap cleanup TERM
# clear previous cache
echo "remove previous ec cache folder"
rm -rf $EC_SHARED_STORAGE_PATH
rm -rf "$EC_SHARED_STORAGE_PATH"
echo "make ec cache folder"
mkdir -p $EC_SHARED_STORAGE_PATH
mkdir -p "$EC_SHARED_STORAGE_PATH"
###############################################################################
# Encoder worker
......@@ -100,7 +100,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
--no-enable-prefix-caching \
--max-num-batched-tokens 114688 \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_producer",
......@@ -124,7 +124,7 @@ vllm serve "$MODEL" \
--enforce-eager \
--enable-request-id-headers \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_consumer",
......@@ -152,7 +152,7 @@ vllm serve "$MODEL" \
--enforce-eager \
--enable-request-id-headers \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--kv-transfer-config '{
"kv_connector": "NixlConnector",
"kv_role": "kv_consumer"
......@@ -162,9 +162,9 @@ vllm serve "$MODEL" \
PIDS+=($!)
# Wait for workers
wait_for_server $ENCODE_PORT
wait_for_server $PREFILL_PORT
wait_for_server $DECODE_PORT
wait_for_server "$ENCODE_PORT"
wait_for_server "$PREFILL_PORT"
wait_for_server "$DECODE_PORT"
###############################################################################
# Proxy
......@@ -179,7 +179,7 @@ python disagg_epd_proxy.py \
PIDS+=($!)
wait_for_server $PROXY_PORT
wait_for_server "$PROXY_PORT"
echo "All services are up!"
###############################################################################
......@@ -187,14 +187,14 @@ echo "All services are up!"
###############################################################################
echo "Running benchmark (stream)..."
vllm bench serve \
--model $MODEL \
--model "$MODEL" \
--backend openai-chat \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--seed 0 \
--num-prompts $NUM_PROMPTS \
--port $PROXY_PORT
--num-prompts "$NUM_PROMPTS" \
--port "$PROXY_PORT"
PIDS+=($!)
......@@ -202,10 +202,10 @@ PIDS+=($!)
# Single request with local image
###############################################################################
echo "Running single request with local image (non-stream)..."
curl http://127.0.0.1:${PROXY_PORT}/v1/chat/completions \
curl http://127.0.0.1:"${PROXY_PORT}"/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'${MODEL}'",
"model": "'"${MODEL}"'",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
......
......@@ -8,7 +8,7 @@ declare -a PIDS=()
###############################################################################
MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}"
LOG_PATH="${LOG_PATH:-./logs}"
mkdir -p $LOG_PATH
mkdir -p "$LOG_PATH"
ENCODE_PORT="${ENCODE_PORT:-19534}"
PREFILL_DECODE_PORT="${PREFILL_DECODE_PORT:-19535}"
......@@ -78,10 +78,10 @@ trap cleanup TERM
# clear previous cache
echo "remove previous ec cache folder"
rm -rf $EC_SHARED_STORAGE_PATH
rm -rf "$EC_SHARED_STORAGE_PATH"
echo "make ec cache folder"
mkdir -p $EC_SHARED_STORAGE_PATH
mkdir -p "$EC_SHARED_STORAGE_PATH"
###############################################################################
# Encoder worker
......@@ -94,7 +94,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
--no-enable-prefix-caching \
--max-num-batched-tokens 114688 \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_producer",
......@@ -115,7 +115,7 @@ CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
--enforce-eager \
--enable-request-id-headers \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_consumer",
......@@ -128,8 +128,8 @@ CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
PIDS+=($!)
# Wait for workers
wait_for_server $ENCODE_PORT
wait_for_server $PREFILL_DECODE_PORT
wait_for_server "$ENCODE_PORT"
wait_for_server "$PREFILL_DECODE_PORT"
###############################################################################
# Proxy
......@@ -144,7 +144,7 @@ python disagg_epd_proxy.py \
PIDS+=($!)
wait_for_server $PROXY_PORT
wait_for_server "$PROXY_PORT"
echo "All services are up!"
###############################################################################
......@@ -152,14 +152,14 @@ echo "All services are up!"
###############################################################################
echo "Running benchmark (stream)..."
vllm bench serve \
--model $MODEL \
--model "$MODEL" \
--backend openai-chat \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--seed 0 \
--num-prompts $NUM_PROMPTS \
--port $PROXY_PORT
--num-prompts "$NUM_PROMPTS" \
--port "$PROXY_PORT"
PIDS+=($!)
......@@ -167,10 +167,10 @@ PIDS+=($!)
# Single request with local image
###############################################################################
echo "Running single request with local image (non-stream)..."
curl http://127.0.0.1:${PROXY_PORT}/v1/chat/completions \
curl http://127.0.0.1:"${PROXY_PORT}"/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'${MODEL}'",
"model": "'"${MODEL}"'",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
......
......@@ -54,7 +54,7 @@ wait_for_server() {
# You can also adjust --kv-ip and --kv-port for distributed inference.
# prefilling instance, which is the KV producer
CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
CUDA_VISIBLE_DEVICES=0 vllm serve "$MODEL_NAME" \
--host 0.0.0.0 \
--port 8100 \
--max-model-len 100 \
......@@ -64,7 +64,7 @@ CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":"1e9","kv_port":"14579","kv_connector_extra_config":{"proxy_ip":"'"$VLLM_HOST_IP"'","proxy_port":"30001","http_ip":"'"$VLLM_HOST_IP"'","http_port":"8100","send_type":"PUT_ASYNC"}}' &
# decoding instance, which is the KV consumer
CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
CUDA_VISIBLE_DEVICES=1 vllm serve "$MODEL_NAME" \
--host 0.0.0.0 \
--port 8200 \
--max-model-len 100 \
......
......@@ -328,9 +328,9 @@ class Proxy:
if instance_type == "decode" and instance in self.decode_instances:
self.decode_instances.remove(instance)
self.decode_cycler = itertools.cycle(self.decode_instances)
if instance_type == "prefill" and instance in self.decode_instances:
if instance_type == "prefill" and instance in self.prefill_instances:
self.prefill_instances.remove(instance)
self.prefill_cycler = itertools.cycle(self.decode_instances)
self.prefill_cycler = itertools.cycle(self.prefill_instances)
class RoundRobinSchedulingPolicy(SchedulingPolicy):
......
......@@ -34,7 +34,7 @@ wait_for_server() {
done" && return 0 || return 1
}
vllm serve $MODEL_NAME \
vllm serve "$MODEL_NAME" \
--port 8100 \
--max-model-len 100 \
--enforce-eager \
......
......@@ -143,7 +143,7 @@ main() {
IFS=',' read -ra BOOTSTRAP_PORT_ARRAY <<< "$BOOTSTRAP_PORTS"
IFS=',' read -ra DECODE_PORT_ARRAY <<< "$DECODE_PORTS"
proxy_param=""
proxy_args=()
# =============================================================================
# Launch Prefill Servers (X Producers)
......@@ -156,12 +156,12 @@ main() {
local bootstrap_port=${BOOTSTRAP_PORT_ARRAY[$i]}
echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, Bootstrap Port $bootstrap_port"
VLLM_MOONCAKE_BOOTSTRAP_PORT=$bootstrap_port CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
--port $port \
VLLM_MOONCAKE_BOOTSTRAP_PORT=$bootstrap_port CUDA_VISIBLE_DEVICES=$gpu_id vllm serve "$MODEL" \
--port "$port" \
--kv-transfer-config \
"{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_producer\"}" > prefill$((i+1)).log 2>&1 &
PIDS+=($!)
proxy_param="${proxy_param} --prefill http://0.0.0.0:${port} $bootstrap_port"
proxy_args+=(--prefill "http://0.0.0.0:${port}" "$bootstrap_port")
done
# =============================================================================
......@@ -174,12 +174,12 @@ main() {
local port=${DECODE_PORT_ARRAY[$i]}
echo " Decode server $((i+1)): GPU $gpu_id, Port $port"
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
--port $port \
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve "$MODEL" \
--port "$port" \
--kv-transfer-config \
"{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_consumer\"}" > decode$((i+1)).log 2>&1 &
PIDS+=($!)
proxy_param="${proxy_param} --decode http://0.0.0.0:${port}"
proxy_args+=(--decode "http://0.0.0.0:${port}")
done
# =============================================================================
......@@ -187,7 +187,7 @@ main() {
# =============================================================================
echo ""
echo "Starting proxy server on port $PROXY_PORT..."
python3 mooncake_connector_proxy.py $proxy_param --port $PROXY_PORT > proxy.log 2>&1 &
python3 mooncake_connector_proxy.py "${proxy_args[@]}" --port "$PROXY_PORT" > proxy.log 2>&1 &
PIDS+=($!)
# =============================================================================
......@@ -196,9 +196,10 @@ main() {
echo ""
echo "Waiting for all servers to start..."
for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do
if ! wait_for_server $port; then
if ! wait_for_server "$port"; then
echo "Failed to start server on port $port"
cleanup
# shellcheck disable=SC2317
exit 1
fi
done
......@@ -209,8 +210,8 @@ main() {
# =============================================================================
# Run Benchmark
# =============================================================================
vllm bench serve --port $PROXY_PORT --seed $(date +%s) \
--backend vllm --model $MODEL \
vllm bench serve --port "$PROXY_PORT" --seed "$(date +%s)" \
--backend vllm --model "$MODEL" \
--dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
......
......@@ -14,6 +14,10 @@ import regex as re
import zmq
from quart import Quart, make_response, request
from vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_common import (
MoRIIOConstants,
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
prefill_instances: list[dict] = []
......@@ -213,6 +217,8 @@ async def handle_request():
dip, dport = extract_ip_port_fast(decode_instance_endpoint["request_address"])
transfer_id = f"{MoRIIOConstants.TRANSFER_PREFIX}-{str(uuid.uuid4())}"
req_data_to_prefill = copy.deepcopy(req_data)
req_data_to_prefill["kv_transfer_params"] = {}
req_data["kv_transfer_params"] = {}
......@@ -222,6 +228,7 @@ async def handle_request():
req_data_to_prefill["kv_transfer_params"]["remote_tp_size"] = (
decode_instance_endpoint["tp_size"]
)
req_data_to_prefill["kv_transfer_params"]["transfer_id"] = transfer_id
send_prefill_task = asyncio.create_task(
send_request_to_prefill(
......@@ -267,6 +274,7 @@ async def handle_request():
if selected_prefill_dp_rank is not None:
req_data["kv_transfer_params"]["remote_dp_rank"] = selected_prefill_dp_rank
req_data["kv_transfer_params"]["transfer_id"] = transfer_id
decode_request_task = asyncio.create_task(
start_decode_request(
......
......@@ -166,10 +166,10 @@ main() {
local kv_port=$((21001 + i))
echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve "$MODEL" \
--enforce-eager \
--host 0.0.0.0 \
--port $port \
--port "$port" \
--tensor-parallel-size 1 \
--seed 1024 \
--dtype float16 \
......@@ -194,10 +194,10 @@ main() {
local kv_port=$((22001 + i))
echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve "$MODEL" \
--enforce-eager \
--host 0.0.0.0 \
--port $port \
--port "$port" \
--tensor-parallel-size 1 \
--seed 1024 \
--dtype float16 \
......@@ -217,9 +217,10 @@ main() {
echo ""
echo "Waiting for all servers to start..."
for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do
if ! wait_for_server $port; then
if ! wait_for_server "$port"; then
echo "Failed to start server on port $port"
cleanup
# shellcheck disable=SC2317
exit 1
fi
done
......@@ -231,8 +232,8 @@ main() {
# Run Benchmark
# =============================================================================
cd ../../../benchmarks/
vllm bench serve --port 10001 --seed $(date +%s) \
--model $MODEL \
vllm bench serve --port 10001 --seed "$(date +%s)" \
--model "$MODEL" \
--dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
......
#!/bin/bash
# Abort on any command failure, on use of an unset variable, and on a failure
# anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail

# Settings. Each one may be overridden from the environment; an unset or empty
# variable falls back to the default given here.
: "${MODEL:=Qwen/Qwen2.5-VL-3B-Instruct}"        # model passed to `vllm serve` and `vllm bench serve`
: "${PORT:=8000}"                                # port the server listens on
: "${GPU:=0}"                                    # value exported as CUDA_VISIBLE_DEVICES for the server
: "${NUM_PROMPTS:=200}"                          # number of prompts sent by the benchmark
: "${EC_SHARED_STORAGE_PATH:=/tmp/ec_cache}"     # directory handed to the EC connector; wiped at startup
: "${TIMEOUT:=600}"                              # seconds to wait for the server to become ready

# PID of the background server; stays empty until the server has been launched.
SERVER_PID=""
# Stop the background server if one was started and is still running.
#
# Reads the global SERVER_PID. Errors from `kill` and `wait` are deliberately
# ignored so that cleanup itself can never abort the script under `set -e`.
cleanup() {
    # Disarm the traps first so cleanup runs exactly once, even when a signal
    # handler triggers the EXIT trap.
    trap - EXIT INT TERM
    echo "Stopping server..."
    if [[ -n "${SERVER_PID:-}" ]] && kill -0 "$SERVER_PID" 2>/dev/null; then
        kill "$SERVER_PID" 2>/dev/null || true
        # Reap the child so it does not outlive the script as a zombie.
        wait "$SERVER_PID" 2>/dev/null || true
    fi
    echo "Done."
}

# Run cleanup on every exit path. The signal handlers only call `exit` (with
# the conventional 128+signal status); the EXIT trap then performs the actual
# cleanup. Trapping INT/TERM with `cleanup` directly would run the handler and
# then resume the script where it was interrupted.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
# Poll the server's /v1/models endpoint until it answers successfully.
#
# Reads the globals PORT, TIMEOUT (seconds) and SERVER_PID.
# Returns 0 once the endpoint responds, and 1 if TIMEOUT elapses first or if
# the server process recorded in SERVER_PID is no longer running.
wait_for_server() {
    local deadline=$((SECONDS + TIMEOUT))
    echo "Waiting for server on port $PORT..."
    while (( SECONDS < deadline )); do
        # -s silences progress output; -f makes HTTP error statuses count as failure.
        if curl -sf "http://localhost:${PORT}/v1/models" > /dev/null 2>&1; then
            echo "Server ready."
            return 0
        fi
        # Fail fast: if the server process has already exited there is no
        # point in polling until the timeout expires.
        if [[ -n "${SERVER_PID:-}" ]] && ! kill -0 "$SERVER_PID" 2>/dev/null; then
            echo "ERROR: Server process $SERVER_PID exited before becoming ready"
            return 1
        fi
        sleep 2
    done
    echo "ERROR: Server did not start within ${TIMEOUT}s"
    return 1
}
# Remove and recreate the shared storage directory so the server starts with
# an empty one.
rm -rf "$EC_SHARED_STORAGE_PATH"
mkdir -p "$EC_SHARED_STORAGE_PATH"
###############################################################################
# Start server with ec_both
###############################################################################
# The --ec-transfer-config value is a single-quoted JSON literal. The quote is
# closed and reopened around "$EC_SHARED_STORAGE_PATH" so the shell expands the
# path into the JSON string. NOTE: the path is inserted verbatim (no JSON
# escaping), so it must not contain double quotes or backslashes.
# Any extra arguments passed to this script ("$@") are forwarded to
# `vllm serve`. The server is started in the background and its PID is stored
# in SERVER_PID.
CUDA_VISIBLE_DEVICES="$GPU" \
vllm serve "$MODEL" \
--port "$PORT" \
--enforce-eager \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_both",
"ec_connector_extra_config": {
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
}
}' \
"$@" &
SERVER_PID=$!
# Block until the server answers; a non-zero return here aborts the script
# because `set -e` is in effect.
wait_for_server
###############################################################################
# Benchmark -- dataset contains duplicate images, exercises cache hits
###############################################################################
# Collect the benchmark arguments in an array so each option sits on its own
# line without backslash continuations.
bench_args=(
    --model "$MODEL"
    --backend openai-chat
    --endpoint /v1/chat/completions
    --dataset-name hf
    --dataset-path lmarena-ai/VisionArena-Chat
    --seed 0
    --num-prompts "$NUM_PROMPTS"
    --port "$PORT"
)

echo "Running benchmark ($NUM_PROMPTS prompts)..."
vllm bench serve "${bench_args[@]}"
echo "Benchmark complete."
......@@ -50,8 +50,8 @@ while [[ $# -gt 0 ]]; do
done
vllm bench serve \
--model $MODEL_NAME \
--host $HOST \
--port $PORT \
--num-prompts $NUM_PROMPTS \
--request-rate $REQUEST_RATE
--model "$MODEL_NAME" \
--host "$HOST" \
--port "$PORT" \
--num-prompts "$NUM_PROMPTS" \
--request-rate "$REQUEST_RATE"
......@@ -57,15 +57,15 @@ echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALL
export RAY_DEDUP_LOGS=0
export VLLM_USE_DEEP_GEMM=1
vllm serve $MODEL_NAME \
--data-parallel-size $DATA_PARALLEL_SIZE \
--data-parallel-size-local $DATA_PARALLEL_SIZE \
vllm serve "$MODEL_NAME" \
--data-parallel-size "$DATA_PARALLEL_SIZE" \
--data-parallel-size-local "$DATA_PARALLEL_SIZE" \
--data-parallel-backend ray \
--enforce-eager \
--enable-expert-parallel \
--enable-eplb \
--all2all-backend pplx \
--num-redundant-experts $REDUNDANT_EXPERTS \
--all2all-backend allgather_reducescatter \
--num-redundant-experts "$REDUNDANT_EXPERTS" \
--trust-remote-code \
--host $HOST \
--port $PORT
--host "$HOST" \
--port "$PORT"
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment