Unverified Commit eb0bf24e authored by Kris Hung's avatar Kris Hung Committed by GitHub
Browse files

feat: Add NVTX markers for vLLM EPD (#6627)

parent e15685d8
...@@ -20,6 +20,7 @@ from pydantic import BaseModel ...@@ -20,6 +20,7 @@ from pydantic import BaseModel
from safetensors import torch as safetensors_torch from safetensors import torch as safetensors_torch
import dynamo.nixl_connect as nixl_connect import dynamo.nixl_connect as nixl_connect
from dynamo.common.utils import nvtx_utils as _nvtx
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -141,6 +142,7 @@ class LocalEmbeddingSender(AbstractEmbeddingSender): ...@@ -141,6 +142,7 @@ class LocalEmbeddingSender(AbstractEmbeddingSender):
) )
return tensor_path return tensor_path
@_nvtx.annotate("mm:local:send_embeddings", color="magenta")
async def send_embeddings( async def send_embeddings(
self, embeddings: torch.Tensor, stage_embeddings: bool = False self, embeddings: torch.Tensor, stage_embeddings: bool = False
) -> tuple[TransferRequest, asyncio.Future]: ) -> tuple[TransferRequest, asyncio.Future]:
...@@ -185,6 +187,7 @@ class LocalEmbeddingReceiver(AbstractEmbeddingReceiver): ...@@ -185,6 +187,7 @@ class LocalEmbeddingReceiver(AbstractEmbeddingReceiver):
self.received_tensors = {} self.received_tensors = {}
self.tensor_id_counter = 0 self.tensor_id_counter = 0
@_nvtx.annotate("mm:local:receive_embeddings", color="magenta")
async def receive_embeddings( async def receive_embeddings(
self, request: TransferRequest self, request: TransferRequest
) -> tuple[int, torch.Tensor]: ) -> tuple[int, torch.Tensor]:
...@@ -803,6 +806,7 @@ class NixlReadEmbeddingSender(AbstractEmbeddingSender): ...@@ -803,6 +806,7 @@ class NixlReadEmbeddingSender(AbstractEmbeddingSender):
def __init__(self): def __init__(self):
self.connector = PersistentConnector() self.connector = PersistentConnector()
@_nvtx.annotate("mm:nixl:send_embeddings", color="magenta")
async def send_embeddings( async def send_embeddings(
self, embeddings: torch.Tensor, stage_embeddings: bool = False self, embeddings: torch.Tensor, stage_embeddings: bool = False
) -> tuple[TransferRequest, asyncio.Future]: ) -> tuple[TransferRequest, asyncio.Future]:
...@@ -821,9 +825,10 @@ class NixlReadEmbeddingSender(AbstractEmbeddingSender): ...@@ -821,9 +825,10 @@ class NixlReadEmbeddingSender(AbstractEmbeddingSender):
transfer_buf = embeddings transfer_buf = embeddings
else: else:
transfer_buf = embeddings.clone().detach() transfer_buf = embeddings.clone().detach()
with _nvtx.annotate("mm:nixl:create_descriptor", color="pink"):
descriptor = nixl_connect.Descriptor(transfer_buf) descriptor = nixl_connect.Descriptor(transfer_buf)
with _nvtx.annotate("mm:nixl:create_readable", color="pink"):
readable_op = await self.connector.create_readable(descriptor) readable_op = await self.connector.create_readable(descriptor)
request = TransferRequest( request = TransferRequest(
embeddings_shape=list(embeddings.shape), embeddings_shape=list(embeddings.shape),
embedding_dtype_str=torch_dtype_to_string(embeddings.dtype), embedding_dtype_str=torch_dtype_to_string(embeddings.dtype),
...@@ -877,6 +882,7 @@ class NixlReadEmbeddingReceiver(AbstractEmbeddingReceiver): ...@@ -877,6 +882,7 @@ class NixlReadEmbeddingReceiver(AbstractEmbeddingReceiver):
descriptor.register_with_connector(connection) descriptor.register_with_connector(connection)
self.warmedup_descriptors.put(descriptor) self.warmedup_descriptors.put(descriptor)
@_nvtx.annotate("mm:nixl:receive_embeddings", color="magenta")
async def receive_embeddings( async def receive_embeddings(
self, request: TransferRequest self, request: TransferRequest
) -> tuple[int, torch.Tensor]: ) -> tuple[int, torch.Tensor]:
...@@ -918,8 +924,10 @@ class NixlReadEmbeddingReceiver(AbstractEmbeddingReceiver): ...@@ -918,8 +924,10 @@ class NixlReadEmbeddingReceiver(AbstractEmbeddingReceiver):
) )
dynamic_descriptor = False dynamic_descriptor = False
with _nvtx.annotate("mm:nixl:begin_read", color="pink"):
# Create read operation to read from EncodeHandler # Create read operation to read from EncodeHandler
read_op = await self.connector.begin_read(readable_metadata, descriptor) read_op = await self.connector.begin_read(readable_metadata, descriptor)
with _nvtx.annotate("mm:nixl:wait_completion", color="pink"):
# Wait for the read operation to complete # Wait for the read operation to complete
await read_op.wait_for_completion() await read_op.wait_for_completion()
logging.debug( logging.debug(
......
...@@ -25,6 +25,7 @@ import httpx ...@@ -25,6 +25,7 @@ import httpx
from PIL import Image from PIL import Image
import dynamo.nixl_connect as nixl_connect import dynamo.nixl_connect as nixl_connect
from dynamo.common.utils import nvtx_utils as _nvtx
from dynamo.common.utils.media_nixl import read_decoded_media_via_nixl from dynamo.common.utils.media_nixl import read_decoded_media_via_nixl
from .http_client import get_http_client from .http_client import get_http_client
...@@ -46,6 +47,7 @@ class ImageLoader: ...@@ -46,6 +47,7 @@ class ImageLoader:
self._image_cache: dict[str, Image.Image] = {} self._image_cache: dict[str, Image.Image] = {}
self._cache_queue: asyncio.Queue[str] = asyncio.Queue(maxsize=cache_size) self._cache_queue: asyncio.Queue[str] = asyncio.Queue(maxsize=cache_size)
@_nvtx.annotate("mm:img:load_image", color="lime")
async def load_image(self, image_url: str) -> Image.Image: async def load_image(self, image_url: str) -> Image.Image:
parsed_url = urlparse(image_url) parsed_url = urlparse(image_url)
...@@ -58,6 +60,7 @@ class ImageLoader: ...@@ -58,6 +60,7 @@ class ImageLoader:
try: try:
if parsed_url.scheme == "data": if parsed_url.scheme == "data":
with _nvtx.annotate("mm:img:base64_decode", color="lime"):
# Parse data URL format: data:[<media type>][;base64],<data> # Parse data URL format: data:[<media type>][;base64],<data>
if not parsed_url.path.startswith("image/"): if not parsed_url.path.startswith("image/"):
raise ValueError("Data URL must be an image type") raise ValueError("Data URL must be an image type")
...@@ -73,6 +76,7 @@ class ImageLoader: ...@@ -73,6 +76,7 @@ class ImageLoader:
except binascii.Error as e: except binascii.Error as e:
raise ValueError(f"Invalid base64 encoding: {e}") raise ValueError(f"Invalid base64 encoding: {e}")
elif parsed_url.scheme in ("http", "https"): elif parsed_url.scheme in ("http", "https"):
with _nvtx.annotate("mm:img:http_fetch", color="lime"):
http_client = get_http_client(self._http_timeout) http_client = get_http_client(self._http_timeout)
response = await http_client.get(image_url) response = await http_client.get(image_url)
...@@ -85,6 +89,7 @@ class ImageLoader: ...@@ -85,6 +89,7 @@ class ImageLoader:
else: else:
raise ValueError(f"Invalid image source scheme: {parsed_url.scheme}") raise ValueError(f"Invalid image source scheme: {parsed_url.scheme}")
with _nvtx.annotate("mm:img:pil_open_convert", color="lime"):
# PIL is sync, so offload to a thread to avoid blocking the event loop # PIL is sync, so offload to a thread to avoid blocking the event loop
# Restrict to supported formats to prevent PSD parsing (GHSA-cfh3-3jmp-rvhc) # Restrict to supported formats to prevent PSD parsing (GHSA-cfh3-3jmp-rvhc)
image = await asyncio.to_thread( image = await asyncio.to_thread(
......
...@@ -9,6 +9,7 @@ Dynamo backends and components. ...@@ -9,6 +9,7 @@ Dynamo backends and components.
Submodules: Submodules:
- endpoint_types: Endpoint type parsing utilities - endpoint_types: Endpoint type parsing utilities
- nvtx_utils: NVTX profiling wrappers (enable with DYN_NVTX=1; no-ops by default)
- otel_tracing: OpenTelemetry tracing header utilities - otel_tracing: OpenTelemetry tracing header utilities
- paths: Workspace directory detection and path utilities - paths: Workspace directory detection and path utilities
- prometheus: Prometheus metrics collection and logging utilities - prometheus: Prometheus metrics collection and logging utilities
...@@ -18,6 +19,7 @@ from dynamo.common.utils import ( ...@@ -18,6 +19,7 @@ from dynamo.common.utils import (
endpoint_types, endpoint_types,
engine_response, engine_response,
namespace, namespace,
nvtx_utils,
otel_tracing, otel_tracing,
paths, paths,
prometheus, prometheus,
...@@ -28,6 +30,7 @@ __all__ = [ ...@@ -28,6 +30,7 @@ __all__ = [
"endpoint_types", "endpoint_types",
"engine_response", "engine_response",
"namespace", "namespace",
"nvtx_utils",
"otel_tracing", "otel_tracing",
"paths", "paths",
"prometheus", "prometheus",
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Lightweight NVTX wrappers for Dynamo profiling.

Set DYN_NVTX=1 to enable markers (``true``/``yes``/``on`` are accepted as
well). Markers are disabled by default, in which case every helper in this
module is a pure-Python no-op.

Usage:

    from dynamo.common.utils import nvtx_utils as _nvtx

    # Manual range (needed when the range spans async yields or has a
    # conditional end). Always pair with end_range, ideally in a finally.
    rng = _nvtx.start_range("my:range", color="blue")
    ...
    _nvtx.end_range(rng)

    # Decorator: annotates an entire function. Plain functions, coroutine
    # functions (``async def``) and async generator functions are supported;
    # for the async kinds the range covers the awaited execution / the full
    # iteration, not just the creation of the coroutine or generator object.
    @_nvtx.annotate("my:func", color="green")
    async def my_func(): ...

    @_nvtx.range_decorator("my:async_gen", color="green")
    async def my_async_gen():
        yield ...

    # Context manager: annotates a block.
    with _nvtx.annotate("my:block", color="cyan"):
        result = await some_coroutine()

When enabled, start_range uses a named nvtx domain ("dynamo") and
EventAttributes objects cached lazily by (message, color), so repeated
calls to start_range cost a single dict lookup before the NVTX call.

NOTE(review): the context-manager form delegates to ``nvtx.annotate``,
which is understood to use push/pop ranges (a per-thread stack). If several
asyncio tasks interleave on one thread while inside such blocks, the
resulting ranges may appear mis-nested in the profiler; prefer
start_range/end_range where that matters. Confirm against the nvtx docs.
"""
import functools
import inspect
import os
import warnings
from typing import Optional

# Spellings accepted for the DYN_NVTX toggle (compared case-insensitively).
_FALSE_VALUES = ("", "0", "false", "no", "off")
_TRUE_VALUES = ("1", "true", "yes", "on")


def _parse_flag(raw: Optional[str]) -> bool:
    """Interpret the raw value of a boolean environment variable.

    Args:
        raw: The variable's value, or None when it is unset.

    Returns:
        True for "1"/"true"/"yes"/"on" or any non-zero integer, False for an
        unset or empty value, "0"/"false"/"no"/"off", or anything that cannot
        be interpreted (a RuntimeWarning is emitted in that last case).
    """
    if raw is None:
        return False
    value = raw.strip().lower()
    if value in _FALSE_VALUES:
        return False
    if value in _TRUE_VALUES:
        return True
    try:
        # Keep accepting arbitrary integers ("2" enables, as it always did).
        return bool(int(value))
    except ValueError:
        # A mistyped profiling toggle must not crash the import of a worker.
        warnings.warn(
            f"Ignoring unrecognized DYN_NVTX value {raw!r}; "
            "NVTX markers stay disabled.",
            RuntimeWarning,
            stacklevel=2,
        )
        return False


# Read once at import time so there is no per-call environment lookup.
ENABLED: bool = _parse_flag(os.getenv("DYN_NVTX"))

if ENABLED:
    import nvtx as _nvtx_lib

    # Named domain + cached EventAttributes: no per-call attribute object
    # construction on the start_range hot path.
    _domain = _nvtx_lib.get_domain("dynamo")
    _attr_cache: dict = {}

    def _get_attr(message: str, color: str):
        """Return the cached EventAttributes for (message, color)."""
        try:
            return _attr_cache[message, color]
        except KeyError:
            attr = _domain.get_event_attributes(message=message, color=color)
            _attr_cache[message, color] = attr
            return attr

    def start_range(message: str, color: str = "white"):
        """Open a range in the "dynamo" domain and return its handle."""
        return _domain.start_range(_get_attr(message, color))

    def end_range(rng) -> None:
        """Close a range previously returned by start_range."""
        _domain.end_range(rng)

    def _ranged(func, message: str, color: str):
        """Wrap ``func`` so one start/end range spans its whole execution.

        Async generator functions are covered for the full iteration,
        coroutine functions until the awaited call finishes, and plain
        functions for the duration of the call. The range is always closed
        in a ``finally`` so exceptions and early closes do not leak it.
        """
        if inspect.isasyncgenfunction(func):

            @functools.wraps(func)
            async def wrapper(*args, **kwargs):
                rng = start_range(message, color)
                try:
                    async for item in func(*args, **kwargs):
                        yield item
                finally:
                    end_range(rng)

        elif inspect.iscoroutinefunction(func):

            @functools.wraps(func)
            async def wrapper(*args, **kwargs):
                rng = start_range(message, color)
                try:
                    return await func(*args, **kwargs)
                finally:
                    end_range(rng)

        else:

            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                rng = start_range(message, color)
                try:
                    return func(*args, **kwargs)
                finally:
                    end_range(rng)

        return wrapper

    class _Annotate:
        """Decorator / context manager that lands in the "dynamo" domain.

        As a context manager, and as a decorator on plain functions, this
        delegates to ``nvtx.annotate`` exactly as the previous
        ``functools.partial(nvtx.annotate, domain="dynamo")`` did. As a
        decorator on ``async def`` functions it uses a start/end range
        instead: a context-decorator style wrapper would close the range as
        soon as the coroutine (or async generator) object is created, i.e.
        before any of the function body has run.
        """

        __slots__ = ("_message", "_color", "_extra", "_active")

        def __init__(self, message=None, color=None, **extra):
            # Callers may still override the domain, as with the partial.
            extra.setdefault("domain", "dynamo")
            self._message = message
            self._color = color
            self._extra = extra
            self._active = None

        def _native(self):
            """Build the underlying nvtx.annotate object."""
            return _nvtx_lib.annotate(
                self._message, color=self._color, **self._extra
            )

        def __call__(self, func):
            is_async = inspect.iscoroutinefunction(
                func
            ) or inspect.isasyncgenfunction(func)
            if is_async:
                # Extra nvtx keywords (category/payload/domain) are not
                # applied on this path; ranges go to the "dynamo" domain.
                message = self._message if self._message else func.__name__
                color = self._color if self._color is not None else "white"
                return _ranged(func, message, color)
            return self._native()(func)

        def __enter__(self):
            self._active = self._native()
            self._active.__enter__()
            return self

        def __exit__(self, *exc_info):
            active, self._active = self._active, None
            return active.__exit__(*exc_info)

    def annotate(message=None, color=None, **kwargs):
        """Return an object usable as a decorator or a context manager."""
        return _Annotate(message, color, **kwargs)

    def range_decorator(message: str, color: str = "white"):
        """Decorator that wraps a callable in a single NVTX start/end range.

        For async generator functions the range spans the full iteration;
        for coroutine functions it spans the awaited execution; for plain
        functions it spans the call.
        """

        def decorator(func):
            return _ranged(func, message, color)

        return decorator

else:
    # Pure-Python no-ops: no C extension is imported or called.
    # The env var is read once at import time, so there is no per-call branch.

    def start_range(message: str, color: str = "white"):  # type: ignore[misc]
        """No-op: profiling is disabled, so there is no range handle."""
        return None

    def end_range(rng) -> None:  # type: ignore[misc]
        """No-op counterpart of the enabled end_range."""
        return None

    class _NoOpAnnotate:
        """No-op that works as both a decorator and a context manager."""

        __slots__ = ()

        def __call__(self, func):
            # Return the function itself so decorated callables keep their
            # identity (and coroutine-ness) and pay no call overhead.
            return func

        def __enter__(self):
            return self

        def __exit__(self, *args):
            # Returning None lets exceptions from the block propagate.
            return None

    _noop_annotate = _NoOpAnnotate()

    def annotate(message: str = "", color: str = "white", **_ignored):  # type: ignore[misc]
        """Return the shared no-op decorator / context manager.

        Extra keyword arguments (e.g. ``category``, ``payload``, ``domain``)
        are accepted and ignored so that call sites valid with profiling
        enabled do not raise TypeError when it is disabled.
        """
        return _noop_annotate

    def range_decorator(message: str = "", color: str = "white"):  # type: ignore[misc]
        """No-op decorator: returns the wrapped function unchanged."""

        def decorator(func):
            return func

        return decorator
...@@ -18,6 +18,7 @@ from dynamo.common.multimodal import ( ...@@ -18,6 +18,7 @@ from dynamo.common.multimodal import (
NixlReadEmbeddingSender, NixlReadEmbeddingSender,
NixlWriteEmbeddingSender, NixlWriteEmbeddingSender,
) )
from dynamo.common.utils import nvtx_utils as _nvtx
from dynamo.runtime import DistributedRuntime from dynamo.runtime import DistributedRuntime
from ..constants import EmbeddingTransferMode from ..constants import EmbeddingTransferMode
...@@ -119,6 +120,7 @@ class EncodeWorkerHandler: ...@@ -119,6 +120,7 @@ class EncodeWorkerHandler:
self._connector = connect.Connector() self._connector = connect.Connector()
logger.info("Encode worker startup completed.") logger.info("Encode worker startup completed.")
@_nvtx.range_decorator("mm:encode_worker_generate", color="blue")
async def generate( async def generate(
self, request: vLLMMultimodalRequest, context self, request: vLLMMultimodalRequest, context
) -> AsyncIterator[str]: ) -> AsyncIterator[str]:
...@@ -144,6 +146,8 @@ class EncodeWorkerHandler: ...@@ -144,6 +146,8 @@ class EncodeWorkerHandler:
try: try:
time_start = time.perf_counter() time_start = time.perf_counter()
with _nvtx.annotate("mm:enc:cache_check", color="cyan"):
# Before batch process images, check cache first # Before batch process images, check cache first
need_encode_indexes = [] need_encode_indexes = []
embedding_lists = [None] * len(request.multimodal_inputs) embedding_lists = [None] * len(request.multimodal_inputs)
...@@ -151,11 +155,14 @@ class EncodeWorkerHandler: ...@@ -151,11 +155,14 @@ class EncodeWorkerHandler:
if not request.multimodal_inputs[idx].multimodal_input.image_url: if not request.multimodal_inputs[idx].multimodal_input.image_url:
raise ValueError("image_url is required for the encode worker.") raise ValueError("image_url is required for the encode worker.")
image_url = request.multimodal_inputs[idx].multimodal_input.image_url image_url = request.multimodal_inputs[
idx
].multimodal_input.image_url
# see if we have local cache # see if we have local cache
embedding_key = EmbeddingCache.generate_hash_key(image_url) embedding_key = EmbeddingCache.generate_hash_key(image_url)
if self.embedding_cache is not None and self.embedding_cache.has_key( if (
embedding_key self.embedding_cache is not None
and self.embedding_cache.has_key(embedding_key)
): ):
(image_grid_thw, embeddings) = self.embedding_cache.get( (image_grid_thw, embeddings) = self.embedding_cache.get(
embedding_key embedding_key
...@@ -168,6 +175,7 @@ class EncodeWorkerHandler: ...@@ -168,6 +175,7 @@ class EncodeWorkerHandler:
# keep track of key to avoid recompute of it # keep track of key to avoid recompute of it
need_encode_indexes.append((idx, embedding_key)) need_encode_indexes.append((idx, embedding_key))
with _nvtx.annotate("mm:enc:image_load", color="green"):
# Load and generate image tensors # Load and generate image tensors
image_tasks = [] image_tasks = []
image_to_load = [] image_to_load = []
...@@ -183,7 +191,9 @@ class EncodeWorkerHandler: ...@@ -183,7 +191,9 @@ class EncodeWorkerHandler:
for i, result in enumerate(results): for i, result in enumerate(results):
if isinstance(result, Exception): if isinstance(result, Exception):
url = image_to_load[i] url = image_to_load[i]
logger.error(f"Failed to load image from {url[:80]}...: {result}") logger.error(
f"Failed to load image from {url[:80]}...: {result}"
)
collective_exceptions += ( collective_exceptions += (
f"Failed to load image from {url[:80]}...: {result}\n" f"Failed to load image from {url[:80]}...: {result}\n"
) )
...@@ -195,10 +205,12 @@ class EncodeWorkerHandler: ...@@ -195,10 +205,12 @@ class EncodeWorkerHandler:
) )
if loaded_images: if loaded_images:
with _nvtx.annotate("mm:enc:image_preprocess", color="yellow"):
image_embeds = await asyncio.to_thread( image_embeds = await asyncio.to_thread(
self.image_processor, images=loaded_images, return_tensors="pt" self.image_processor, images=loaded_images, return_tensors="pt"
) )
with _nvtx.annotate("mm:enc:vision_encode", color="red"):
# Encode the image embeddings using model-specific encoder # Encode the image embeddings using model-specific encoder
embeddings = await asyncio.to_thread( embeddings = await asyncio.to_thread(
encode_image_embeddings, encode_image_embeddings,
...@@ -208,6 +220,7 @@ class EncodeWorkerHandler: ...@@ -208,6 +220,7 @@ class EncodeWorkerHandler:
projector=self.projector, projector=self.projector,
) )
with _nvtx.annotate("mm:enc:split_embeddings", color="orange"):
# [gluo FIXME] This is specific to qwen vision processing.. # [gluo FIXME] This is specific to qwen vision processing..
# Split concatenated embeddings for each image item. # Split concatenated embeddings for each image item.
if is_qwen_vl_model(self.model): if is_qwen_vl_model(self.model):
...@@ -253,6 +266,7 @@ class EncodeWorkerHandler: ...@@ -253,6 +266,7 @@ class EncodeWorkerHandler:
before_transfer_time = time.perf_counter() before_transfer_time = time.perf_counter()
with _nvtx.annotate("mm:enc:embedding_transfer", color="purple"):
# Prepare transfer # Prepare transfer
send_tasks = [ send_tasks = [
asyncio.create_task( asyncio.create_task(
...@@ -279,7 +293,9 @@ class EncodeWorkerHandler: ...@@ -279,7 +293,9 @@ class EncodeWorkerHandler:
request.multimodal_inputs[idx].embeddings_shape = tuple( request.multimodal_inputs[idx].embeddings_shape = tuple(
embedding_item.embeddings.shape embedding_item.embeddings.shape
) )
request.multimodal_inputs[idx].serialized_request = transfer_request[0] request.multimodal_inputs[
idx
].serialized_request = transfer_request[0]
# Keep a reference of the embedding and only drop reference when the transfer is done # Keep a reference of the embedding and only drop reference when the transfer is done
self.send_complete_queue.put_nowait( self.send_complete_queue.put_nowait(
......
...@@ -20,6 +20,7 @@ from dynamo.common.multimodal.embedding_transfer import ( ...@@ -20,6 +20,7 @@ from dynamo.common.multimodal.embedding_transfer import (
NixlReadEmbeddingReceiver, NixlReadEmbeddingReceiver,
NixlWriteEmbeddingReceiver, NixlWriteEmbeddingReceiver,
) )
from dynamo.common.utils import nvtx_utils as _nvtx
from dynamo.runtime import Client, DistributedRuntime from dynamo.runtime import Client, DistributedRuntime
from ..args import Config from ..args import Config
...@@ -257,6 +258,7 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler): ...@@ -257,6 +258,7 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler):
self, self,
request: vLLMMultimodalRequest, request: vLLMMultimodalRequest,
multi_modal_data: dict[str, Any], multi_modal_data: dict[str, Any],
rng_ttft=None,
): ):
"""Run prefill and decode on this worker (aggregated mode).""" """Run prefill and decode on this worker (aggregated mode)."""
lora_request = self._resolve_lora_request(request.model) lora_request = self._resolve_lora_request(request.model)
...@@ -271,14 +273,26 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler): ...@@ -271,14 +273,26 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler):
) )
num_output_tokens_so_far = 0 num_output_tokens_so_far = 0
first_token = True
try:
async for response in gen: async for response in gen:
logger.debug(f"Response kv_transfer_params: {response.kv_transfer_params}") if first_token:
if rng_ttft is not None:
_nvtx.end_range(rng_ttft)
first_token = False
logger.debug(
f"Response kv_transfer_params: {response.kv_transfer_params}"
)
logger.debug( logger.debug(
f"length of expanded prompt ids: {len(response.prompt_token_ids)}" f"length of expanded prompt ids: {len(response.prompt_token_ids)}"
) )
yield self._format_engine_output(response, num_output_tokens_so_far) yield self._format_engine_output(response, num_output_tokens_so_far)
if response.outputs: if response.outputs:
num_output_tokens_so_far = len(response.outputs[0].token_ids) num_output_tokens_so_far = len(response.outputs[0].token_ids)
finally:
if first_token:
if rng_ttft is not None:
_nvtx.end_range(rng_ttft)
# ── Disaggregated generation (prefill here, decode remote) ─────── # ── Disaggregated generation (prefill here, decode remote) ───────
...@@ -286,6 +300,7 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler): ...@@ -286,6 +300,7 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler):
self, self,
request: vLLMMultimodalRequest, request: vLLMMultimodalRequest,
multi_modal_data: dict[str, Any], multi_modal_data: dict[str, Any],
rng_ttft=None,
): ):
"""Prefill locally, then forward to a remote decode worker.""" """Prefill locally, then forward to a remote decode worker."""
# Prepare prefill-only request # Prepare prefill-only request
...@@ -298,9 +313,12 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler): ...@@ -298,9 +313,12 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler):
logger.debug("Prefill request: %s", prefill_only_request) logger.debug("Prefill request: %s", prefill_only_request)
lora_request = self._resolve_lora_request(request.model) lora_request = self._resolve_lora_request(request.model)
with _nvtx.annotate("mm:pd:disagg_prefill", color="darkred"):
gen = self.engine_client.generate( gen = self.engine_client.generate(
prompt=TokensPrompt( prompt=TokensPrompt(
prompt_token_ids=prefill_only_request.engine_prompt["prompt_token_ids"], prompt_token_ids=prefill_only_request.engine_prompt[
"prompt_token_ids"
],
multi_modal_data=multi_modal_data, multi_modal_data=multi_modal_data,
), ),
sampling_params=prefill_only_request.sampling_params, sampling_params=prefill_only_request.sampling_params,
...@@ -311,6 +329,8 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler): ...@@ -311,6 +329,8 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler):
# Drain prefill generator (max_tokens=1, expect a single response) # Drain prefill generator (max_tokens=1, expect a single response)
async for prefill_response in gen: async for prefill_response in gen:
pass pass
if rng_ttft is not None:
_nvtx.end_range(rng_ttft)
# Qwen VL (mRoPE): keep the ORIGINAL unexpanded prompt. # Qwen VL (mRoPE): keep the ORIGINAL unexpanded prompt.
# The decode worker passes multi_modal_data which causes vLLM to # The decode worker passes multi_modal_data which causes vLLM to
...@@ -347,6 +367,7 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler): ...@@ -347,6 +367,7 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler):
f"— ensure the same adapter is loaded on the decode worker." f"— ensure the same adapter is loaded on the decode worker."
) )
with _nvtx.annotate("mm:pd:disagg_remote_decode", color="purple"):
num_output_tokens_so_far = 0 num_output_tokens_so_far = 0
async for ( async for (
decode_response decode_response
...@@ -362,17 +383,33 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler): ...@@ -362,17 +383,33 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler):
async def generate(self, raw_request: dict, context):
    """Parse the request, load multimodal data, and run inference.

    Yields the chunks produced by ``_generate_disagg`` when disaggregation
    is enabled and a decode worker client is available, otherwise the
    chunks produced by ``_generate_agg``.

    NVTX ranges opened here are closed in ``finally`` blocks so that a
    failure while parsing/loading, or the consumer closing this generator
    early, does not leave ranges open in the profile.

    Args:
        raw_request: The request as received from the frontend.
        context: Request context (not used directly in this method).
    """
    rng_pd = _nvtx.start_range("mm:pd_worker_generate", color="green")
    rng_ttft = _nvtx.start_range("mm:pd:ttft", color="orange")
    # rng_ttft is closed by _generate_agg / _generate_disagg once it has
    # been passed to them; until then this method is responsible for it.
    ttft_handed_off = False
    try:
        rng_parse = _nvtx.start_range("mm:pd:parse_request", color="cyan")
        try:
            request, image_urls = self._parse_frontend_request(raw_request)
            logger.debug(f"Received PD request: {{ id: {request.request_id} }}.")
        finally:
            _nvtx.end_range(rng_parse)

        rng_load = _nvtx.start_range("mm:pd:load_multimodal", color="yellow")
        try:
            multi_modal_data = await self._load_multimodal_data(
                image_urls, request.request_id
            )
        finally:
            _nvtx.end_range(rng_load)

        self._finalize_request_metadata(request, multi_modal_data)

        if self.enable_disagg and self.decode_worker_client:
            gen_label = "mm:pd:generate_disagg"
            stream = self._generate_disagg(request, multi_modal_data, rng_ttft)
        else:
            gen_label = "mm:pd:generate_agg"
            stream = self._generate_agg(request, multi_modal_data, rng_ttft)
        ttft_handed_off = True

        rng_gen = _nvtx.start_range(gen_label, color="red")
        try:
            async for chunk in stream:
                yield chunk
        finally:
            _nvtx.end_range(rng_gen)
    finally:
        # Avoid ending rng_ttft twice: only close it here if no
        # sub-generator ever took ownership of it.
        if not ttft_handed_off:
            _nvtx.end_range(rng_ttft)
        _nvtx.end_range(rng_pd)
...@@ -6,6 +6,7 @@ import logging ...@@ -6,6 +6,7 @@ import logging
from vllm.inputs.data import TokensPrompt from vllm.inputs.data import TokensPrompt
import dynamo.nixl_connect as connect import dynamo.nixl_connect as connect
from dynamo.common.utils import nvtx_utils as _nvtx
from dynamo.runtime import DistributedRuntime from dynamo.runtime import DistributedRuntime
from ..args import Config from ..args import Config
...@@ -53,6 +54,7 @@ class MultimodalDecodeWorkerHandler(BaseWorkerHandler): ...@@ -53,6 +54,7 @@ class MultimodalDecodeWorkerHandler(BaseWorkerHandler):
logger.info("Multimodal Decode Worker async initialization completed.") logger.info("Multimodal Decode Worker async initialization completed.")
async def generate(self, request: vLLMMultimodalRequest, context): async def generate(self, request: vLLMMultimodalRequest, context):
rng_decode = _nvtx.start_range("mm:decode_worker_generate", color="blue")
logger.debug(f"Got raw request: {request}") logger.debug(f"Got raw request: {request}")
if not isinstance(request, vLLMMultimodalRequest): if not isinstance(request, vLLMMultimodalRequest):
if isinstance(request, str): if isinstance(request, str):
...@@ -95,8 +97,16 @@ class MultimodalDecodeWorkerHandler(BaseWorkerHandler): ...@@ -95,8 +97,16 @@ class MultimodalDecodeWorkerHandler(BaseWorkerHandler):
lora_request=lora_request, lora_request=lora_request,
) )
rng_first = _nvtx.start_range("mm:decode:first_token", color="darkred")
first_token = True
try:
async for response in gen: async for response in gen:
logger.debug(f"Response kv_transfer_params: {response.kv_transfer_params}") if first_token:
_nvtx.end_range(rng_first)
first_token = False
logger.debug(
f"Response kv_transfer_params: {response.kv_transfer_params}"
)
yield MyRequestOutput( yield MyRequestOutput(
request_id=response.request_id, request_id=response.request_id,
prompt=response.prompt, prompt=response.prompt,
...@@ -107,3 +117,7 @@ class MultimodalDecodeWorkerHandler(BaseWorkerHandler): ...@@ -107,3 +117,7 @@ class MultimodalDecodeWorkerHandler(BaseWorkerHandler):
metrics=response.metrics, metrics=response.metrics,
kv_transfer_params=response.kv_transfer_params, kv_transfer_params=response.kv_transfer_params,
).model_dump_json() ).model_dump_json()
finally:
if first_token:
_nvtx.end_range(rng_first)
_nvtx.end_range(rng_decode)
...@@ -30,6 +30,7 @@ msgpack==1.1.2 ...@@ -30,6 +30,7 @@ msgpack==1.1.2
msgspec==0.19.0 msgspec==0.19.0
mypy==1.18.2 mypy==1.18.2
nvidia-ml-py<=13.580.65 # NVIDIA/CUDA related, may vary by driver version nvidia-ml-py<=13.580.65 # NVIDIA/CUDA related, may vary by driver version
nvtx==0.2.14
opentelemetry-api<=1.38.0 # May need to stay in sync with other components opentelemetry-api<=1.38.0 # May need to stay in sync with other components
opentelemetry-exporter-otlp<=1.38.0 # May need to stay in sync with other components opentelemetry-exporter-otlp<=1.38.0 # May need to stay in sync with other components
opentelemetry-sdk<=1.38.0 # May need to stay in sync with other components opentelemetry-sdk<=1.38.0 # May need to stay in sync with other components
......
...@@ -611,6 +611,38 @@ curl -X POST http://<decode-worker>/load_lora \ ...@@ -611,6 +611,38 @@ curl -X POST http://<decode-worker>/load_lora \
If a LoRA is loaded on the prefill worker but not on the decode worker, the decode worker will fall back to the base model for that request. If a LoRA is loaded on the prefill worker but not on the decode worker, the decode worker will fall back to the base model for that request.
## Profiling
Dynamo's multimodal workers include NVTX markers for `nsys` profiling. They are disabled by default — every marker is then a pure-Python no-op with negligible overhead — and are enabled by setting `DYN_NVTX=1`.
```bash
cd $DYNAMO_HOME/examples/backends/vllm
DYN_NVTX=1 nsys profile --trace=cuda,nvtx -o profile.nsys-rep \
bash launch/agg_multimodal.sh ...
```
| ENV Variable | Default | Description |
|---|---|---|
| `DYN_NVTX` | `0` | Set to `1` to enable NVTX range annotations in the encode, prefill, and decode workers for `nsys` profiling |
Key NVTX ranges emitted:
| Range | Worker | Description |
|-------|--------|-------------|
| `mm:encode_worker_generate` | Encode | Full encode request lifetime |
| `mm:enc:cache_check` | Encode | Embedding cache lookup |
| `mm:enc:image_load` | Encode | Image download/load |
| `mm:enc:image_preprocess` | Encode | Image processor (CPU) |
| `mm:enc:vision_encode` | Encode | ViT + projector GPU forward |
| `mm:enc:embedding_transfer` | Encode | RDMA embedding staging |
| `mm:pd_worker_generate` | PD | Full PD request lifetime |
| `mm:pd:ttft` | PD | Worker-side TTFT: from request arrival at the PD worker to the first output token (aggregated mode) or to the end of the local prefill (disaggregated mode); excludes client→frontend→worker network transit |
| `mm:pd:load_multimodal` | PD | Fetch embeddings from encode worker |
| `mm:pd:disagg_prefill` | PD (disagg) | Prefill-only engine call |
| `mm:pd:disagg_remote_decode` | PD (disagg) | Remote decode round-trip |
| `mm:decode_worker_generate` | Decode | Full decode request lifetime |
| `mm:decode:first_token` | Decode | Time to first output token |
## Known Limitations ## Known Limitations
- **Disaggregated flows require Python Processor** - All multimodal disaggregation requires the Python Processor component (`ModelInput.Text`). - **Disaggregated flows require Python Processor** - All multimodal disaggregation requires the Python Processor component (`ModelInput.Text`).
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment