Unverified Commit 325ab6b0 authored by emricksini-h's avatar emricksini-h Committed by GitHub
Browse files

[Feature] OTEL tracing during loading (#31162)

parent 91a07ff6
...@@ -100,8 +100,8 @@ class LLMEngine: ...@@ -100,8 +100,8 @@ class LLMEngine:
) )
endpoint = self.observability_config.otlp_traces_endpoint endpoint = self.observability_config.otlp_traces_endpoint
if endpoint is not None: if endpoint is not None:
tracer = init_tracer("vllm.llm_engine", endpoint) init_tracer("vllm.llm_engine", endpoint)
self.output_processor.tracer = tracer self.output_processor.tracing_enabled = True
# EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
self.engine_core = EngineCoreClient.make_client( self.engine_core = EngineCoreClient.make_client(
......
...@@ -20,7 +20,12 @@ from vllm.outputs import ( ...@@ -20,7 +20,12 @@ from vllm.outputs import (
) )
from vllm.sampling_params import RequestOutputKind from vllm.sampling_params import RequestOutputKind
from vllm.tokenizers import TokenizerLike from vllm.tokenizers import TokenizerLike
from vllm.tracing import SpanAttributes, SpanKind, Tracer, extract_trace_context from vllm.tracing import (
SpanAttributes,
SpanKind,
extract_trace_context,
instrument_manual,
)
from vllm.utils import length_from_prompt_token_ids_or_embeds from vllm.utils import length_from_prompt_token_ids_or_embeds
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
from vllm.v1.engine.detokenizer import IncrementalDetokenizer from vllm.v1.engine.detokenizer import IncrementalDetokenizer
...@@ -422,7 +427,7 @@ class OutputProcessor: ...@@ -422,7 +427,7 @@ class OutputProcessor:
self.parent_requests: dict[str, ParentRequest] = {} self.parent_requests: dict[str, ParentRequest] = {}
self.external_req_ids: defaultdict[str, list[str]] = defaultdict(list) self.external_req_ids: defaultdict[str, list[str]] = defaultdict(list)
self.lora_states = LoRARequestStates(log_stats) self.lora_states = LoRARequestStates(log_stats)
self.tracer: Tracer | None = None self.tracing_enabled: bool = False
self._requests_drained = asyncio.Event() self._requests_drained = asyncio.Event()
self._requests_drained.set() self._requests_drained.set()
...@@ -678,7 +683,7 @@ class OutputProcessor: ...@@ -678,7 +683,7 @@ class OutputProcessor:
self._update_stats_from_finished( self._update_stats_from_finished(
req_state, finish_reason, iteration_stats req_state, finish_reason, iteration_stats
) )
if self.tracer: if self.tracing_enabled:
self.do_tracing(engine_core_output, req_state, iteration_stats) self.do_tracing(engine_core_output, req_state, iteration_stats)
return OutputProcessorOutput( return OutputProcessorOutput(
...@@ -714,62 +719,59 @@ class OutputProcessor: ...@@ -714,62 +719,59 @@ class OutputProcessor:
) -> None: ) -> None:
assert req_state.stats is not None assert req_state.stats is not None
assert iteration_stats is not None assert iteration_stats is not None
assert self.tracer is not None
arrival_time_nano_seconds = int(req_state.stats.arrival_time * 1e9) metrics = req_state.stats
arrival_time_ns = int(metrics.arrival_time * 1e9)
trace_context = extract_trace_context(engine_core_output.trace_headers) trace_context = extract_trace_context(engine_core_output.trace_headers)
prompt_length = length_from_prompt_token_ids_or_embeds( prompt_length = length_from_prompt_token_ids_or_embeds(
req_state.prompt_token_ids, req_state.prompt_embeds req_state.prompt_token_ids, req_state.prompt_embeds
) )
with self.tracer.start_as_current_span(
"llm_request", # Calculate timing metrics
kind=SpanKind.SERVER,
context=trace_context,
start_time=arrival_time_nano_seconds,
) as span:
metrics = req_state.stats
e2e_time = iteration_stats.iteration_timestamp - metrics.arrival_time e2e_time = iteration_stats.iteration_timestamp - metrics.arrival_time
queued_time = metrics.scheduled_ts - metrics.queued_ts queued_time = metrics.scheduled_ts - metrics.queued_ts
prefill_time = metrics.first_token_ts - metrics.scheduled_ts prefill_time = metrics.first_token_ts - metrics.scheduled_ts
decode_time = metrics.last_token_ts - metrics.first_token_ts decode_time = metrics.last_token_ts - metrics.first_token_ts
inference_time = metrics.last_token_ts - metrics.scheduled_ts inference_time = metrics.last_token_ts - metrics.scheduled_ts
span.set_attribute(
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN,
metrics.first_token_latency,
)
span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time)
span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE, queued_time)
span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, prompt_length)
span.set_attribute(
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
metrics.num_generation_tokens,
)
span.set_attribute(
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL, prefill_time
)
span.set_attribute(
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE, decode_time
)
span.set_attribute(
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE, inference_time
)
# meta # Build attributes dict
span.set_attribute( attributes: dict[str, Any] = {
SpanAttributes.GEN_AI_REQUEST_ID, req_state.external_req_id SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN: (
) metrics.first_token_latency
),
SpanAttributes.GEN_AI_LATENCY_E2E: e2e_time,
SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE: queued_time,
SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS: prompt_length,
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS: (
metrics.num_generation_tokens
),
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL: prefill_time,
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE: decode_time,
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE: inference_time,
SpanAttributes.GEN_AI_REQUEST_ID: req_state.external_req_id,
}
# Add optional request parameters
if req_state.top_p: if req_state.top_p:
span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, req_state.top_p) attributes[SpanAttributes.GEN_AI_REQUEST_TOP_P] = req_state.top_p
if req_state.max_tokens_param: if req_state.max_tokens_param:
span.set_attribute( attributes[SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS] = (
SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS, req_state.max_tokens_param req_state.max_tokens_param
) )
if req_state.temperature: if req_state.temperature:
span.set_attribute( attributes[SpanAttributes.GEN_AI_REQUEST_TEMPERATURE] = (
SpanAttributes.GEN_AI_REQUEST_TEMPERATURE, req_state.temperature req_state.temperature
) )
if req_state.n: if req_state.n:
span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N, req_state.n) attributes[SpanAttributes.GEN_AI_REQUEST_N] = req_state.n
instrument_manual(
span_name="llm_request",
start_time=arrival_time_ns,
attributes=attributes,
context=trace_context,
kind=SpanKind.SERVER,
)
def _update_stats_from_output( def _update_stats_from_output(
self, self,
......
...@@ -15,6 +15,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( ...@@ -15,6 +15,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.tasks import SupportedTask from vllm.tasks import SupportedTask
from vllm.tracing import instrument
from vllm.utils.import_utils import resolve_obj_by_qualname from vllm.utils.import_utils import resolve_obj_by_qualname
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
from vllm.v1.engine import ReconfigureDistributedRequest from vllm.v1.engine import ReconfigureDistributedRequest
...@@ -84,6 +85,7 @@ class Executor(ABC): ...@@ -84,6 +85,7 @@ class Executor(ABC):
) )
return executor_class return executor_class
@instrument(span_name="Executor init")
def __init__( def __init__(
self, self,
vllm_config: VllmConfig, vllm_config: VllmConfig,
......
...@@ -41,6 +41,7 @@ from vllm.distributed.parallel_state import ( ...@@ -41,6 +41,7 @@ from vllm.distributed.parallel_state import (
) )
from vllm.envs import enable_envs_cache from vllm.envs import enable_envs_cache
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.tracing import instrument, maybe_init_worker_tracer
from vllm.utils.network_utils import ( from vllm.utils.network_utils import (
get_distributed_init_method, get_distributed_init_method,
get_loopback_ip, get_loopback_ip,
...@@ -527,6 +528,7 @@ class WorkerProc: ...@@ -527,6 +528,7 @@ class WorkerProc:
) )
) )
@instrument(span_name="Worker init")
def __init__( def __init__(
self, self,
vllm_config: VllmConfig, vllm_config: VllmConfig,
...@@ -740,6 +742,15 @@ class WorkerProc: ...@@ -740,6 +742,15 @@ class WorkerProc:
try: try:
reader.close() reader.close()
# Initialize tracer
rank = kwargs.get("rank", 0)
maybe_init_worker_tracer(
instrumenting_module_name="vllm.worker",
process_kind="worker",
process_name=f"Worker_{rank}",
)
worker = WorkerProc(*args, **kwargs) worker = WorkerProc(*args, **kwargs)
assert worker.worker_response_mq is not None assert worker.worker_response_mq is not None
......
...@@ -9,6 +9,7 @@ import torch.nn as nn ...@@ -9,6 +9,7 @@ import torch.nn as nn
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader import get_model
from vllm.tracing import instrument
from vllm.v1.utils import CpuGpuBuffer from vllm.v1.utils import CpuGpuBuffer
from vllm.v1.worker.gpu_model_runner import GPUModelRunner from vllm.v1.worker.gpu_model_runner import GPUModelRunner
...@@ -51,6 +52,7 @@ class CPUModelRunner(GPUModelRunner): ...@@ -51,6 +52,7 @@ class CPUModelRunner(GPUModelRunner):
if isinstance(v, CpuGpuBuffer): if isinstance(v, CpuGpuBuffer):
v.gpu = v.cpu v.gpu = v.cpu
@instrument(span_name="Loading (CPU)")
def load_model(self, eep_scale_up: bool = False) -> None: def load_model(self, eep_scale_up: bool = False) -> None:
logger.info("Starting to load model %s...", self.model_config.model) logger.info("Starting to load model %s...", self.model_config.model)
self.model = get_model(vllm_config=self.vllm_config) self.model = get_model(vllm_config=self.vllm_config)
...@@ -61,6 +63,7 @@ class CPUModelRunner(GPUModelRunner): ...@@ -61,6 +63,7 @@ class CPUModelRunner(GPUModelRunner):
def get_model(self) -> nn.Module: def get_model(self) -> nn.Module:
return self.model return self.model
@instrument(span_name="Warmup (CPU)")
def warming_up_model(self) -> None: def warming_up_model(self) -> None:
logger.info("Warming up model for the compilation...") logger.info("Warming up model for the compilation...")
# Only generate graph for the generic shape # Only generate graph for the generic shape
......
...@@ -93,6 +93,7 @@ from vllm.pooling_params import PoolingParams ...@@ -93,6 +93,7 @@ from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingType from vllm.sampling_params import SamplingType
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
from vllm.tracing import instrument
from vllm.utils import length_from_prompt_token_ids_or_embeds from vllm.utils import length_from_prompt_token_ids_or_embeds
from vllm.utils.jsontree import json_map_leaves from vllm.utils.jsontree import json_map_leaves
from vllm.utils.math_utils import cdiv, round_up from vllm.utils.math_utils import cdiv, round_up
...@@ -4111,6 +4112,7 @@ class GPUModelRunner( ...@@ -4111,6 +4112,7 @@ class GPUModelRunner(
new_config = update_config(config, config_overrides) new_config = update_config(config, config_overrides)
setattr(self, config_name, new_config) setattr(self, config_name, new_config)
@instrument(span_name="Loading (GPU)")
def load_model(self, eep_scale_up: bool = False) -> None: def load_model(self, eep_scale_up: bool = False) -> None:
""" """
Args: Args:
...@@ -5165,6 +5167,7 @@ class GPUModelRunner( ...@@ -5165,6 +5167,7 @@ class GPUModelRunner(
self.encoder_cache.clear() self.encoder_cache.clear()
gc.collect() gc.collect()
@instrument(span_name="Capture model")
def capture_model(self) -> int: def capture_model(self) -> int:
if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE: if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
logger.warning( logger.warning(
......
...@@ -42,6 +42,7 @@ from vllm.platforms import current_platform ...@@ -42,6 +42,7 @@ from vllm.platforms import current_platform
from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.tasks import SupportedTask from vllm.tasks import SupportedTask
from vllm.tracing import instrument
from vllm.utils.mem_utils import MemorySnapshot, format_gib, memory_profiling from vllm.utils.mem_utils import MemorySnapshot, format_gib, memory_profiling
from vllm.utils.torch_utils import set_random_seed from vllm.utils.torch_utils import set_random_seed
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
...@@ -186,6 +187,7 @@ class Worker(WorkerBase): ...@@ -186,6 +187,7 @@ class Worker(WorkerBase):
self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_gpu_blocks = num_gpu_blocks
self.cache_config.num_cpu_blocks = num_cpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks
@instrument(span_name="Init device")
def init_device(self): def init_device(self):
if self.device_config.device_type == "cuda": if self.device_config.device_type == "cuda":
# This env var set by Ray causes exceptions with graph building. # This env var set by Ray causes exceptions with graph building.
...@@ -407,6 +409,7 @@ class Worker(WorkerBase): ...@@ -407,6 +409,7 @@ class Worker(WorkerBase):
self.model_runner.update_max_model_len(max_model_len) self.model_runner.update_max_model_len(max_model_len)
logger.debug("Updated max_model_len to %d", max_model_len) logger.debug("Updated max_model_len to %d", max_model_len)
@instrument(span_name="Allocate KV cache")
def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None: def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
"""Allocate GPU KV cache with the specified kv_cache_config.""" """Allocate GPU KV cache with the specified kv_cache_config."""
...@@ -426,6 +429,7 @@ class Worker(WorkerBase): ...@@ -426,6 +429,7 @@ class Worker(WorkerBase):
else: else:
self.model_runner.initialize_kv_cache(kv_cache_config) self.model_runner.initialize_kv_cache(kv_cache_config)
@instrument(span_name="Warmup (GPU)")
def compile_or_warm_up_model(self) -> None: def compile_or_warm_up_model(self) -> None:
warmup_sizes = [] warmup_sizes = []
......
...@@ -11,6 +11,7 @@ from vllm.config import VllmConfig, set_current_vllm_config ...@@ -11,6 +11,7 @@ from vllm.config import VllmConfig, set_current_vllm_config
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.tracing import instrument
from vllm.utils.import_utils import resolve_obj_by_qualname from vllm.utils.import_utils import resolve_obj_by_qualname
from vllm.utils.system_utils import update_environment_variables from vllm.utils.system_utils import update_environment_variables
from vllm.v1.kv_cache_interface import KVCacheSpec from vllm.v1.kv_cache_interface import KVCacheSpec
...@@ -222,6 +223,7 @@ class WorkerWrapperBase: ...@@ -222,6 +223,7 @@ class WorkerWrapperBase:
envs = envs_list[self.rpc_rank] envs = envs_list[self.rpc_rank]
update_environment_variables(envs) update_environment_variables(envs)
@instrument(span_name="Worker init")
def init_worker(self, all_kwargs: list[dict[str, Any]]) -> None: def init_worker(self, all_kwargs: list[dict[str, Any]]) -> None:
""" """
Here we inject some common logic before initializing the worker. Here we inject some common logic before initializing the worker.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment