Unverified Commit 325ab6b0 authored by emricksini-h's avatar emricksini-h Committed by GitHub
Browse files

[Feature] OTEL tracing during loading (#31162)

parent 91a07ff6
......@@ -100,8 +100,8 @@ class LLMEngine:
)
endpoint = self.observability_config.otlp_traces_endpoint
if endpoint is not None:
tracer = init_tracer("vllm.llm_engine", endpoint)
self.output_processor.tracer = tracer
init_tracer("vllm.llm_engine", endpoint)
self.output_processor.tracing_enabled = True
# EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
self.engine_core = EngineCoreClient.make_client(
......
......@@ -20,7 +20,12 @@ from vllm.outputs import (
)
from vllm.sampling_params import RequestOutputKind
from vllm.tokenizers import TokenizerLike
from vllm.tracing import SpanAttributes, SpanKind, Tracer, extract_trace_context
from vllm.tracing import (
SpanAttributes,
SpanKind,
extract_trace_context,
instrument_manual,
)
from vllm.utils import length_from_prompt_token_ids_or_embeds
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
from vllm.v1.engine.detokenizer import IncrementalDetokenizer
......@@ -422,7 +427,7 @@ class OutputProcessor:
self.parent_requests: dict[str, ParentRequest] = {}
self.external_req_ids: defaultdict[str, list[str]] = defaultdict(list)
self.lora_states = LoRARequestStates(log_stats)
self.tracer: Tracer | None = None
self.tracing_enabled: bool = False
self._requests_drained = asyncio.Event()
self._requests_drained.set()
......@@ -678,7 +683,7 @@ class OutputProcessor:
self._update_stats_from_finished(
req_state, finish_reason, iteration_stats
)
if self.tracer:
if self.tracing_enabled:
self.do_tracing(engine_core_output, req_state, iteration_stats)
return OutputProcessorOutput(
......@@ -714,62 +719,59 @@ class OutputProcessor:
) -> None:
assert req_state.stats is not None
assert iteration_stats is not None
assert self.tracer is not None
arrival_time_nano_seconds = int(req_state.stats.arrival_time * 1e9)
metrics = req_state.stats
arrival_time_ns = int(metrics.arrival_time * 1e9)
trace_context = extract_trace_context(engine_core_output.trace_headers)
prompt_length = length_from_prompt_token_ids_or_embeds(
req_state.prompt_token_ids, req_state.prompt_embeds
)
with self.tracer.start_as_current_span(
"llm_request",
kind=SpanKind.SERVER,
context=trace_context,
start_time=arrival_time_nano_seconds,
) as span:
metrics = req_state.stats
e2e_time = iteration_stats.iteration_timestamp - metrics.arrival_time
queued_time = metrics.scheduled_ts - metrics.queued_ts
prefill_time = metrics.first_token_ts - metrics.scheduled_ts
decode_time = metrics.last_token_ts - metrics.first_token_ts
inference_time = metrics.last_token_ts - metrics.scheduled_ts
span.set_attribute(
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN,
metrics.first_token_latency,
)
span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time)
span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE, queued_time)
span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, prompt_length)
span.set_attribute(
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
metrics.num_generation_tokens,
)
span.set_attribute(
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL, prefill_time
)
span.set_attribute(
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE, decode_time
# Calculate timing metrics
e2e_time = iteration_stats.iteration_timestamp - metrics.arrival_time
queued_time = metrics.scheduled_ts - metrics.queued_ts
prefill_time = metrics.first_token_ts - metrics.scheduled_ts
decode_time = metrics.last_token_ts - metrics.first_token_ts
inference_time = metrics.last_token_ts - metrics.scheduled_ts
# Build attributes dict
attributes: dict[str, Any] = {
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN: (
metrics.first_token_latency
),
SpanAttributes.GEN_AI_LATENCY_E2E: e2e_time,
SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE: queued_time,
SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS: prompt_length,
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS: (
metrics.num_generation_tokens
),
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL: prefill_time,
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE: decode_time,
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE: inference_time,
SpanAttributes.GEN_AI_REQUEST_ID: req_state.external_req_id,
}
# Add optional request parameters
if req_state.top_p:
attributes[SpanAttributes.GEN_AI_REQUEST_TOP_P] = req_state.top_p
if req_state.max_tokens_param:
attributes[SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS] = (
req_state.max_tokens_param
)
span.set_attribute(
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE, inference_time
if req_state.temperature:
attributes[SpanAttributes.GEN_AI_REQUEST_TEMPERATURE] = (
req_state.temperature
)
if req_state.n:
attributes[SpanAttributes.GEN_AI_REQUEST_N] = req_state.n
# meta
span.set_attribute(
SpanAttributes.GEN_AI_REQUEST_ID, req_state.external_req_id
)
if req_state.top_p:
span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, req_state.top_p)
if req_state.max_tokens_param:
span.set_attribute(
SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS, req_state.max_tokens_param
)
if req_state.temperature:
span.set_attribute(
SpanAttributes.GEN_AI_REQUEST_TEMPERATURE, req_state.temperature
)
if req_state.n:
span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N, req_state.n)
instrument_manual(
span_name="llm_request",
start_time=arrival_time_ns,
attributes=attributes,
context=trace_context,
kind=SpanKind.SERVER,
)
def _update_stats_from_output(
self,
......
......@@ -15,6 +15,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.tasks import SupportedTask
from vllm.tracing import instrument
from vllm.utils.import_utils import resolve_obj_by_qualname
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
from vllm.v1.engine import ReconfigureDistributedRequest
......@@ -84,6 +85,7 @@ class Executor(ABC):
)
return executor_class
@instrument(span_name="Executor init")
def __init__(
self,
vllm_config: VllmConfig,
......
......@@ -41,6 +41,7 @@ from vllm.distributed.parallel_state import (
)
from vllm.envs import enable_envs_cache
from vllm.logger import init_logger
from vllm.tracing import instrument, maybe_init_worker_tracer
from vllm.utils.network_utils import (
get_distributed_init_method,
get_loopback_ip,
......@@ -527,6 +528,7 @@ class WorkerProc:
)
)
@instrument(span_name="Worker init")
def __init__(
self,
vllm_config: VllmConfig,
......@@ -740,6 +742,15 @@ class WorkerProc:
try:
reader.close()
# Initialize tracer
rank = kwargs.get("rank", 0)
maybe_init_worker_tracer(
instrumenting_module_name="vllm.worker",
process_kind="worker",
process_name=f"Worker_{rank}",
)
worker = WorkerProc(*args, **kwargs)
assert worker.worker_response_mq is not None
......
......@@ -9,6 +9,7 @@ import torch.nn as nn
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.model_executor.model_loader import get_model
from vllm.tracing import instrument
from vllm.v1.utils import CpuGpuBuffer
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
......@@ -51,6 +52,7 @@ class CPUModelRunner(GPUModelRunner):
if isinstance(v, CpuGpuBuffer):
v.gpu = v.cpu
@instrument(span_name="Loading (CPU)")
def load_model(self, eep_scale_up: bool = False) -> None:
logger.info("Starting to load model %s...", self.model_config.model)
self.model = get_model(vllm_config=self.vllm_config)
......@@ -61,6 +63,7 @@ class CPUModelRunner(GPUModelRunner):
def get_model(self) -> nn.Module:
return self.model
@instrument(span_name="Warmup (CPU)")
def warming_up_model(self) -> None:
logger.info("Warming up model for the compilation...")
# Only generate graph for the generic shape
......
......@@ -93,6 +93,7 @@ from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingType
from vllm.sequence import IntermediateTensors
from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
from vllm.tracing import instrument
from vllm.utils import length_from_prompt_token_ids_or_embeds
from vllm.utils.jsontree import json_map_leaves
from vllm.utils.math_utils import cdiv, round_up
......@@ -4111,6 +4112,7 @@ class GPUModelRunner(
new_config = update_config(config, config_overrides)
setattr(self, config_name, new_config)
@instrument(span_name="Loading (GPU)")
def load_model(self, eep_scale_up: bool = False) -> None:
"""
Args:
......@@ -5165,6 +5167,7 @@ class GPUModelRunner(
self.encoder_cache.clear()
gc.collect()
@instrument(span_name="Capture model")
def capture_model(self) -> int:
if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
logger.warning(
......
......@@ -42,6 +42,7 @@ from vllm.platforms import current_platform
from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper
from vllm.sequence import IntermediateTensors
from vllm.tasks import SupportedTask
from vllm.tracing import instrument
from vllm.utils.mem_utils import MemorySnapshot, format_gib, memory_profiling
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
......@@ -186,6 +187,7 @@ class Worker(WorkerBase):
self.cache_config.num_gpu_blocks = num_gpu_blocks
self.cache_config.num_cpu_blocks = num_cpu_blocks
@instrument(span_name="Init device")
def init_device(self):
if self.device_config.device_type == "cuda":
# This env var set by Ray causes exceptions with graph building.
......@@ -407,6 +409,7 @@ class Worker(WorkerBase):
self.model_runner.update_max_model_len(max_model_len)
logger.debug("Updated max_model_len to %d", max_model_len)
@instrument(span_name="Allocate KV cache")
def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
"""Allocate GPU KV cache with the specified kv_cache_config."""
......@@ -426,6 +429,7 @@ class Worker(WorkerBase):
else:
self.model_runner.initialize_kv_cache(kv_cache_config)
@instrument(span_name="Warmup (GPU)")
def compile_or_warm_up_model(self) -> None:
warmup_sizes = []
......
......@@ -11,6 +11,7 @@ from vllm.config import VllmConfig, set_current_vllm_config
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.tracing import instrument
from vllm.utils.import_utils import resolve_obj_by_qualname
from vllm.utils.system_utils import update_environment_variables
from vllm.v1.kv_cache_interface import KVCacheSpec
......@@ -222,6 +223,7 @@ class WorkerWrapperBase:
envs = envs_list[self.rpc_rank]
update_environment_variables(envs)
@instrument(span_name="Worker init")
def init_worker(self, all_kwargs: list[dict[str, Any]]) -> None:
"""
Here we inject some common logic before initializing the worker.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment