Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
347eeebe
Unverified
Commit
347eeebe
authored
Jan 21, 2025
by
Adrian Cole
Committed by
GitHub
Jan 21, 2025
Browse files
[Misc] Remove experimental dep from tracing.py (#12007)
Signed-off-by:
Adrian Cole
<
adrian.cole@elastic.co
>
parent
18fd4a83
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
66 additions
and
60 deletions
+66
-60
tests/tracing/test_tracing.py
tests/tracing/test_tracing.py
+30
-30
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+15
-17
vllm/tracing.py
vllm/tracing.py
+21
-13
No files found.
tests/tracing/test_tracing.py
View file @
347eeebe
...
@@ -100,32 +100,32 @@ def test_traces(trace_service):
...
@@ -100,32 +100,32 @@ def test_traces(trace_service):
attributes
=
decode_attributes
(
attributes
=
decode_attributes
(
request
.
resource_spans
[
0
].
scope_spans
[
0
].
spans
[
0
].
attributes
)
request
.
resource_spans
[
0
].
scope_spans
[
0
].
spans
[
0
].
attributes
)
assert
attributes
.
get
(
SpanAttributes
.
LLM
_RESPONSE_MODEL
)
==
model
assert
attributes
.
get
(
SpanAttributes
.
GEN_AI
_RESPONSE_MODEL
)
==
model
assert
attributes
.
get
(
assert
attributes
.
get
(
SpanAttributes
.
LLM_REQUEST_ID
)
==
outputs
[
0
].
request_id
SpanAttributes
.
GEN_AI_REQUEST_ID
)
==
outputs
[
0
].
request_id
assert
attributes
.
get
(
SpanAttributes
.
GEN_AI_REQUEST_TEMPERATURE
)
==
sampling_params
.
temperature
assert
attributes
.
get
(
assert
attributes
.
get
(
SpanAttributes
.
LLM
_REQUEST_T
EMPERATURE
)
==
sampling_params
.
t
emperature
SpanAttributes
.
GEN_AI
_REQUEST_T
OP_P
)
==
sampling_params
.
t
op_p
assert
attributes
.
get
(
assert
attributes
.
get
(
SpanAttributes
.
LLM_REQUEST_TOP_P
)
==
sampling_params
.
top_p
SpanAttributes
.
GEN_AI_REQUEST_MAX_TOKENS
)
==
sampling_params
.
max_tokens
assert
attributes
.
get
(
assert
attributes
.
get
(
SpanAttributes
.
GEN_AI_REQUEST_N
)
==
sampling_params
.
n
SpanAttributes
.
LLM_REQUEST_MAX_TOKENS
)
==
sampling_params
.
max_tokens
assert
attributes
.
get
(
SpanAttributes
.
GEN_AI_USAGE_PROMPT_TOKENS
)
==
len
(
assert
attributes
.
get
(
SpanAttributes
.
LLM_REQUEST_N
)
==
sampling_params
.
n
assert
attributes
.
get
(
SpanAttributes
.
LLM_USAGE_PROMPT_TOKENS
)
==
len
(
outputs
[
0
].
prompt_token_ids
)
outputs
[
0
].
prompt_token_ids
)
completion_tokens
=
sum
(
len
(
o
.
token_ids
)
for
o
in
outputs
[
0
].
outputs
)
completion_tokens
=
sum
(
len
(
o
.
token_ids
)
for
o
in
outputs
[
0
].
outputs
)
assert
attributes
.
get
(
assert
attributes
.
get
(
SpanAttributes
.
LLM
_USAGE_COMPLETION_TOKENS
)
==
completion_tokens
SpanAttributes
.
GEN_AI
_USAGE_COMPLETION_TOKENS
)
==
completion_tokens
metrics
=
outputs
[
0
].
metrics
metrics
=
outputs
[
0
].
metrics
assert
attributes
.
get
(
assert
attributes
.
get
(
SpanAttributes
.
LLM
_LATENCY_TIME_IN_QUEUE
)
==
metrics
.
time_in_queue
SpanAttributes
.
GEN_AI
_LATENCY_TIME_IN_QUEUE
)
==
metrics
.
time_in_queue
ttft
=
metrics
.
first_token_time
-
metrics
.
arrival_time
ttft
=
metrics
.
first_token_time
-
metrics
.
arrival_time
assert
attributes
.
get
(
assert
attributes
.
get
(
SpanAttributes
.
LLM
_LATENCY_TIME_TO_FIRST_TOKEN
)
==
ttft
SpanAttributes
.
GEN_AI
_LATENCY_TIME_TO_FIRST_TOKEN
)
==
ttft
e2e_time
=
metrics
.
finished_time
-
metrics
.
arrival_time
e2e_time
=
metrics
.
finished_time
-
metrics
.
arrival_time
assert
attributes
.
get
(
SpanAttributes
.
LLM
_LATENCY_E2E
)
==
e2e_time
assert
attributes
.
get
(
SpanAttributes
.
GEN_AI
_LATENCY_E2E
)
==
e2e_time
assert
metrics
.
scheduler_time
>
0
assert
metrics
.
scheduler_time
>
0
assert
attributes
.
get
(
assert
attributes
.
get
(
SpanAttributes
.
GEN_AI_LATENCY_TIME_IN_SCHEDULER
SpanAttributes
.
LLM_LATENCY_TIME_IN_SCHEDULER
)
==
metrics
.
scheduler_time
)
==
metrics
.
scheduler_time
# Model forward and model execute should be none, since detailed traces is
# Model forward and model execute should be none, since detailed traces is
# not enabled.
# not enabled.
assert
metrics
.
model_forward_time
is
None
assert
metrics
.
model_forward_time
is
None
...
@@ -166,37 +166,37 @@ def test_traces_with_detailed_steps(trace_service):
...
@@ -166,37 +166,37 @@ def test_traces_with_detailed_steps(trace_service):
attributes
=
decode_attributes
(
attributes
=
decode_attributes
(
request
.
resource_spans
[
0
].
scope_spans
[
0
].
spans
[
0
].
attributes
)
request
.
resource_spans
[
0
].
scope_spans
[
0
].
spans
[
0
].
attributes
)
assert
attributes
.
get
(
SpanAttributes
.
LLM
_RESPONSE_MODEL
)
==
model
assert
attributes
.
get
(
SpanAttributes
.
GEN_AI
_RESPONSE_MODEL
)
==
model
assert
attributes
.
get
(
assert
attributes
.
get
(
SpanAttributes
.
LLM_REQUEST_ID
)
==
outputs
[
0
].
request_id
SpanAttributes
.
GEN_AI_REQUEST_ID
)
==
outputs
[
0
].
request_id
assert
attributes
.
get
(
SpanAttributes
.
GEN_AI_REQUEST_TEMPERATURE
)
==
sampling_params
.
temperature
assert
attributes
.
get
(
assert
attributes
.
get
(
SpanAttributes
.
LLM
_REQUEST_T
EMPERATURE
)
==
sampling_params
.
t
emperature
SpanAttributes
.
GEN_AI
_REQUEST_T
OP_P
)
==
sampling_params
.
t
op_p
assert
attributes
.
get
(
assert
attributes
.
get
(
SpanAttributes
.
LLM_REQUEST_TOP_P
)
==
sampling_params
.
top_p
SpanAttributes
.
GEN_AI_REQUEST_MAX_TOKENS
)
==
sampling_params
.
max_tokens
assert
attributes
.
get
(
assert
attributes
.
get
(
SpanAttributes
.
GEN_AI_REQUEST_N
)
==
sampling_params
.
n
SpanAttributes
.
LLM_REQUEST_MAX_TOKENS
)
==
sampling_params
.
max_tokens
assert
attributes
.
get
(
SpanAttributes
.
GEN_AI_USAGE_PROMPT_TOKENS
)
==
len
(
assert
attributes
.
get
(
SpanAttributes
.
LLM_REQUEST_N
)
==
sampling_params
.
n
assert
attributes
.
get
(
SpanAttributes
.
LLM_USAGE_PROMPT_TOKENS
)
==
len
(
outputs
[
0
].
prompt_token_ids
)
outputs
[
0
].
prompt_token_ids
)
completion_tokens
=
sum
(
len
(
o
.
token_ids
)
for
o
in
outputs
[
0
].
outputs
)
completion_tokens
=
sum
(
len
(
o
.
token_ids
)
for
o
in
outputs
[
0
].
outputs
)
assert
attributes
.
get
(
assert
attributes
.
get
(
SpanAttributes
.
LLM
_USAGE_COMPLETION_TOKENS
)
==
completion_tokens
SpanAttributes
.
GEN_AI
_USAGE_COMPLETION_TOKENS
)
==
completion_tokens
metrics
=
outputs
[
0
].
metrics
metrics
=
outputs
[
0
].
metrics
assert
attributes
.
get
(
assert
attributes
.
get
(
SpanAttributes
.
LLM
_LATENCY_TIME_IN_QUEUE
)
==
metrics
.
time_in_queue
SpanAttributes
.
GEN_AI
_LATENCY_TIME_IN_QUEUE
)
==
metrics
.
time_in_queue
ttft
=
metrics
.
first_token_time
-
metrics
.
arrival_time
ttft
=
metrics
.
first_token_time
-
metrics
.
arrival_time
assert
attributes
.
get
(
assert
attributes
.
get
(
SpanAttributes
.
LLM
_LATENCY_TIME_TO_FIRST_TOKEN
)
==
ttft
SpanAttributes
.
GEN_AI
_LATENCY_TIME_TO_FIRST_TOKEN
)
==
ttft
e2e_time
=
metrics
.
finished_time
-
metrics
.
arrival_time
e2e_time
=
metrics
.
finished_time
-
metrics
.
arrival_time
assert
attributes
.
get
(
SpanAttributes
.
LLM
_LATENCY_E2E
)
==
e2e_time
assert
attributes
.
get
(
SpanAttributes
.
GEN_AI
_LATENCY_E2E
)
==
e2e_time
assert
metrics
.
scheduler_time
>
0
assert
metrics
.
scheduler_time
>
0
assert
attributes
.
get
(
assert
attributes
.
get
(
SpanAttributes
.
GEN_AI_LATENCY_TIME_IN_SCHEDULER
SpanAttributes
.
LLM_LATENCY_TIME_IN_SCHEDULER
)
==
metrics
.
scheduler_time
)
==
metrics
.
scheduler_time
assert
metrics
.
model_forward_time
>
0
assert
metrics
.
model_forward_time
>
0
assert
attributes
.
get
(
assert
attributes
.
get
(
SpanAttributes
.
LLM
_LATENCY_TIME_IN_MODEL_FORWARD
)
==
pytest
.
approx
(
SpanAttributes
.
GEN_AI
_LATENCY_TIME_IN_MODEL_FORWARD
)
==
pytest
.
approx
(
metrics
.
model_forward_time
/
1000
)
metrics
.
model_forward_time
/
1000
)
assert
metrics
.
model_execute_time
>
0
assert
metrics
.
model_execute_time
>
0
assert
attributes
.
get
(
SpanAttributes
.
LLM
_LATENCY_TIME_IN_MODEL_EXECUTE
assert
attributes
.
get
(
SpanAttributes
.
GEN_AI
_LATENCY_TIME_IN_MODEL_EXECUTE
)
==
metrics
.
model_execute_time
)
==
metrics
.
model_execute_time
assert
metrics
.
model_forward_time
<
1000
*
metrics
.
model_execute_time
assert
metrics
.
model_forward_time
<
1000
*
metrics
.
model_execute_time
vllm/engine/llm_engine.py
View file @
347eeebe
...
@@ -1857,46 +1857,44 @@ class LLMEngine:
...
@@ -1857,46 +1857,44 @@ class LLMEngine:
metrics
=
seq_group
.
metrics
metrics
=
seq_group
.
metrics
ttft
=
metrics
.
first_token_time
-
metrics
.
arrival_time
ttft
=
metrics
.
first_token_time
-
metrics
.
arrival_time
e2e_time
=
metrics
.
finished_time
-
metrics
.
arrival_time
e2e_time
=
metrics
.
finished_time
-
metrics
.
arrival_time
# attribute names are based on
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI_RESPONSE_MODEL
,
# https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/llm-spans.md
seq_span
.
set_attribute
(
SpanAttributes
.
LLM_RESPONSE_MODEL
,
self
.
model_config
.
model
)
self
.
model_config
.
model
)
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_REQUEST_ID
,
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI
_REQUEST_ID
,
seq_group
.
request_id
)
seq_group
.
request_id
)
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_REQUEST_TEMPERATURE
,
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI
_REQUEST_TEMPERATURE
,
seq_group
.
sampling_params
.
temperature
)
seq_group
.
sampling_params
.
temperature
)
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_REQUEST_TOP_P
,
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI
_REQUEST_TOP_P
,
seq_group
.
sampling_params
.
top_p
)
seq_group
.
sampling_params
.
top_p
)
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_REQUEST_MAX_TOKENS
,
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI
_REQUEST_MAX_TOKENS
,
seq_group
.
sampling_params
.
max_tokens
)
seq_group
.
sampling_params
.
max_tokens
)
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_REQUEST_N
,
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI
_REQUEST_N
,
seq_group
.
sampling_params
.
n
)
seq_group
.
sampling_params
.
n
)
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_USAGE_NUM_SEQUENCES
,
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI
_USAGE_NUM_SEQUENCES
,
seq_group
.
num_seqs
())
seq_group
.
num_seqs
())
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_USAGE_PROMPT_TOKENS
,
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI
_USAGE_PROMPT_TOKENS
,
len
(
seq_group
.
prompt_token_ids
))
len
(
seq_group
.
prompt_token_ids
))
seq_span
.
set_attribute
(
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_USAGE_COMPLETION_TOKENS
,
SpanAttributes
.
GEN_AI
_USAGE_COMPLETION_TOKENS
,
sum
([
sum
([
seq
.
get_output_len
()
seq
.
get_output_len
()
for
seq
in
seq_group
.
get_finished_seqs
()
for
seq
in
seq_group
.
get_finished_seqs
()
]))
]))
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_LATENCY_TIME_IN_QUEUE
,
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI
_LATENCY_TIME_IN_QUEUE
,
metrics
.
time_in_queue
)
metrics
.
time_in_queue
)
seq_span
.
set_attribute
(
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_LATENCY_TIME_TO_FIRST_TOKEN
,
ttft
)
SpanAttributes
.
GEN_AI
_LATENCY_TIME_TO_FIRST_TOKEN
,
ttft
)
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_LATENCY_E2E
,
e2e_time
)
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI
_LATENCY_E2E
,
e2e_time
)
if
metrics
.
scheduler_time
is
not
None
:
if
metrics
.
scheduler_time
is
not
None
:
seq_span
.
set_attribute
(
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_LATENCY_TIME_IN_SCHEDULER
,
SpanAttributes
.
GEN_AI
_LATENCY_TIME_IN_SCHEDULER
,
metrics
.
scheduler_time
)
metrics
.
scheduler_time
)
if
metrics
.
model_forward_time
is
not
None
:
if
metrics
.
model_forward_time
is
not
None
:
seq_span
.
set_attribute
(
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_LATENCY_TIME_IN_MODEL_FORWARD
,
SpanAttributes
.
GEN_AI
_LATENCY_TIME_IN_MODEL_FORWARD
,
metrics
.
model_forward_time
/
1000.0
)
metrics
.
model_forward_time
/
1000.0
)
if
metrics
.
model_execute_time
is
not
None
:
if
metrics
.
model_execute_time
is
not
None
:
seq_span
.
set_attribute
(
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_LATENCY_TIME_IN_MODEL_EXECUTE
,
SpanAttributes
.
GEN_AI
_LATENCY_TIME_IN_MODEL_EXECUTE
,
metrics
.
model_execute_time
)
metrics
.
model_execute_time
)
def
_validate_model_inputs
(
self
,
inputs
:
ProcessorInputs
,
def
_validate_model_inputs
(
self
,
inputs
:
ProcessorInputs
,
...
...
vllm/tracing.py
View file @
347eeebe
...
@@ -16,7 +16,6 @@ try:
...
@@ -16,7 +16,6 @@ try:
OTEL_EXPORTER_OTLP_TRACES_PROTOCOL
)
OTEL_EXPORTER_OTLP_TRACES_PROTOCOL
)
from
opentelemetry.sdk.trace
import
TracerProvider
from
opentelemetry.sdk.trace
import
TracerProvider
from
opentelemetry.sdk.trace.export
import
BatchSpanProcessor
from
opentelemetry.sdk.trace.export
import
BatchSpanProcessor
from
opentelemetry.semconv_ai
import
SpanAttributes
as
BaseSpanAttributes
from
opentelemetry.trace
import
SpanKind
,
Tracer
,
set_tracer_provider
from
opentelemetry.trace
import
SpanKind
,
Tracer
,
set_tracer_provider
from
opentelemetry.trace.propagation.tracecontext
import
(
from
opentelemetry.trace.propagation.tracecontext
import
(
TraceContextTextMapPropagator
)
TraceContextTextMapPropagator
)
...
@@ -92,21 +91,30 @@ def extract_trace_headers(headers: Mapping[str, str]) -> Mapping[str, str]:
...
@@ -92,21 +91,30 @@ def extract_trace_headers(headers: Mapping[str, str]) -> Mapping[str, str]:
return
{
h
:
headers
[
h
]
for
h
in
TRACE_HEADERS
if
h
in
headers
}
return
{
h
:
headers
[
h
]
for
h
in
TRACE_HEADERS
if
h
in
headers
}
class
SpanAttributes
(
BaseSpanAttributes
):
class
SpanAttributes
:
# The following span attribute names are added here because they are missing
# Attribute names copied from here to avoid version conflicts:
# from the Semantic Conventions for LLM.
# https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/gen-ai-spans.md
LLM_REQUEST_ID
=
"gen_ai.request.id"
GEN_AI_USAGE_COMPLETION_TOKENS
=
"gen_ai.usage.completion_tokens"
LLM_REQUEST_N
=
"gen_ai.request.n"
GEN_AI_USAGE_PROMPT_TOKENS
=
"gen_ai.usage.prompt_tokens"
LLM_USAGE_NUM_SEQUENCES
=
"gen_ai.usage.num_sequences"
GEN_AI_REQUEST_MAX_TOKENS
=
"gen_ai.request.max_tokens"
LLM_LATENCY_TIME_IN_QUEUE
=
"gen_ai.latency.time_in_queue"
GEN_AI_REQUEST_TOP_P
=
"gen_ai.request.top_p"
LLM_LATENCY_TIME_TO_FIRST_TOKEN
=
"gen_ai.latency.time_to_first_token"
GEN_AI_REQUEST_TEMPERATURE
=
"gen_ai.request.temperature"
LLM_LATENCY_E2E
=
"gen_ai.latency.e2e"
GEN_AI_RESPONSE_MODEL
=
"gen_ai.response.model"
LLM_LATENCY_TIME_IN_SCHEDULER
=
"gen_ai.latency.time_in_scheduler"
# Attribute names added until they are added to the semantic conventions:
GEN_AI_REQUEST_ID
=
"gen_ai.request.id"
GEN_AI_REQUEST_N
=
"gen_ai.request.n"
GEN_AI_USAGE_NUM_SEQUENCES
=
"gen_ai.usage.num_sequences"
GEN_AI_LATENCY_TIME_IN_QUEUE
=
"gen_ai.latency.time_in_queue"
GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN
=
"gen_ai.latency.time_to_first_token"
GEN_AI_LATENCY_E2E
=
"gen_ai.latency.e2e"
GEN_AI_LATENCY_TIME_IN_SCHEDULER
=
"gen_ai.latency.time_in_scheduler"
# Time taken in the forward pass for this across all workers
# Time taken in the forward pass for this across all workers
LLM_LATENCY_TIME_IN_MODEL_FORWARD
=
"gen_ai.latency.time_in_model_forward"
GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD
=
(
"gen_ai.latency.time_in_model_forward"
)
# Time taken in the model execute function. This will include model
# Time taken in the model execute function. This will include model
# forward, block/sync across workers, cpu-gpu sync time and sampling time.
# forward, block/sync across workers, cpu-gpu sync time and sampling time.
LLM_LATENCY_TIME_IN_MODEL_EXECUTE
=
"gen_ai.latency.time_in_model_execute"
GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE
=
(
"gen_ai.latency.time_in_model_execute"
)
def
contains_trace_headers
(
headers
:
Mapping
[
str
,
str
])
->
bool
:
def
contains_trace_headers
(
headers
:
Mapping
[
str
,
str
])
->
bool
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment