Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
a485ab78
Unverified
Commit
a485ab78
authored
Aug 26, 2025
by
Chi McIsaac
Committed by
GitHub
Aug 26, 2025
Browse files
feat: align OpenAI response IDs with distributed trace IDs (#2496)
parent
5d177b61
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
60 additions
and
93 deletions
+60
-93
lib/engines/mistralrs/src/lib.rs
lib/engines/mistralrs/src/lib.rs
+19
-18
lib/llm/src/engines.rs
lib/llm/src/engines.rs
+2
-3
lib/llm/src/http/service/openai.rs
lib/llm/src/http/service/openai.rs
+5
-8
lib/llm/src/preprocessor.rs
lib/llm/src/preprocessor.rs
+2
-7
lib/llm/src/protocols/openai/chat_completions/delta.rs
lib/llm/src/protocols/openai/chat_completions/delta.rs
+14
-12
lib/llm/src/protocols/openai/completions/delta.rs
lib/llm/src/protocols/openai/completions/delta.rs
+6
-16
lib/llm/tests/http-service.rs
lib/llm/tests/http-service.rs
+12
-29
No files found.
lib/engines/mistralrs/src/lib.rs
View file @
a485ab78
...
@@ -212,9 +212,9 @@ impl MistralRsEngine {
...
@@ -212,9 +212,9 @@ impl MistralRsEngine {
// Perform warmup request
// Perform warmup request
let
(
tx
,
mut
rx
)
=
channel
(
1
);
let
(
tx
,
mut
rx
)
=
channel
(
1
);
let
request_id
=
engine
.mistralrs
.next_request_id
();
let
mistralrs_
request_id
=
engine
.mistralrs
.next_request_id
();
let
warmup_request
=
Request
::
Normal
(
Box
::
new
(
NormalRequest
{
let
warmup_request
=
Request
::
Normal
(
Box
::
new
(
NormalRequest
{
id
:
request_id
,
id
:
mistralrs_
request_id
,
model_id
:
Some
(
display_name
.to_string
()),
model_id
:
Some
(
display_name
.to_string
()),
messages
:
RequestMessage
::
Chat
{
messages
:
RequestMessage
::
Chat
{
messages
:
vec!
[
IndexMap
::
from
([
messages
:
vec!
[
IndexMap
::
from
([
...
@@ -246,10 +246,10 @@ impl MistralRsEngine {
...
@@ -246,10 +246,10 @@ impl MistralRsEngine {
{
{
match
response
.as_result
()
{
match
response
.as_result
()
{
Ok
(
r
)
=>
{
Ok
(
r
)
=>
{
tracing
::
debug!
(
request_id
,
"Warmup response: {r:?}"
);
tracing
::
debug!
(
mistralrs_
request_id
,
"Warmup response: {r:?}"
);
}
}
Err
(
err
)
=>
{
Err
(
err
)
=>
{
tracing
::
error!
(
request_id
,
%
err
,
"Failed converting response to result."
);
tracing
::
error!
(
mistralrs_
request_id
,
%
err
,
"Failed converting response to result."
);
}
}
}
}
}
}
...
@@ -272,6 +272,7 @@ impl
...
@@ -272,6 +272,7 @@ impl
)
->
Result
<
ManyOut
<
Annotated
<
NvCreateChatCompletionStreamResponse
>>
,
Error
>
{
)
->
Result
<
ManyOut
<
Annotated
<
NvCreateChatCompletionStreamResponse
>>
,
Error
>
{
let
(
request
,
context
)
=
request
.transfer
(());
let
(
request
,
context
)
=
request
.transfer
(());
let
ctx
=
context
.context
();
let
ctx
=
context
.context
();
let
request_id
=
ctx
.id
()
.to_string
();
let
(
tx
,
mut
rx
)
=
channel
(
10_000
);
let
(
tx
,
mut
rx
)
=
channel
(
10_000
);
let
mut
messages
=
vec!
[];
let
mut
messages
=
vec!
[];
...
@@ -338,9 +339,9 @@ impl
...
@@ -338,9 +339,9 @@ impl
n_choices
:
1
,
n_choices
:
1
,
dry_params
:
det
.dry_params
,
dry_params
:
det
.dry_params
,
};
};
let
request_id
=
self
.mistralrs
.next_request_id
();
let
mistralrs_
request_id
=
self
.mistralrs
.next_request_id
();
let
mistralrs_request
=
Request
::
Normal
(
Box
::
new
(
NormalRequest
{
let
mistralrs_request
=
Request
::
Normal
(
Box
::
new
(
NormalRequest
{
id
:
request_id
,
id
:
mistralrs_
request_id
,
model_id
:
Some
(
self
.display_name
.clone
()),
model_id
:
Some
(
self
.display_name
.clone
()),
messages
:
RequestMessage
::
Chat
{
messages
:
RequestMessage
::
Chat
{
messages
,
messages
,
...
@@ -369,14 +370,14 @@ impl
...
@@ -369,14 +370,14 @@ impl
let
response
=
match
response
.as_result
()
{
let
response
=
match
response
.as_result
()
{
Ok
(
r
)
=>
r
,
Ok
(
r
)
=>
r
,
Err
(
err
)
=>
{
Err
(
err
)
=>
{
tracing
::
error!
(
request_id
,
%
err
,
"Failed converting mistralrs channel response to result."
);
tracing
::
error!
(
mistralrs_
request_id
,
%
err
,
"Failed converting mistralrs channel response to result."
);
break
;
break
;
}
}
};
};
match
response
{
match
response
{
ResponseOk
::
Chunk
(
c
)
=>
{
ResponseOk
::
Chunk
(
c
)
=>
{
let
Some
(
from_assistant
)
=
c
.choices
[
0
]
.delta.content
.clone
()
else
{
let
Some
(
from_assistant
)
=
c
.choices
[
0
]
.delta.content
.clone
()
else
{
tracing
::
warn!
(
request_id
,
"No content from mistralrs. Abandoning request."
);
tracing
::
warn!
(
mistralrs_
request_id
,
"No content from mistralrs. Abandoning request."
);
break
;
break
;
};
};
let
finish_reason
=
match
&
c
.choices
[
0
]
.finish_reason
.as_deref
()
{
let
finish_reason
=
match
&
c
.choices
[
0
]
.finish_reason
.as_deref
()
{
...
@@ -387,7 +388,7 @@ impl
...
@@ -387,7 +388,7 @@ impl
Some
(
FinishReason
::
Length
)
Some
(
FinishReason
::
Length
)
}
}
Some
(
s
)
=>
{
Some
(
s
)
=>
{
tracing
::
warn!
(
request_id
,
stop_reason
=
s
,
"Unknow stop reason"
);
tracing
::
warn!
(
mistralrs_
request_id
,
stop_reason
=
s
,
"Unknow stop reason"
);
Some
(
FinishReason
::
Stop
)
Some
(
FinishReason
::
Stop
)
}
}
None
=>
None
,
None
=>
None
,
...
@@ -396,7 +397,7 @@ impl
...
@@ -396,7 +397,7 @@ impl
#[allow(deprecated)]
#[allow(deprecated)]
let
delta
=
NvCreateChatCompletionStreamResponse
{
let
delta
=
NvCreateChatCompletionStreamResponse
{
id
:
c
.id
,
id
:
format!
(
"chatcmpl-{request_id}"
)
,
choices
:
vec!
[
dynamo_async_openai
::
types
::
ChatChoiceStream
{
choices
:
vec!
[
dynamo_async_openai
::
types
::
ChatChoiceStream
{
index
:
0
,
index
:
0
,
delta
:
dynamo_async_openai
::
types
::
ChatCompletionStreamResponseDelta
{
delta
:
dynamo_async_openai
::
types
::
ChatCompletionStreamResponseDelta
{
...
@@ -427,11 +428,11 @@ impl
...
@@ -427,11 +428,11 @@ impl
yield
ann
;
yield
ann
;
if
finish_reason
.is_some
()
{
if
finish_reason
.is_some
()
{
//tracing::trace!(request_id, "Finish reason: {finish_reason:?}");
//tracing::trace!(
mistralrs_
request_id, "Finish reason: {finish_reason:?}");
break
;
break
;
}
}
},
},
x
=>
tracing
::
error!
(
request_id
,
"Unhandled. {x:?}"
),
x
=>
tracing
::
error!
(
mistralrs_
request_id
,
"Unhandled. {x:?}"
),
}
}
}
}
};
};
...
@@ -485,7 +486,7 @@ impl
...
@@ -485,7 +486,7 @@ impl
let
(
request
,
context
)
=
request
.transfer
(());
let
(
request
,
context
)
=
request
.transfer
(());
let
ctx
=
context
.context
();
let
ctx
=
context
.context
();
let
(
tx
,
mut
rx
)
=
channel
(
10_000
);
let
(
tx
,
mut
rx
)
=
channel
(
10_000
);
let
response_generator
=
request
.response_generator
();
let
response_generator
=
request
.response_generator
(
ctx
.id
()
.to_string
()
);
let
messages
=
RequestMessage
::
Completion
{
let
messages
=
RequestMessage
::
Completion
{
text
:
prompt_to_string
(
&
request
.inner.prompt
),
text
:
prompt_to_string
(
&
request
.inner.prompt
),
...
@@ -539,9 +540,9 @@ impl
...
@@ -539,9 +540,9 @@ impl
dry_params
:
det
.dry_params
,
dry_params
:
det
.dry_params
,
};
};
let
request_id
=
self
.mistralrs
.next_request_id
();
let
mistralrs_
request_id
=
self
.mistralrs
.next_request_id
();
let
mistralrs_request
=
Request
::
Normal
(
Box
::
new
(
NormalRequest
{
let
mistralrs_request
=
Request
::
Normal
(
Box
::
new
(
NormalRequest
{
id
:
request_id
,
id
:
mistralrs_
request_id
,
model_id
:
Some
(
self
.display_name
.clone
()),
model_id
:
Some
(
self
.display_name
.clone
()),
messages
,
messages
,
sampling_params
,
sampling_params
,
...
@@ -567,7 +568,7 @@ impl
...
@@ -567,7 +568,7 @@ impl
let
response
=
match
response
.as_result
()
{
let
response
=
match
response
.as_result
()
{
Ok
(
r
)
=>
r
,
Ok
(
r
)
=>
r
,
Err
(
err
)
=>
{
Err
(
err
)
=>
{
tracing
::
error!
(
request_id
,
%
err
,
"Failed converting mistralrs channel response to result."
);
tracing
::
error!
(
mistralrs_
request_id
,
%
err
,
"Failed converting mistralrs channel response to result."
);
break
;
break
;
}
}
};
};
...
@@ -583,7 +584,7 @@ impl
...
@@ -583,7 +584,7 @@ impl
Some
(
FinishReason
::
Length
)
Some
(
FinishReason
::
Length
)
}
}
Some
(
s
)
=>
{
Some
(
s
)
=>
{
tracing
::
warn!
(
request_id
,
stop_reason
=
s
,
"Unknow stop reason"
);
tracing
::
warn!
(
mistralrs_
request_id
,
stop_reason
=
s
,
"Unknow stop reason"
);
Some
(
FinishReason
::
Stop
)
Some
(
FinishReason
::
Stop
)
}
}
None
=>
None
,
None
=>
None
,
...
@@ -602,7 +603,7 @@ impl
...
@@ -602,7 +603,7 @@ impl
break
;
break
;
}
}
},
},
x
=>
tracing
::
error!
(
request_id
,
"Unhandled. {x:?}"
),
x
=>
tracing
::
error!
(
mistralrs_
request_id
,
"Unhandled. {x:?}"
),
}
}
}
}
};
};
...
...
lib/llm/src/engines.rs
View file @
a485ab78
...
@@ -14,7 +14,6 @@ use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
...
@@ -14,7 +14,6 @@ use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use
dynamo_runtime
::
protocols
::
annotated
::
Annotated
;
use
dynamo_runtime
::
protocols
::
annotated
::
Annotated
;
use
crate
::
backend
::
ExecutionContext
;
use
crate
::
backend
::
ExecutionContext
;
use
crate
::
local_model
::
runtime_config
;
use
crate
::
preprocessor
::
PreprocessedRequest
;
use
crate
::
preprocessor
::
PreprocessedRequest
;
use
crate
::
protocols
::
common
::
llm_backend
::
LLMEngineOutput
;
use
crate
::
protocols
::
common
::
llm_backend
::
LLMEngineOutput
;
use
crate
::
protocols
::
openai
::{
use
crate
::
protocols
::
openai
::{
...
@@ -184,8 +183,8 @@ impl
...
@@ -184,8 +183,8 @@ impl
incoming_request
:
SingleIn
<
NvCreateChatCompletionRequest
>
,
incoming_request
:
SingleIn
<
NvCreateChatCompletionRequest
>
,
)
->
Result
<
ManyOut
<
Annotated
<
NvCreateChatCompletionStreamResponse
>>
,
Error
>
{
)
->
Result
<
ManyOut
<
Annotated
<
NvCreateChatCompletionStreamResponse
>>
,
Error
>
{
let
(
request
,
context
)
=
incoming_request
.transfer
(());
let
(
request
,
context
)
=
incoming_request
.transfer
(());
let
mut
deltas
=
request
.response_generator
(
runtime_config
::
ModelRuntimeConfig
::
default
());
let
ctx
=
context
.context
();
let
ctx
=
context
.context
();
let
mut
deltas
=
request
.response_generator
(
ctx
.id
()
.to_string
());
let
req
=
request
.inner.messages
.into_iter
()
.next_back
()
.unwrap
();
let
req
=
request
.inner.messages
.into_iter
()
.next_back
()
.unwrap
();
let
prompt
=
match
req
{
let
prompt
=
match
req
{
...
@@ -231,8 +230,8 @@ impl
...
@@ -231,8 +230,8 @@ impl
incoming_request
:
SingleIn
<
NvCreateCompletionRequest
>
,
incoming_request
:
SingleIn
<
NvCreateCompletionRequest
>
,
)
->
Result
<
ManyOut
<
Annotated
<
NvCreateCompletionResponse
>>
,
Error
>
{
)
->
Result
<
ManyOut
<
Annotated
<
NvCreateCompletionResponse
>>
,
Error
>
{
let
(
request
,
context
)
=
incoming_request
.transfer
(());
let
(
request
,
context
)
=
incoming_request
.transfer
(());
let
deltas
=
request
.response_generator
();
let
ctx
=
context
.context
();
let
ctx
=
context
.context
();
let
deltas
=
request
.response_generator
(
ctx
.id
()
.to_string
());
let
chars_string
=
prompt_to_string
(
&
request
.inner.prompt
);
let
chars_string
=
prompt_to_string
(
&
request
.inner.prompt
);
let
output
=
stream!
{
let
output
=
stream!
{
let
mut
id
=
1
;
let
mut
id
=
1
;
...
...
lib/llm/src/http/service/openai.rs
View file @
a485ab78
...
@@ -253,8 +253,7 @@ async fn completions(
...
@@ -253,8 +253,7 @@ async fn completions(
// return a 503 if the service is not ready
// return a 503 if the service is not ready
check_ready
(
&
state
)
?
;
check_ready
(
&
state
)
?
;
// todo - extract distributed tracing id and context id from headers
let
request_id
=
request
.id
()
.to_string
();
let
request_id
=
uuid
::
Uuid
::
new_v4
()
.to_string
();
// todo - decide on default
// todo - decide on default
let
streaming
=
request
.inner.stream
.unwrap_or
(
false
);
let
streaming
=
request
.inner.stream
.unwrap_or
(
false
);
...
@@ -354,13 +353,15 @@ async fn completions(
...
@@ -354,13 +353,15 @@ async fn completions(
#[tracing::instrument(skip_all)]
#[tracing::instrument(skip_all)]
async
fn
embeddings
(
async
fn
embeddings
(
State
(
state
):
State
<
Arc
<
service_v2
::
State
>>
,
State
(
state
):
State
<
Arc
<
service_v2
::
State
>>
,
headers
:
HeaderMap
,
Json
(
request
):
Json
<
NvCreateEmbeddingRequest
>
,
Json
(
request
):
Json
<
NvCreateEmbeddingRequest
>
,
)
->
Result
<
Response
,
ErrorResponse
>
{
)
->
Result
<
Response
,
ErrorResponse
>
{
// return a 503 if the service is not ready
// return a 503 if the service is not ready
check_ready
(
&
state
)
?
;
check_ready
(
&
state
)
?
;
// todo - extract distributed tracing id and context id from headers
let
request_id
=
get_or_create_request_id
(
request
.inner.user
.as_deref
(),
&
headers
);
let
request_id
=
uuid
::
Uuid
::
new_v4
()
.to_string
();
let
request
=
Context
::
with_id
(
request
,
request_id
);
let
request_id
=
request
.id
()
.to_string
();
// Embeddings are typically not streamed, so we default to non-streaming
// Embeddings are typically not streamed, so we default to non-streaming
let
streaming
=
false
;
let
streaming
=
false
;
...
@@ -381,10 +382,6 @@ async fn embeddings(
...
@@ -381,10 +382,6 @@ async fn embeddings(
.metrics_clone
()
.metrics_clone
()
.create_inflight_guard
(
model
,
Endpoint
::
Embeddings
,
streaming
);
.create_inflight_guard
(
model
,
Endpoint
::
Embeddings
,
streaming
);
// setup context
// todo - inherit request_id from distributed trace details
let
request
=
Context
::
with_id
(
request
,
request_id
.clone
());
// issue the generate call on the engine
// issue the generate call on the engine
let
stream
=
engine
let
stream
=
engine
.generate
(
request
)
.generate
(
request
)
...
...
lib/llm/src/preprocessor.rs
View file @
a485ab78
...
@@ -22,7 +22,6 @@ use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
...
@@ -22,7 +22,6 @@ use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
use
std
::{
collections
::
HashMap
,
sync
::
Arc
};
use
std
::{
collections
::
HashMap
,
sync
::
Arc
};
use
tracing
;
use
tracing
;
use
crate
::
local_model
::
runtime_config
::
ModelRuntimeConfig
;
use
crate
::
model_card
::{
ModelDeploymentCard
,
ModelInfo
,
TokenizerKind
};
use
crate
::
model_card
::{
ModelDeploymentCard
,
ModelInfo
,
TokenizerKind
};
use
crate
::
preprocessor
::
prompt
::
OAIChatLikeRequest
;
use
crate
::
preprocessor
::
prompt
::
OAIChatLikeRequest
;
use
crate
::
tokenizers
::
Encoding
;
use
crate
::
tokenizers
::
Encoding
;
...
@@ -95,7 +94,6 @@ pub struct OpenAIPreprocessor {
...
@@ -95,7 +94,6 @@ pub struct OpenAIPreprocessor {
formatter
:
Arc
<
dyn
OAIPromptFormatter
>
,
formatter
:
Arc
<
dyn
OAIPromptFormatter
>
,
tokenizer
:
Arc
<
dyn
Tokenizer
>
,
tokenizer
:
Arc
<
dyn
Tokenizer
>
,
model_info
:
Arc
<
dyn
ModelInfo
>
,
model_info
:
Arc
<
dyn
ModelInfo
>
,
runtime_config
:
ModelRuntimeConfig
,
}
}
impl
OpenAIPreprocessor
{
impl
OpenAIPreprocessor
{
...
@@ -123,14 +121,11 @@ impl OpenAIPreprocessor {
...
@@ -123,14 +121,11 @@ impl OpenAIPreprocessor {
};
};
let
model_info
=
model_info
.get_model_info
()
.await
?
;
let
model_info
=
model_info
.get_model_info
()
.await
?
;
let
runtime_config
=
mdc
.runtime_config
.clone
();
Ok
(
Arc
::
new
(
Self
{
Ok
(
Arc
::
new
(
Self
{
formatter
,
formatter
,
tokenizer
,
tokenizer
,
model_info
,
model_info
,
mdcsum
,
mdcsum
,
runtime_config
,
}))
}))
}
}
...
@@ -499,7 +494,7 @@ impl
...
@@ -499,7 +494,7 @@ impl
let
(
request
,
context
)
=
request
.into_parts
();
let
(
request
,
context
)
=
request
.into_parts
();
// create a response generator
// create a response generator
let
response_generator
=
request
.response_generator
(
self
.runtime_config
.clone
());
let
response_generator
=
request
.response_generator
(
context
.id
()
.to_string
());
let
mut
response_generator
=
Box
::
new
(
response_generator
);
let
mut
response_generator
=
Box
::
new
(
response_generator
);
// convert the chat completion request to a common completion request
// convert the chat completion request to a common completion request
...
@@ -553,7 +548,7 @@ impl
...
@@ -553,7 +548,7 @@ impl
let
(
request
,
context
)
=
request
.into_parts
();
let
(
request
,
context
)
=
request
.into_parts
();
// create a response generator
// create a response generator
let
response_generator
=
request
.response_generator
();
let
response_generator
=
request
.response_generator
(
context
.id
()
.to_string
()
);
let
mut
response_generator
=
Box
::
new
(
response_generator
);
let
mut
response_generator
=
Box
::
new
(
response_generator
);
// convert the chat completion request to a common completion request
// convert the chat completion request to a common completion request
let
(
common_request
,
annotations
)
=
self
.preprocess_request
(
&
request
)
?
;
let
(
common_request
,
annotations
)
=
self
.preprocess_request
(
&
request
)
?
;
...
...
lib/llm/src/protocols/openai/chat_completions/delta.rs
View file @
a485ab78
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
// SPDX-License-Identifier: Apache-2.0
use
dynamo_parsers
::{
ParserResult
,
ReasoningParser
,
ReasoningParserType
,
ReasoningParserWrapper
};
use
super
::{
NvCreateChatCompletionRequest
,
NvCreateChatCompletionStreamResponse
};
use
super
::{
NvCreateChatCompletionRequest
,
NvCreateChatCompletionStreamResponse
};
use
crate
::{
use
crate
::{
local_model
::
runtime_config
,
local_model
::
runtime_config
::
ModelRuntimeConfig
,
protocols
::
common
::{
self
},
protocols
::
common
::{
self
},
types
::
TokenIdType
,
types
::
TokenIdType
,
};
};
use
dynamo_parsers
::{
ParserResult
,
ReasoningParser
,
ReasoningParserType
,
ReasoningParserWrapper
};
/// Provides a method for generating a [`DeltaGenerator`] from a chat completion request.
/// Provides a method for generating a [`DeltaGenerator`] from a chat completion request.
impl
NvCreateChatCompletionRequest
{
impl
NvCreateChatCompletionRequest
{
/// Creates a [`DeltaGenerator`] instance based on the chat completion request.
/// Creates a [`DeltaGenerator`] instance based on the chat completion request.
///
///
/// # Arguments
/// * `request_id` - The request ID to use for the chat completion response ID.
///
/// # Returns
/// # Returns
/// * [`DeltaGenerator`] configured with model name and response options.
/// * [`DeltaGenerator`] configured with model name and response options.
pub
fn
response_generator
(
pub
fn
response_generator
(
&
self
,
request_id
:
String
)
->
DeltaGenerator
{
&
self
,
runtime_config
:
runtime_config
::
ModelRuntimeConfig
,
)
->
DeltaGenerator
{
let
options
=
DeltaGeneratorOptions
{
let
options
=
DeltaGeneratorOptions
{
enable_usage
:
true
,
enable_usage
:
true
,
enable_logprobs
:
self
.inner.logprobs
.unwrap_or
(
false
)
enable_logprobs
:
self
.inner.logprobs
.unwrap_or
(
false
)
||
self
.inner.top_logprobs
.unwrap_or
(
0
)
>
0
,
||
self
.inner.top_logprobs
.unwrap_or
(
0
)
>
0
,
runtime_config
,
runtime_config
:
ModelRuntimeConfig
::
default
()
,
};
};
DeltaGenerator
::
new
(
self
.inner.model
.clone
(),
options
)
DeltaGenerator
::
new
(
self
.inner.model
.clone
(),
options
,
request_id
)
}
}
}
}
...
@@ -39,7 +38,7 @@ pub struct DeltaGeneratorOptions {
...
@@ -39,7 +38,7 @@ pub struct DeltaGeneratorOptions {
/// Determines whether log probabilities should be included in the response.
/// Determines whether log probabilities should be included in the response.
pub
enable_logprobs
:
bool
,
pub
enable_logprobs
:
bool
,
pub
runtime_config
:
runtime_config
::
ModelRuntimeConfig
,
pub
runtime_config
:
ModelRuntimeConfig
,
}
}
/// Generates incremental chat completion responses in a streaming fashion.
/// Generates incremental chat completion responses in a streaming fashion.
...
@@ -74,10 +73,11 @@ impl DeltaGenerator {
...
@@ -74,10 +73,11 @@ impl DeltaGenerator {
/// # Arguments
/// # Arguments
/// * `model` - The model name used for response generation.
/// * `model` - The model name used for response generation.
/// * `options` - Configuration options for enabling usage and log probabilities.
/// * `options` - Configuration options for enabling usage and log probabilities.
/// * `request_id` - The request ID to use for the chat completion response.
///
///
/// # Returns
/// # Returns
/// * A new instance of [`DeltaGenerator`].
/// * A new instance of [`DeltaGenerator`].
pub
fn
new
(
model
:
String
,
options
:
DeltaGeneratorOptions
)
->
Self
{
pub
fn
new
(
model
:
String
,
options
:
DeltaGeneratorOptions
,
request_id
:
String
)
->
Self
{
let
now
=
std
::
time
::
SystemTime
::
now
()
let
now
=
std
::
time
::
SystemTime
::
now
()
.duration_since
(
std
::
time
::
UNIX_EPOCH
)
.duration_since
(
std
::
time
::
UNIX_EPOCH
)
.unwrap
()
.unwrap
()
...
@@ -108,8 +108,10 @@ impl DeltaGenerator {
...
@@ -108,8 +108,10 @@ impl DeltaGenerator {
.unwrap_or
(
"basic"
),
.unwrap_or
(
"basic"
),
);
);
let
chatcmpl_id
=
format!
(
"chatcmpl-{request_id}"
);
Self
{
Self
{
id
:
format!
(
"chatcmpl-{}"
,
uuid
::
Uuid
::
new_v4
())
,
id
:
chatcmpl_id
,
object
:
"chat.completion.chunk"
.to_string
(),
object
:
"chat.completion.chunk"
.to_string
(),
created
:
now
,
created
:
now
,
model
,
model
,
...
...
lib/llm/src/protocols/openai/completions/delta.rs
View file @
a485ab78
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use
super
::{
NvCreateCompletionRequest
,
NvCreateCompletionResponse
};
use
super
::{
NvCreateCompletionRequest
,
NvCreateCompletionResponse
};
use
crate
::{
protocols
::
common
,
types
::
TokenIdType
};
use
crate
::{
protocols
::
common
,
types
::
TokenIdType
};
...
@@ -19,13 +7,13 @@ use crate::{protocols::common, types::TokenIdType};
...
@@ -19,13 +7,13 @@ use crate::{protocols::common, types::TokenIdType};
impl
NvCreateCompletionRequest
{
impl
NvCreateCompletionRequest
{
// put this method on the request
// put this method on the request
// inspect the request to extract options
// inspect the request to extract options
pub
fn
response_generator
(
&
self
)
->
DeltaGenerator
{
pub
fn
response_generator
(
&
self
,
request_id
:
String
)
->
DeltaGenerator
{
let
options
=
DeltaGeneratorOptions
{
let
options
=
DeltaGeneratorOptions
{
enable_usage
:
true
,
enable_usage
:
true
,
enable_logprobs
:
self
.inner.logprobs
.unwrap_or
(
0
)
>
0
,
enable_logprobs
:
self
.inner.logprobs
.unwrap_or
(
0
)
>
0
,
};
};
DeltaGenerator
::
new
(
self
.inner.model
.clone
(),
options
)
DeltaGenerator
::
new
(
self
.inner.model
.clone
(),
options
,
request_id
)
}
}
}
}
...
@@ -47,7 +35,7 @@ pub struct DeltaGenerator {
...
@@ -47,7 +35,7 @@ pub struct DeltaGenerator {
}
}
impl
DeltaGenerator
{
impl
DeltaGenerator
{
pub
fn
new
(
model
:
String
,
options
:
DeltaGeneratorOptions
)
->
Self
{
pub
fn
new
(
model
:
String
,
options
:
DeltaGeneratorOptions
,
request_id
:
String
)
->
Self
{
let
now
=
std
::
time
::
SystemTime
::
now
()
let
now
=
std
::
time
::
SystemTime
::
now
()
.duration_since
(
std
::
time
::
UNIX_EPOCH
)
.duration_since
(
std
::
time
::
UNIX_EPOCH
)
.unwrap
()
.unwrap
()
...
@@ -67,8 +55,10 @@ impl DeltaGenerator {
...
@@ -67,8 +55,10 @@ impl DeltaGenerator {
prompt_tokens_details
:
None
,
prompt_tokens_details
:
None
,
};
};
let
completion_id
=
format!
(
"cmpl-{request_id}"
);
Self
{
Self
{
id
:
format!
(
"cmpl-{}"
,
uuid
::
Uuid
::
new_v4
())
,
id
:
completion_id
,
object
:
"text_completion"
.to_string
(),
object
:
"text_completion"
.to_string
(),
created
:
now
,
created
:
now
,
model
,
model
,
...
...
lib/llm/tests/http-service.rs
View file @
a485ab78
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use
anyhow
::
Error
;
use
anyhow
::
Error
;
use
async_stream
::
stream
;
use
async_stream
::
stream
;
use
dynamo_async_openai
::
config
::
OpenAIConfig
;
use
dynamo_async_openai
::
config
::
OpenAIConfig
;
use
dynamo_llm
::
http
::{
client
::{
GenericBYOTClient
,
HttpClientConfig
,
HttpRequestContext
,
NvCustomClient
,
PureOpenAIClient
,
},
service
::{
Metrics
,
error
::
HttpError
,
metrics
::{
Endpoint
,
FRONTEND_METRIC_PREFIX
,
RequestType
,
Status
},
service_v2
::
HttpService
,
},
};
use
dynamo_llm
::
protocols
::{
use
dynamo_llm
::
protocols
::{
Annotated
,
Annotated
,
codec
::
SseLineCodec
,
codec
::
SseLineCodec
,
...
@@ -25,21 +24,6 @@ use dynamo_llm::protocols::{
...
@@ -25,21 +24,6 @@ use dynamo_llm::protocols::{
completions
::{
NvCreateCompletionRequest
,
NvCreateCompletionResponse
},
completions
::{
NvCreateCompletionRequest
,
NvCreateCompletionResponse
},
},
},
};
};
use
dynamo_llm
::{
http
::{
client
::{
GenericBYOTClient
,
HttpClientConfig
,
HttpRequestContext
,
NvCustomClient
,
PureOpenAIClient
,
},
service
::{
Metrics
,
error
::
HttpError
,
metrics
::{
Endpoint
,
FRONTEND_METRIC_PREFIX
,
RequestType
,
Status
},
service_v2
::
HttpService
,
},
},
local_model
::
runtime_config
,
};
use
dynamo_runtime
::{
use
dynamo_runtime
::{
CancellationToken
,
CancellationToken
,
engine
::
AsyncEngineContext
,
engine
::
AsyncEngineContext
,
...
@@ -99,8 +83,7 @@ impl
...
@@ -99,8 +83,7 @@ impl
let
max_tokens
=
request
.inner.max_tokens
.unwrap_or
(
0
)
as
u64
;
let
max_tokens
=
request
.inner.max_tokens
.unwrap_or
(
0
)
as
u64
;
// let generator = NvCreateChatCompletionStreamResponse::generator(request.model.clone());
// let generator = NvCreateChatCompletionStreamResponse::generator(request.model.clone());
let
mut
generator
=
let
mut
generator
=
request
.response_generator
(
ctx
.id
()
.to_string
());
request
.response_generator
(
runtime_config
::
ModelRuntimeConfig
::
default
());
let
stream
=
stream!
{
let
stream
=
stream!
{
tokio
::
time
::
sleep
(
std
::
time
::
Duration
::
from_millis
(
max_tokens
))
.await
;
tokio
::
time
::
sleep
(
std
::
time
::
Duration
::
from_millis
(
max_tokens
))
.await
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment