Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
a9e0891c
Unverified
Commit
a9e0891c
authored
Jul 15, 2025
by
Ryan Olson
Committed by
GitHub
Jul 15, 2025
Browse files
feat: adding http clients and recorded response stream (#1919)
parent
4128d583
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
1670 additions
and
7 deletions
+1670
-7
Cargo.toml
Cargo.toml
+1
-1
lib/llm/src/http.rs
lib/llm/src/http.rs
+1
-0
lib/llm/src/http/client.rs
lib/llm/src/http/client.rs
+697
-0
lib/llm/src/lib.rs
lib/llm/src/lib.rs
+1
-0
lib/llm/src/perf.rs
lib/llm/src/perf.rs
+554
-0
lib/llm/tests/http-service.rs
lib/llm/tests/http-service.rs
+415
-5
lib/runtime/src/engine.rs
lib/runtime/src/engine.rs
+1
-1
No files found.
Cargo.toml
View file @
a9e0891c
...
...
@@ -45,7 +45,7 @@ dynamo-tokens = { path = "lib/tokens", version = "0.3.2" }
# External dependencies
anyhow
=
{
version
=
"1"
}
async-nats
=
{
version
=
"0.40"
,
features
=
["service"]
}
async-openai
=
{
version
=
"0.29.0"
}
async-openai
=
{
version
=
"0.29.0"
,
features
=
[
"rustls"
,
"byot"
]
}
async-stream
=
{
version
=
"0.3"
}
async-trait
=
{
version
=
"0.1"
}
async_zmq
=
{
version
=
"0.4.0"
}
...
...
lib/llm/src/http.rs
View file @
a9e0891c
...
...
@@ -13,4 +13,5 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub
mod
client
;
pub
mod
service
;
lib/llm/src/http/client.rs
0 → 100644
View file @
a9e0891c
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! HTTP clients for streaming LLM responses with performance recording
//!
//! This module provides HTTP clients that leverage async-openai with BYOT (Bring Your Own Types)
//! feature to work with OpenAI-compatible APIs. The clients support recording streaming responses
//! for performance analysis.
use
std
::
pin
::
Pin
;
use
std
::
sync
::
Arc
;
use
std
::
task
::{
Context
,
Poll
};
use
std
::
time
::
Instant
;
use
async_openai
::{
config
::
OpenAIConfig
,
error
::
OpenAIError
,
Client
};
use
async_trait
::
async_trait
;
use
derive_getters
::
Dissolve
;
use
futures
::
Stream
;
use
serde_json
::
Value
;
use
tokio_util
::
sync
::
CancellationToken
;
use
tracing
;
use
uuid
::
Uuid
;
// Import our existing recording infrastructure
use
crate
::
protocols
::
openai
::
chat_completions
::{
NvCreateChatCompletionRequest
,
NvCreateChatCompletionStreamResponse
,
};
use
crate
::
protocols
::
Annotated
;
use
dynamo_runtime
::
engine
::{
AsyncEngineContext
,
AsyncEngineContextProvider
,
AsyncEngineStream
,
Data
,
};
/// Configuration for HTTP clients
#[derive(Clone,
Default)]
pub
struct
HttpClientConfig
{
/// OpenAI API configuration
pub
openai_config
:
OpenAIConfig
,
/// Whether to enable detailed logging
pub
verbose
:
bool
,
}
/// Error types for HTTP clients
#[derive(Debug,
thiserror::Error)]
pub
enum
HttpClientError
{
#[error(
"OpenAI API error: {0}"
)]
OpenAI
(
#[from]
OpenAIError
),
#[error(
"Request timeout"
)]
Timeout
,
#[error(
"Request cancelled"
)]
Cancelled
,
#[error(
"Invalid request: {0}"
)]
InvalidRequest
(
String
),
}
/// Context for HTTP client requests that supports cancellation
/// This bridges AsyncEngineContext and reqwest cancellation
#[derive(Clone)]
pub
struct
HttpRequestContext
{
/// Unique request identifier
id
:
String
,
/// Tokio cancellation token for reqwest integration
cancel_token
:
CancellationToken
,
/// When this context was created
created_at
:
Instant
,
/// Whether the request has been stopped
stopped
:
Arc
<
std
::
sync
::
atomic
::
AtomicBool
>
,
}
impl
HttpRequestContext
{
/// Create a new HTTP request context
pub
fn
new
()
->
Self
{
Self
{
id
:
Uuid
::
new_v4
()
.to_string
(),
cancel_token
:
CancellationToken
::
new
(),
created_at
:
Instant
::
now
(),
stopped
:
Arc
::
new
(
std
::
sync
::
atomic
::
AtomicBool
::
new
(
false
)),
}
}
/// Create a new context with a specific ID
pub
fn
with_id
(
id
:
String
)
->
Self
{
Self
{
id
,
cancel_token
:
CancellationToken
::
new
(),
created_at
:
Instant
::
now
(),
stopped
:
Arc
::
new
(
std
::
sync
::
atomic
::
AtomicBool
::
new
(
false
)),
}
}
/// Create a child context from this parent context
/// The child will be cancelled when the parent is cancelled
pub
fn
child
(
&
self
)
->
Self
{
Self
{
id
:
Uuid
::
new_v4
()
.to_string
(),
cancel_token
:
self
.cancel_token
.child_token
(),
created_at
:
Instant
::
now
(),
stopped
:
Arc
::
new
(
std
::
sync
::
atomic
::
AtomicBool
::
new
(
false
)),
}
}
/// Create a child context with a specific ID
pub
fn
child_with_id
(
&
self
,
id
:
String
)
->
Self
{
Self
{
id
,
cancel_token
:
self
.cancel_token
.child_token
(),
created_at
:
Instant
::
now
(),
stopped
:
Arc
::
new
(
std
::
sync
::
atomic
::
AtomicBool
::
new
(
false
)),
}
}
/// Get the cancellation token for use with reqwest
pub
fn
cancellation_token
(
&
self
)
->
CancellationToken
{
self
.cancel_token
.clone
()
}
/// Get the elapsed time since context creation
pub
fn
elapsed
(
&
self
)
->
std
::
time
::
Duration
{
self
.created_at
.elapsed
()
}
}
impl
Default
for
HttpRequestContext
{
fn
default
()
->
Self
{
Self
::
new
()
}
}
impl
std
::
fmt
::
Debug
for
HttpRequestContext
{
fn
fmt
(
&
self
,
f
:
&
mut
std
::
fmt
::
Formatter
<
'_
>
)
->
std
::
fmt
::
Result
{
f
.debug_struct
(
"HttpRequestContext"
)
.field
(
"id"
,
&
self
.id
)
.field
(
"created_at"
,
&
self
.created_at
)
.field
(
"is_stopped"
,
&
self
.is_stopped
())
.field
(
"is_killed"
,
&
self
.is_killed
())
.field
(
"is_cancelled"
,
&
self
.cancel_token
.is_cancelled
())
.finish
()
}
}
#[async_trait]
impl
AsyncEngineContext
for
HttpRequestContext
{
fn
id
(
&
self
)
->
&
str
{
&
self
.id
}
fn
stop
(
&
self
)
{
self
.stopped
.store
(
true
,
std
::
sync
::
atomic
::
Ordering
::
Release
);
self
.cancel_token
.cancel
();
}
fn
stop_generating
(
&
self
)
{
// For HTTP clients, stop_generating is the same as stop
self
.stop
();
}
fn
kill
(
&
self
)
{
self
.stopped
.store
(
true
,
std
::
sync
::
atomic
::
Ordering
::
Release
);
self
.cancel_token
.cancel
();
}
fn
is_stopped
(
&
self
)
->
bool
{
self
.stopped
.load
(
std
::
sync
::
atomic
::
Ordering
::
Acquire
)
}
fn
is_killed
(
&
self
)
->
bool
{
self
.stopped
.load
(
std
::
sync
::
atomic
::
Ordering
::
Acquire
)
}
async
fn
stopped
(
&
self
)
{
self
.cancel_token
.cancelled
()
.await
;
}
async
fn
killed
(
&
self
)
{
// For HTTP clients, killed is the same as stopped
self
.cancel_token
.cancelled
()
.await
;
}
}
/// Base HTTP client with common functionality
pub
struct
BaseHttpClient
{
/// async-openai client
client
:
Client
<
OpenAIConfig
>
,
/// Client configuration
config
:
HttpClientConfig
,
/// Root context for this client
root_context
:
HttpRequestContext
,
}
impl
BaseHttpClient
{
/// Create a new base HTTP client
pub
fn
new
(
config
:
HttpClientConfig
)
->
Self
{
let
client
=
Client
::
with_config
(
config
.openai_config
.clone
());
Self
{
client
,
config
,
root_context
:
HttpRequestContext
::
new
(),
}
}
/// Get a reference to the underlying async-openai client
pub
fn
client
(
&
self
)
->
&
Client
<
OpenAIConfig
>
{
&
self
.client
}
/// Create a new request context as a child of the root context
pub
fn
create_context
(
&
self
)
->
HttpRequestContext
{
self
.root_context
.child
()
}
/// Create a new request context with a specific ID as a child of the root context
pub
fn
create_context_with_id
(
&
self
,
id
:
String
)
->
HttpRequestContext
{
self
.root_context
.child_with_id
(
id
)
}
/// Get the root context for this client
pub
fn
root_context
(
&
self
)
->
&
HttpRequestContext
{
&
self
.root_context
}
/// Check if verbose logging is enabled
pub
fn
is_verbose
(
&
self
)
->
bool
{
self
.config.verbose
}
}
/// Type alias for NV chat response stream
pub
type
NvChatResponseStream
=
Pin
<
Box
<
dyn
Stream
<
Item
=
Result
<
Annotated
<
NvCreateChatCompletionStreamResponse
>
,
OpenAIError
>>
+
Send
+
Sync
,
>
,
>
;
/// Type alias for generic BYOT response stream
pub
type
ByotResponseStream
=
Pin
<
Box
<
dyn
Stream
<
Item
=
Result
<
Value
,
OpenAIError
>>
+
Send
+
Sync
>>
;
/// Type alias for pure OpenAI chat response stream
pub
type
OpenAIChatResponseStream
=
Pin
<
Box
<
dyn
Stream
<
Item
=
Result
<
async_openai
::
types
::
CreateChatCompletionStreamResponse
,
OpenAIError
>
,
>
+
Send
+
Sync
,
>
,
>
;
/// A wrapped HTTP response stream that combines a stream with its context
/// This provides a unified interface for HTTP client responses
#[derive(Dissolve)]
pub
struct
HttpResponseStream
<
T
>
{
/// The underlying stream of responses
pub
stream
:
Pin
<
Box
<
dyn
Stream
<
Item
=
T
>
+
Send
>>
,
/// The context for this request
pub
context
:
Arc
<
dyn
AsyncEngineContext
>
,
}
impl
<
T
>
HttpResponseStream
<
T
>
{
/// Create a new HttpResponseStream
pub
fn
new
(
stream
:
Pin
<
Box
<
dyn
Stream
<
Item
=
T
>
+
Send
>>
,
context
:
Arc
<
dyn
AsyncEngineContext
>
,
)
->
Self
{
Self
{
stream
,
context
}
}
}
impl
<
T
:
Data
>
Stream
for
HttpResponseStream
<
T
>
{
type
Item
=
T
;
fn
poll_next
(
mut
self
:
Pin
<&
mut
Self
>
,
cx
:
&
mut
Context
<
'_
>
)
->
Poll
<
Option
<
Self
::
Item
>>
{
Pin
::
new
(
&
mut
self
.stream
)
.poll_next
(
cx
)
}
}
impl
<
T
:
Data
>
AsyncEngineContextProvider
for
HttpResponseStream
<
T
>
{
fn
context
(
&
self
)
->
Arc
<
dyn
AsyncEngineContext
>
{
self
.context
.clone
()
}
}
impl
<
T
:
Data
>
HttpResponseStream
<
T
>
{
/// Convert this HttpResponseStream to a Pin<Box<dyn AsyncEngineStream<T>>>
/// This requires the stream to be Send + Sync, which may not be true for all streams
pub
fn
into_async_engine_stream
(
self
)
->
Pin
<
Box
<
dyn
AsyncEngineStream
<
T
>>>
where
T
:
'static
,
{
// This will only work if the underlying stream is actually Send + Sync
// For now, we create a wrapper that assumes this
Box
::
pin
(
AsyncEngineStreamWrapper
{
stream
:
self
.stream
,
context
:
self
.context
,
})
}
}
/// A wrapper that implements AsyncEngineStream for streams that are Send + Sync
struct
AsyncEngineStreamWrapper
<
T
>
{
stream
:
Pin
<
Box
<
dyn
Stream
<
Item
=
T
>
+
Send
>>
,
context
:
Arc
<
dyn
AsyncEngineContext
>
,
}
impl
<
T
:
Data
>
Stream
for
AsyncEngineStreamWrapper
<
T
>
{
type
Item
=
T
;
fn
poll_next
(
mut
self
:
Pin
<&
mut
Self
>
,
cx
:
&
mut
Context
<
'_
>
)
->
Poll
<
Option
<
Self
::
Item
>>
{
Pin
::
new
(
&
mut
self
.stream
)
.poll_next
(
cx
)
}
}
impl
<
T
:
Data
>
AsyncEngineContextProvider
for
AsyncEngineStreamWrapper
<
T
>
{
fn
context
(
&
self
)
->
Arc
<
dyn
AsyncEngineContext
>
{
self
.context
.clone
()
}
}
// This is unsafe because we're claiming the stream is Sync when it might not be
// But this is needed for the AsyncEngineStream trait
unsafe
impl
<
T
>
Sync
for
AsyncEngineStreamWrapper
<
T
>
{}
impl
<
T
:
Data
>
AsyncEngineStream
<
T
>
for
AsyncEngineStreamWrapper
<
T
>
{}
impl
<
T
>
std
::
fmt
::
Debug
for
AsyncEngineStreamWrapper
<
T
>
{
fn
fmt
(
&
self
,
f
:
&
mut
std
::
fmt
::
Formatter
<
'_
>
)
->
std
::
fmt
::
Result
{
f
.debug_struct
(
"AsyncEngineStreamWrapper"
)
.field
(
"context"
,
&
self
.context
)
.finish
()
}
}
impl
<
T
:
Data
>
std
::
fmt
::
Debug
for
HttpResponseStream
<
T
>
{
fn
fmt
(
&
self
,
f
:
&
mut
std
::
fmt
::
Formatter
<
'_
>
)
->
std
::
fmt
::
Result
{
f
.debug_struct
(
"HttpResponseStream"
)
.field
(
"context"
,
&
self
.context
)
.finish
()
}
}
/// Type alias for HttpResponseStream with NV chat completion responses
pub
type
NvHttpResponseStream
=
HttpResponseStream
<
Result
<
Annotated
<
NvCreateChatCompletionStreamResponse
>
,
OpenAIError
>>
;
/// Type alias for HttpResponseStream with BYOT responses
pub
type
ByotHttpResponseStream
=
HttpResponseStream
<
Result
<
Value
,
OpenAIError
>>
;
/// Type alias for HttpResponseStream with pure OpenAI responses
pub
type
OpenAIHttpResponseStream
=
HttpResponseStream
<
Result
<
async_openai
::
types
::
CreateChatCompletionStreamResponse
,
OpenAIError
>
,
>
;
/// Pure OpenAI client using standard async-openai types
pub
struct
PureOpenAIClient
{
base
:
BaseHttpClient
,
}
impl
PureOpenAIClient
{
/// Create a new pure OpenAI client
pub
fn
new
(
config
:
HttpClientConfig
)
->
Self
{
Self
{
base
:
BaseHttpClient
::
new
(
config
),
}
}
/// Create streaming chat completions using standard OpenAI types
/// Uses a client-managed context
pub
async
fn
chat_stream
(
&
self
,
request
:
async_openai
::
types
::
CreateChatCompletionRequest
,
)
->
Result
<
OpenAIHttpResponseStream
,
HttpClientError
>
{
let
ctx
=
self
.base
.create_context
();
self
.chat_stream_with_context
(
request
,
ctx
)
.await
}
/// Create streaming chat completions with a custom context
pub
async
fn
chat_stream_with_context
(
&
self
,
request
:
async_openai
::
types
::
CreateChatCompletionRequest
,
context
:
HttpRequestContext
,
)
->
Result
<
OpenAIHttpResponseStream
,
HttpClientError
>
{
let
ctx_arc
:
Arc
<
dyn
AsyncEngineContext
>
=
Arc
::
new
(
context
.clone
());
if
!
request
.stream
.unwrap_or
(
false
)
{
return
Err
(
HttpClientError
::
InvalidRequest
(
"chat_stream requires the request to have 'stream': true"
.to_string
(),
));
}
if
self
.base
.is_verbose
()
{
tracing
::
info!
(
"Starting pure OpenAI chat stream for request {}"
,
context
.id
()
);
}
// Create the stream with cancellation support
let
stream
=
self
.base
.client
()
.chat
()
.create_stream
(
request
)
.await
.map_err
(
HttpClientError
::
OpenAI
)
?
;
// TODO: In Phase 3, we'll add cancellation integration with reqwest
// For now, return the stream as-is
Ok
(
HttpResponseStream
::
new
(
stream
,
ctx_arc
))
}
}
/// NV Custom client using NvCreateChatCompletionRequest with Annotated responses
pub
struct
NvCustomClient
{
base
:
BaseHttpClient
,
}
impl
NvCustomClient
{
/// Create a new NV custom client
pub
fn
new
(
config
:
HttpClientConfig
)
->
Self
{
Self
{
base
:
BaseHttpClient
::
new
(
config
),
}
}
/// Create streaming chat completions using NV custom types
/// Uses a client-managed context
pub
async
fn
chat_stream
(
&
self
,
request
:
NvCreateChatCompletionRequest
,
)
->
Result
<
NvHttpResponseStream
,
HttpClientError
>
{
let
ctx
=
self
.base
.create_context
();
self
.chat_stream_with_context
(
request
,
ctx
)
.await
}
/// Create streaming chat completions with a custom context
pub
async
fn
chat_stream_with_context
(
&
self
,
request
:
NvCreateChatCompletionRequest
,
context
:
HttpRequestContext
,
)
->
Result
<
NvHttpResponseStream
,
HttpClientError
>
{
let
ctx_arc
:
Arc
<
dyn
AsyncEngineContext
>
=
Arc
::
new
(
context
.clone
());
if
!
request
.inner.stream
.unwrap_or
(
false
)
{
return
Err
(
HttpClientError
::
InvalidRequest
(
"chat_stream requires the request to have 'stream': true"
.to_string
(),
));
}
if
self
.base
.is_verbose
()
{
tracing
::
info!
(
"Starting NV custom chat stream for request {}"
,
context
.id
()
);
}
// Use BYOT feature to send NvCreateChatCompletionRequest
// The stream type is explicitly specified to deserialize directly into Annotated<NvCreateChatCompletionStreamResponse>
let
stream
=
self
.base
.client
()
.chat
()
.create_stream_byot
(
request
)
.await
.map_err
(
HttpClientError
::
OpenAI
)
?
;
Ok
(
HttpResponseStream
::
new
(
stream
,
ctx_arc
))
}
}
/// Generic BYOT client using serde_json::Value for maximum flexibility
pub
struct
GenericBYOTClient
{
base
:
BaseHttpClient
,
}
impl
GenericBYOTClient
{
/// Create a new generic BYOT client
pub
fn
new
(
config
:
HttpClientConfig
)
->
Self
{
Self
{
base
:
BaseHttpClient
::
new
(
config
),
}
}
/// Create streaming chat completions using arbitrary JSON values
/// Uses a client-managed context
pub
async
fn
chat_stream
(
&
self
,
request
:
Value
,
)
->
Result
<
ByotHttpResponseStream
,
HttpClientError
>
{
let
ctx
=
self
.base
.create_context
();
self
.chat_stream_with_context
(
request
,
ctx
)
.await
}
/// Create streaming chat completions with a custom context
pub
async
fn
chat_stream_with_context
(
&
self
,
request
:
Value
,
context
:
HttpRequestContext
,
)
->
Result
<
ByotHttpResponseStream
,
HttpClientError
>
{
let
ctx_arc
:
Arc
<
dyn
AsyncEngineContext
>
=
Arc
::
new
(
context
.clone
());
if
self
.base
.is_verbose
()
{
tracing
::
info!
(
"Starting generic BYOT chat stream for request {}"
,
context
.id
()
);
}
// Validate that the request has stream: true
if
let
Some
(
stream_val
)
=
request
.get
(
"stream"
)
{
if
!
stream_val
.as_bool
()
.unwrap_or
(
false
)
{
return
Err
(
HttpClientError
::
InvalidRequest
(
"Request must have 'stream': true for streaming"
.to_string
(),
));
}
}
else
{
return
Err
(
HttpClientError
::
InvalidRequest
(
"Request must include 'stream' field"
.to_string
(),
));
}
// Use BYOT feature with raw JSON
// The stream type is explicitly specified to deserialize directly into serde_json::Value
let
stream
=
self
.base
.client
()
.chat
()
.create_stream_byot
(
request
)
.await
.map_err
(
HttpClientError
::
OpenAI
)
?
;
Ok
(
HttpResponseStream
::
new
(
stream
,
ctx_arc
))
}
}
// TODO: Implement recording integration in Phase 3:
// - Recording wrapper functions
// - Capacity hints from request parameters
// - Integration with existing recording infrastructure
#[cfg(test)]
mod
tests
{
use
super
::
*
;
use
tokio
::
time
::{
sleep
,
Duration
};
#[tokio::test]
async
fn
test_http_request_context_creation
()
{
let
ctx
=
HttpRequestContext
::
new
();
assert
!
(
!
ctx
.id
()
.is_empty
());
assert
!
(
!
ctx
.is_stopped
());
assert
!
(
!
ctx
.is_killed
());
}
#[tokio::test]
async
fn
test_http_request_context_child
()
{
let
parent
=
HttpRequestContext
::
new
();
let
child
=
parent
.child
();
// Child should have different ID
assert_ne!
(
parent
.id
(),
child
.id
());
// Child should not be stopped initially
assert
!
(
!
child
.is_stopped
());
// When parent is stopped, child should be cancelled via token
parent
.stop
();
assert
!
(
parent
.is_stopped
());
assert
!
(
child
.cancellation_token
()
.is_cancelled
());
}
#[tokio::test]
async
fn
test_http_request_context_child_with_id
()
{
let
parent
=
HttpRequestContext
::
new
();
let
child_id
=
"test-child"
;
let
child
=
parent
.child_with_id
(
child_id
.to_string
());
assert_eq!
(
child
.id
(),
child_id
);
assert
!
(
!
child
.is_stopped
());
// Test hierarchical cancellation
parent
.stop
();
assert
!
(
child
.cancellation_token
()
.is_cancelled
());
}
#[tokio::test]
async
fn
test_http_request_context_cancellation
()
{
let
ctx
=
HttpRequestContext
::
new
();
let
cancel_token
=
ctx
.cancellation_token
();
// Test stop functionality
assert
!
(
!
ctx
.is_stopped
());
ctx
.stop
();
assert
!
(
ctx
.is_stopped
());
assert
!
(
cancel_token
.is_cancelled
());
}
#[tokio::test]
async
fn
test_http_request_context_kill
()
{
let
ctx
=
HttpRequestContext
::
new
();
// Test kill functionality
assert
!
(
!
ctx
.is_killed
());
ctx
.kill
();
assert
!
(
ctx
.is_killed
());
assert
!
(
ctx
.is_stopped
());
}
#[tokio::test]
async
fn
test_http_request_context_async_cancellation
()
{
let
ctx
=
HttpRequestContext
::
new
();
// Test async cancellation
let
ctx_clone
=
ctx
.clone
();
let
task
=
tokio
::
spawn
(
async
move
{
ctx_clone
.stopped
()
.await
;
});
// Give a moment for the task to start waiting
sleep
(
Duration
::
from_millis
(
10
))
.await
;
// Cancel the context
ctx
.stop
();
// The task should complete
task
.await
.unwrap
();
}
#[test]
fn
test_base_http_client_creation
()
{
let
config
=
HttpClientConfig
::
default
();
let
client
=
BaseHttpClient
::
new
(
config
);
assert
!
(
!
client
.is_verbose
());
// Test that client has a root context
assert
!
(
!
client
.root_context
()
.id
()
.is_empty
());
}
#[test]
fn
test_base_http_client_context_creation
()
{
let
config
=
HttpClientConfig
::
default
();
let
client
=
BaseHttpClient
::
new
(
config
);
// Test creating child contexts
let
ctx1
=
client
.create_context
();
let
ctx2
=
client
.create_context
();
// Should have different IDs
assert_ne!
(
ctx1
.id
(),
ctx2
.id
());
// Should be children of root context
client
.root_context
()
.stop
();
assert
!
(
ctx1
.cancellation_token
()
.is_cancelled
());
assert
!
(
ctx2
.cancellation_token
()
.is_cancelled
());
}
#[test]
fn
test_base_http_client_context_with_id
()
{
let
config
=
HttpClientConfig
::
default
();
let
client
=
BaseHttpClient
::
new
(
config
);
let
custom_id
=
"custom-request-id"
;
let
ctx
=
client
.create_context_with_id
(
custom_id
.to_string
());
assert_eq!
(
ctx
.id
(),
custom_id
);
// Should still be child of root
client
.root_context
()
.stop
();
assert
!
(
ctx
.cancellation_token
()
.is_cancelled
());
}
#[test]
fn
test_http_client_config_defaults
()
{
let
config
=
HttpClientConfig
::
default
();
assert
!
(
!
config
.verbose
);
}
#[test]
fn
test_pure_openai_client_creation
()
{
let
config
=
HttpClientConfig
::
default
();
let
_
client
=
PureOpenAIClient
::
new
(
config
);
// If we get here, creation succeeded
}
#[test]
fn
test_nv_custom_client_creation
()
{
let
config
=
HttpClientConfig
::
default
();
let
_
client
=
NvCustomClient
::
new
(
config
);
// If we get here, creation succeeded
}
#[test]
fn
test_generic_byot_client_creation
()
{
let
config
=
HttpClientConfig
::
default
();
let
_
client
=
GenericBYOTClient
::
new
(
config
);
// If we get here, creation succeeded
}
}
lib/llm/src/lib.rs
View file @
a9e0891c
...
...
@@ -25,6 +25,7 @@ pub mod local_model;
pub
mod
mocker
;
pub
mod
model_card
;
pub
mod
model_type
;
pub
mod
perf
;
pub
mod
preprocessor
;
pub
mod
protocols
;
pub
mod
recorder
;
...
...
lib/llm/src/perf.rs
0 → 100644
View file @
a9e0891c
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Performance recording and analysis for streaming LLM responses
//!
//! This module provides mechanisms to record streaming responses with minimal overhead
//! during collection, then analyze the recorded data for performance insights.
use
futures
::
Stream
;
use
std
::
pin
::
Pin
;
use
std
::
task
::{
Context
,
Poll
};
use
std
::
time
::{
Duration
,
Instant
};
use
tokio
::
sync
::
oneshot
;
// Import the runtime types we need
use
dynamo_runtime
::
engine
::{
AsyncEngineContext
,
AsyncEngineContextProvider
,
AsyncEngineStream
,
Data
,
DataStream
,
EngineStream
,
ResponseStream
,
};
use
std
::
sync
::
Arc
;
/// Type alias for a receiver of recorded stream data
pub
type
RecordedStreamReceiver
<
R
>
=
oneshot
::
Receiver
<
RecordedStream
<
R
>>
;
/// Type alias for the return type of recording functions
pub
type
RecordingResult
<
R
>
=
(
EngineStream
<
R
>
,
RecordedStreamReceiver
<
R
>
);
/// A response wrapper that adds timing information with minimal overhead
#[derive(Debug,
Clone)]
pub
struct
TimestampedResponse
<
T
>
{
/// The actual response data
pub
response
:
T
,
/// High-resolution timestamp when this response was recorded
pub
timestamp
:
Instant
,
/// Sequence number in the stream (0-based)
pub
sequence_number
:
usize
,
}
impl
<
T
>
TimestampedResponse
<
T
>
{
/// Create a new timestamped response
pub
fn
new
(
response
:
T
,
sequence_number
:
usize
)
->
Self
{
Self
{
response
,
timestamp
:
Instant
::
now
(),
sequence_number
,
}
}
/// Get the response data
pub
fn
data
(
&
self
)
->
&
T
{
&
self
.response
}
/// Get the elapsed time since stream start
pub
fn
elapsed_since
(
&
self
,
start_time
:
Instant
)
->
Duration
{
self
.timestamp
.duration_since
(
start_time
)
}
}
/// Trait for requests that can provide hints about expected response count
/// This enables capacity pre-allocation for better performance
pub
trait
CapacityHint
{
/// Estimate the number of responses this request might generate
/// Returns None if estimation is not possible
fn
estimated_response_count
(
&
self
)
->
Option
<
usize
>
;
}
/// Recording mode determines how the recorder behaves with the stream
#[derive(Debug,
Clone,
Copy,
PartialEq,
Eq)]
pub
enum
RecordingMode
{
/// Pass responses through while recording (scan mode)
/// Stream continues to flow to downstream consumers
Scan
,
/// Consume responses as terminus (sink mode)
/// Stream ends at the recorder
Sink
,
}
/// Container for recorded streaming responses.
/// This forms the core object on which analysis is performed.
#[derive(Debug,
Clone)]
pub
struct
RecordedStream
<
T
>
{
/// All recorded responses with timestamps
responses
:
Vec
<
TimestampedResponse
<
T
>>
,
/// When recording started
start_time
:
Instant
,
/// When recording ended
end_time
:
Instant
,
}
impl
<
T
>
RecordedStream
<
T
>
{
/// Create a new recorded stream from collected responses
pub
fn
new
(
responses
:
Vec
<
TimestampedResponse
<
T
>>
,
start_time
:
Instant
,
end_time
:
Instant
,
)
->
Self
{
Self
{
responses
,
start_time
,
end_time
,
}
}
/// Get the number of responses recorded
pub
fn
response_count
(
&
self
)
->
usize
{
self
.responses
.len
()
}
/// Get the total duration of the stream
pub
fn
total_duration
(
&
self
)
->
Duration
{
self
.end_time
.duration_since
(
self
.start_time
)
}
/// Get the responses recorded
pub
fn
responses
(
&
self
)
->
&
[
TimestampedResponse
<
T
>
]
{
&
self
.responses
}
/// Get the start time of the stream
pub
fn
start_time
(
&
self
)
->
&
Instant
{
&
self
.start_time
}
/// Get the end time of the stream
pub
fn
end_time
(
&
self
)
->
&
Instant
{
&
self
.end_time
}
}
/// Recording stream that wraps an AsyncEngineStream and records responses
/// Following the pattern of ResponseStream for AsyncEngine compatibility
pub
struct
RecordingStream
<
R
:
Data
>
{
/// The wrapped stream
stream
:
DataStream
<
R
>
,
/// Context from the original stream
ctx
:
Arc
<
dyn
AsyncEngineContext
>
,
/// Recording mode
mode
:
RecordingMode
,
/// Recorded responses
responses
:
Vec
<
TimestampedResponse
<
R
>>
,
/// When recording started
start_time
:
Instant
,
/// Channel to send recorded data when stream completes
recorded_tx
:
Option
<
oneshot
::
Sender
<
RecordedStream
<
R
>>>
,
}
impl
<
R
:
Data
>
Unpin
for
RecordingStream
<
R
>
{}
impl
<
R
:
Data
+
Clone
>
RecordingStream
<
R
>
{
/// Create a new recording stream from a raw stream and context
pub
fn
from_stream_and_context
(
stream
:
DataStream
<
R
>
,
ctx
:
Arc
<
dyn
AsyncEngineContext
>
,
mode
:
RecordingMode
,
capacity
:
Option
<
usize
>
,
recorded_tx
:
oneshot
::
Sender
<
RecordedStream
<
R
>>
,
)
->
Self
{
let
mut
responses
=
Vec
::
new
();
if
let
Some
(
cap
)
=
capacity
{
responses
.reserve
(
cap
);
}
Self
{
stream
,
ctx
,
mode
,
responses
,
start_time
:
Instant
::
now
(),
recorded_tx
:
Some
(
recorded_tx
),
}
}
/// Create a new recording stream from an AsyncEngineStream (private constructor)
fn
from_async_engine_stream
(
stream
:
EngineStream
<
R
>
,
mode
:
RecordingMode
,
capacity
:
Option
<
usize
>
,
recorded_tx
:
oneshot
::
Sender
<
RecordedStream
<
R
>>
,
)
->
Self
{
let
ctx
=
stream
.context
();
Self
::
from_stream_and_context
(
stream
,
ctx
,
mode
,
capacity
,
recorded_tx
)
}
/// Convert to Pin<Box<dyn AsyncEngineStream<R>>>
pub
fn
into_async_engine_stream
(
self
)
->
EngineStream
<
R
>
{
Box
::
pin
(
self
)
}
}
impl
<
R
:
Data
+
Clone
>
Stream
for
RecordingStream
<
R
>
{
type
Item
=
R
;
fn
poll_next
(
mut
self
:
Pin
<&
mut
Self
>
,
cx
:
&
mut
Context
<
'_
>
)
->
Poll
<
Option
<
Self
::
Item
>>
{
let
this
=
self
.as_mut
()
.get_mut
();
match
Pin
::
new
(
&
mut
this
.stream
)
.poll_next
(
cx
)
{
Poll
::
Ready
(
Some
(
item
))
=>
{
// Always capture timestamp first (cheap operation)
let
timestamp
=
Instant
::
now
();
let
sequence_number
=
this
.responses
.len
();
match
this
.mode
{
RecordingMode
::
Scan
=>
{
// Clone for recording, pass original through
let
timestamped
=
TimestampedResponse
{
response
:
item
.clone
(),
timestamp
,
sequence_number
,
};
this
.responses
.push
(
timestamped
);
Poll
::
Ready
(
Some
(
item
))
// Pass through original
}
RecordingMode
::
Sink
=>
{
// Move item directly into recording (no clone needed)
let
timestamped
=
TimestampedResponse
{
response
:
item
,
// Move, don't clone
timestamp
,
sequence_number
,
};
this
.responses
.push
(
timestamped
);
// Continue consuming but don't emit
// self.poll_next(cx)
cx
.waker
()
.wake_by_ref
();
Poll
::
Pending
}
}
}
Poll
::
Ready
(
None
)
=>
{
// Stream ended - send recorded data
if
let
Some
(
tx
)
=
this
.recorded_tx
.take
()
{
let
recorded
=
RecordedStream
::
new
(
std
::
mem
::
take
(
&
mut
this
.responses
),
this
.start_time
,
Instant
::
now
(),
);
let
_
=
tx
.send
(
recorded
);
// Ignore if receiver dropped
}
Poll
::
Ready
(
None
)
}
Poll
::
Pending
=>
Poll
::
Pending
,
}
}
}
impl
<
R
:
Data
+
Clone
>
AsyncEngineStream
<
R
>
for
RecordingStream
<
R
>
{}
impl
<
R
:
Data
+
Clone
>
AsyncEngineContextProvider
for
RecordingStream
<
R
>
{
fn
context
(
&
self
)
->
Arc
<
dyn
AsyncEngineContext
>
{
self
.ctx
.clone
()
}
}
impl
<
R
:
Data
+
Clone
>
std
::
fmt
::
Debug
for
RecordingStream
<
R
>
{
fn
fmt
(
&
self
,
f
:
&
mut
std
::
fmt
::
Formatter
<
'_
>
)
->
std
::
fmt
::
Result
{
f
.debug_struct
(
"RecordingStream"
)
.field
(
"mode"
,
&
self
.mode
)
.field
(
"responses_count"
,
&
self
.responses
.len
())
.field
(
"ctx"
,
&
self
.ctx
)
.finish
()
}
}
/// Create a recording stream that wraps an AsyncEngineStream
/// Returns a pinned stream and a receiver for the recorded data
pub
fn
record_stream
<
R
:
Data
+
Clone
>
(
stream
:
EngineStream
<
R
>
,
mode
:
RecordingMode
,
)
->
RecordingResult
<
R
>
{
let
(
tx
,
rx
)
=
oneshot
::
channel
();
let
recording_stream
=
RecordingStream
::
from_async_engine_stream
(
stream
,
mode
,
None
,
tx
);
let
boxed_stream
=
Box
::
pin
(
recording_stream
);
(
boxed_stream
,
rx
)
}
/// Create a recording stream from a raw stream and context
/// Returns a pinned stream and a receiver for the recorded data
pub
fn
record_stream_with_context
<
R
:
Data
+
Clone
>
(
stream
:
DataStream
<
R
>
,
ctx
:
Arc
<
dyn
AsyncEngineContext
>
,
mode
:
RecordingMode
,
)
->
RecordingResult
<
R
>
{
let
(
tx
,
rx
)
=
oneshot
::
channel
();
let
recording_stream
=
RecordingStream
::
from_stream_and_context
(
stream
,
ctx
,
mode
,
None
,
tx
);
let
boxed_stream
=
Box
::
pin
(
recording_stream
);
(
boxed_stream
,
rx
)
}
/// Create a recording stream with capacity hint
pub
fn
record_stream_with_capacity
<
R
:
Data
+
Clone
>
(
stream
:
EngineStream
<
R
>
,
mode
:
RecordingMode
,
capacity
:
usize
,
)
->
RecordingResult
<
R
>
{
let
(
tx
,
rx
)
=
oneshot
::
channel
();
let
recording_stream
=
RecordingStream
::
from_async_engine_stream
(
stream
,
mode
,
Some
(
capacity
),
tx
);
let
boxed_stream
=
Box
::
pin
(
recording_stream
);
(
boxed_stream
,
rx
)
}
/// Create a recording stream with capacity hint from request
pub
fn
record_stream_with_request_hint
<
R
:
Data
+
Clone
,
Req
:
CapacityHint
>
(
stream
:
EngineStream
<
R
>
,
mode
:
RecordingMode
,
request
:
&
Req
,
)
->
RecordingResult
<
R
>
{
let
capacity
=
request
.estimated_response_count
();
match
capacity
{
Some
(
cap
)
=>
record_stream_with_capacity
(
stream
,
mode
,
cap
),
None
=>
record_stream
(
stream
,
mode
),
}
}
/// Create a recording stream from a raw stream and context with capacity hint
pub
fn
record_stream_with_context_and_capacity
<
R
:
Data
+
Clone
>
(
stream
:
DataStream
<
R
>
,
ctx
:
Arc
<
dyn
AsyncEngineContext
>
,
mode
:
RecordingMode
,
capacity
:
usize
,
)
->
RecordingResult
<
R
>
{
let
(
tx
,
rx
)
=
oneshot
::
channel
();
let
recording_stream
=
RecordingStream
::
from_stream_and_context
(
stream
,
ctx
,
mode
,
Some
(
capacity
),
tx
);
let
boxed_stream
=
Box
::
pin
(
recording_stream
);
(
boxed_stream
,
rx
)
}
/// Create a recording stream from ResponseStream (convenience wrapper)
pub
fn
record_response_stream
<
R
:
Data
+
Clone
>
(
response_stream
:
Pin
<
Box
<
ResponseStream
<
R
>>>
,
mode
:
RecordingMode
,
)
->
RecordingResult
<
R
>
{
record_stream
(
response_stream
,
mode
)
}
#[cfg(test)]
mod
tests
{
use
super
::
*
;
use
dynamo_runtime
::
engine
::
ResponseStream
;
use
futures
::
stream
;
use
std
::
time
::
Duration
;
#[test]
fn
test_timestamped_response_creation
()
{
let
response
=
"test response"
;
let
timestamped
=
TimestampedResponse
::
new
(
response
,
0
);
assert_eq!
(
timestamped
.response
,
response
);
assert_eq!
(
timestamped
.sequence_number
,
0
);
assert_eq!
(
timestamped
.data
(),
&
response
);
}
#[test]
fn
test_recorded_stream_analysis
()
{
let
start_time
=
Instant
::
now
();
// Create mock responses with known timing
let
responses
=
vec!
[
TimestampedResponse
{
response
:
"response1"
,
timestamp
:
start_time
,
sequence_number
:
0
,
},
TimestampedResponse
{
response
:
"response2"
,
timestamp
:
start_time
+
Duration
::
from_millis
(
100
),
sequence_number
:
1
,
},
TimestampedResponse
{
response
:
"response3"
,
timestamp
:
start_time
+
Duration
::
from_millis
(
250
),
sequence_number
:
2
,
},
];
let
end_time
=
start_time
+
Duration
::
from_millis
(
250
);
let
recorded
=
RecordedStream
::
new
(
responses
,
start_time
,
end_time
);
assert_eq!
(
recorded
.response_count
(),
3
);
assert_eq!
(
recorded
.total_duration
(),
Duration
::
from_millis
(
250
));
}
#[test]
fn
test_performance_metrics_conversion
()
{
let
start_time
=
Instant
::
now
();
let
responses
=
vec!
[
TimestampedResponse
{
response
:
"test"
,
timestamp
:
start_time
+
Duration
::
from_millis
(
50
),
sequence_number
:
0
,
},
TimestampedResponse
{
response
:
"test"
,
timestamp
:
start_time
+
Duration
::
from_millis
(
150
),
sequence_number
:
1
,
},
];
let
end_time
=
start_time
+
Duration
::
from_millis
(
150
);
let
recorded
=
RecordedStream
::
new
(
responses
,
start_time
,
end_time
);
assert_eq!
(
recorded
.response_count
(),
2
);
assert_eq!
(
recorded
.total_duration
(),
Duration
::
from_millis
(
150
));
}
#[tokio::test]
async
fn
test_recording_stream_scan_mode
()
{
use
futures
::
StreamExt
;
// Create a simple test stream
let
test_data
=
vec!
[
"token1"
,
"token2"
,
"token3"
];
let
base_stream
=
stream
::
iter
(
test_data
.clone
());
// Create a mock context for the stream
let
ctx
=
Arc
::
new
(
MockContext
::
new
());
// Record the stream in scan mode using the simplified API
let
(
recorded_stream
,
recording_rx
)
=
record_stream_with_context
(
Box
::
pin
(
base_stream
),
ctx
,
RecordingMode
::
Scan
);
// Consume the stream normally (pass-through mode)
let
collected_responses
:
Vec
<
_
>
=
recorded_stream
.collect
()
.await
;
// Verify the responses passed through unchanged
assert_eq!
(
collected_responses
,
test_data
);
// Get the recorded data
let
recorded
=
recording_rx
.await
.unwrap
();
assert_eq!
(
recorded
.response_count
(),
3
);
assert_eq!
(
recorded
.responses
[
0
]
.response
,
"token1"
);
assert_eq!
(
recorded
.responses
[
1
]
.response
,
"token2"
);
assert_eq!
(
recorded
.responses
[
2
]
.response
,
"token3"
);
// Verify timing was recorded
assert
!
(
recorded
.total_duration
()
>
Duration
::
from_nanos
(
0
));
}
#[tokio::test]
async
fn
test_recording_stream_sink_mode
()
{
use
futures
::
StreamExt
;
// Create a simple test stream
let
test_data
=
vec!
[
"token1"
,
"token2"
,
"token3"
];
let
base_stream
=
stream
::
iter
(
test_data
.clone
());
// Create a mock context for the stream
let
ctx
=
Arc
::
new
(
MockContext
::
new
());
// Record the stream in sink mode using the simplified API
let
(
recorded_stream
,
recording_rx
)
=
record_stream_with_context
(
Box
::
pin
(
base_stream
),
ctx
,
RecordingMode
::
Sink
);
// In sink mode, the stream should complete without emitting items
let
collected_responses
:
Vec
<
_
>
=
recorded_stream
.collect
()
.await
;
assert_eq!
(
collected_responses
,
Vec
::
<&
str
>
::
new
());
// Get the recorded data - should contain all original items
let
recorded
=
recording_rx
.await
.unwrap
();
assert_eq!
(
recorded
.response_count
(),
3
);
assert_eq!
(
recorded
.responses
[
0
]
.response
,
"token1"
);
assert_eq!
(
recorded
.responses
[
1
]
.response
,
"token2"
);
assert_eq!
(
recorded
.responses
[
2
]
.response
,
"token3"
);
// Verify timing was recorded
assert
!
(
recorded
.total_duration
()
>
Duration
::
from_nanos
(
0
));
}
#[tokio::test]
async
fn
test_recording_stream_from_response_stream
()
{
use
futures
::
StreamExt
;
// Create a simple test stream
let
test_data
=
vec!
[
"token1"
,
"token2"
,
"token3"
];
let
base_stream
=
stream
::
iter
(
test_data
.clone
());
// Create a ResponseStream (the traditional way)
let
ctx
=
Arc
::
new
(
MockContext
::
new
());
let
response_stream
=
ResponseStream
::
new
(
Box
::
pin
(
base_stream
),
ctx
);
// Use the convenience API for ResponseStream
let
(
recorded_stream
,
recording_rx
)
=
record_response_stream
(
response_stream
,
RecordingMode
::
Scan
);
// Consume the stream normally (pass-through mode)
let
collected_responses
:
Vec
<
_
>
=
recorded_stream
.collect
()
.await
;
// Verify the responses passed through unchanged
assert_eq!
(
collected_responses
,
test_data
);
// Get the recorded data
let
recorded
=
recording_rx
.await
.unwrap
();
assert_eq!
(
recorded
.response_count
(),
3
);
assert_eq!
(
recorded
.responses
[
0
]
.response
,
"token1"
);
assert_eq!
(
recorded
.responses
[
1
]
.response
,
"token2"
);
assert_eq!
(
recorded
.responses
[
2
]
.response
,
"token3"
);
// Verify timing was recorded
assert
!
(
recorded
.total_duration
()
>
Duration
::
from_nanos
(
0
));
}
// Mock context for testing
#[derive(Debug)]
struct
MockContext
{
id
:
String
,
}
impl
MockContext
{
fn
new
()
->
Self
{
Self
{
id
:
"test-context"
.to_string
(),
}
}
}
#[async_trait::async_trait]
impl
AsyncEngineContext
for
MockContext
{
fn
id
(
&
self
)
->
&
str
{
&
self
.id
}
fn
stop
(
&
self
)
{
// No-op for testing
}
fn
stop_generating
(
&
self
)
{
// No-op for testing
}
fn
kill
(
&
self
)
{
// No-op for testing
}
fn
is_stopped
(
&
self
)
->
bool
{
false
}
fn
is_killed
(
&
self
)
->
bool
{
false
}
async
fn
stopped
(
&
self
)
{
// No-op for testing
}
async
fn
killed
(
&
self
)
{
// No-op for testing
}
}
}
lib/llm/tests/http-service.rs
View file @
a9e0891c
...
...
@@ -14,12 +14,18 @@
// limitations under the License.
use
anyhow
::
Error
;
use
async_openai
::
config
::
OpenAIConfig
;
use
async_stream
::
stream
;
use
dynamo_llm
::
http
::
service
::{
error
::
HttpError
,
metrics
::{
Endpoint
,
RequestType
,
Status
},
service_v2
::
HttpService
,
Metrics
,
use
dynamo_llm
::
http
::{
client
::{
GenericBYOTClient
,
HttpClientConfig
,
HttpRequestContext
,
NvCustomClient
,
PureOpenAIClient
,
},
service
::{
error
::
HttpError
,
metrics
::{
Endpoint
,
RequestType
,
Status
},
service_v2
::
HttpService
,
Metrics
,
},
};
use
dynamo_llm
::
protocols
::{
openai
::{
...
...
@@ -29,13 +35,16 @@ use dynamo_llm::protocols::{
Annotated
,
};
use
dynamo_runtime
::{
engine
::
AsyncEngineContext
,
pipeline
::{
async_trait
,
AsyncEngine
,
AsyncEngineContextProvider
,
ManyOut
,
ResponseStream
,
SingleIn
,
},
CancellationToken
,
};
use
futures
::
StreamExt
;
use
prometheus
::{
proto
::
MetricType
,
Registry
};
use
reqwest
::
StatusCode
;
use
rstest
::
*
;
use
std
::
sync
::
Arc
;
struct
CounterEngine
{}
...
...
@@ -470,3 +479,404 @@ async fn test_http_service() {
cancel_token
.cancel
();
task
.await
.unwrap
()
.unwrap
();
}
// === HTTP Client Tests ===
/// Wait for the HTTP service to be ready by checking its health endpoint
async
fn
wait_for_service_ready
(
port
:
u16
)
{
let
start
=
tokio
::
time
::
Instant
::
now
();
let
timeout
=
tokio
::
time
::
Duration
::
from_secs
(
5
);
loop
{
match
reqwest
::
get
(
&
format!
(
"http://localhost:{}/health"
,
port
))
.await
{
Ok
(
_
)
=>
break
,
Err
(
_
)
if
start
.elapsed
()
<
timeout
=>
{
tokio
::
time
::
sleep
(
tokio
::
time
::
Duration
::
from_millis
(
10
))
.await
;
}
Err
(
e
)
=>
panic!
(
"Service failed to start within timeout: {}"
,
e
),
}
}
}
#[fixture]
fn
service_with_engines
(
#[default(
8990
)]
port
:
u16
,
)
->
(
HttpService
,
Arc
<
CounterEngine
>
,
Arc
<
AlwaysFailEngine
>
)
{
let
service
=
HttpService
::
builder
()
.port
(
port
)
.build
()
.unwrap
();
let
manager
=
service
.model_manager
();
let
counter
=
Arc
::
new
(
CounterEngine
{});
let
failure
=
Arc
::
new
(
AlwaysFailEngine
{});
manager
.add_chat_completions_model
(
"foo"
,
counter
.clone
())
.unwrap
();
manager
.add_chat_completions_model
(
"bar"
,
failure
.clone
())
.unwrap
();
manager
.add_completions_model
(
"bar"
,
failure
.clone
())
.unwrap
();
(
service
,
counter
,
failure
)
}
#[fixture]
fn
pure_openai_client
(
#[default(
8990
)]
port
:
u16
)
->
PureOpenAIClient
{
let
config
=
HttpClientConfig
{
openai_config
:
OpenAIConfig
::
new
()
.with_api_base
(
format!
(
"http://localhost:{}/v1"
,
port
)),
verbose
:
false
,
};
PureOpenAIClient
::
new
(
config
)
}
#[fixture]
fn
nv_custom_client
(
#[default(
8991
)]
port
:
u16
)
->
NvCustomClient
{
let
config
=
HttpClientConfig
{
openai_config
:
OpenAIConfig
::
new
()
.with_api_base
(
format!
(
"http://localhost:{}/v1"
,
port
)),
verbose
:
false
,
};
NvCustomClient
::
new
(
config
)
}
#[fixture]
fn
generic_byot_client
(
#[default(
8992
)]
port
:
u16
)
->
GenericBYOTClient
{
let
config
=
HttpClientConfig
{
openai_config
:
OpenAIConfig
::
new
()
.with_api_base
(
format!
(
"http://localhost:{}/v1"
,
port
)),
verbose
:
false
,
};
GenericBYOTClient
::
new
(
config
)
}
#[rstest]
#[tokio::test]
async
fn
test_pure_openai_client
(
#[with(
8990
)]
service_with_engines
:
(
HttpService
,
Arc
<
CounterEngine
>
,
Arc
<
AlwaysFailEngine
>
),
#[with(
8990
)]
pure_openai_client
:
PureOpenAIClient
,
)
{
let
(
service
,
_
counter
,
_
failure
)
=
service_with_engines
;
let
token
=
CancellationToken
::
new
();
let
cancel_token
=
token
.clone
();
// Start the service
let
task
=
tokio
::
spawn
(
async
move
{
service
.run
(
token
)
.await
});
// Wait for service to be ready
wait_for_service_ready
(
8990
)
.await
;
// Test successful streaming request
let
request
=
async_openai
::
types
::
CreateChatCompletionRequestArgs
::
default
()
.model
(
"foo"
)
.messages
(
vec!
[
async_openai
::
types
::
ChatCompletionRequestMessage
::
User
(
async_openai
::
types
::
ChatCompletionRequestUserMessage
{
content
:
async_openai
::
types
::
ChatCompletionRequestUserMessageContent
::
Text
(
"Hi"
.to_string
(),
),
name
:
None
,
},
),
])
.stream
(
true
)
.max_tokens
(
50u32
)
.build
()
.unwrap
();
let
result
=
pure_openai_client
.chat_stream
(
request
)
.await
;
assert
!
(
result
.is_ok
(),
"PureOpenAI client should succeed"
);
let
(
mut
stream
,
_
context
)
=
result
.unwrap
()
.dissolve
();
let
mut
count
=
0
;
while
let
Some
(
response
)
=
stream
.next
()
.await
{
count
+=
1
;
assert
!
(
response
.is_ok
(),
"Response should be ok"
);
if
count
>=
3
{
break
;
// Don't consume entire stream
}
}
assert
!
(
count
>
0
,
"Should receive at least one response"
);
// Test error case with invalid model
let
request
=
async_openai
::
types
::
CreateChatCompletionRequestArgs
::
default
()
.model
(
"bar"
)
// This model will fail
.messages
(
vec!
[
async_openai
::
types
::
ChatCompletionRequestMessage
::
User
(
async_openai
::
types
::
ChatCompletionRequestUserMessage
{
content
:
async_openai
::
types
::
ChatCompletionRequestUserMessageContent
::
Text
(
"Hi"
.to_string
(),
),
name
:
None
,
},
),
])
.stream
(
true
)
.max_tokens
(
50u32
)
.build
()
.unwrap
();
let
result
=
pure_openai_client
.chat_stream
(
request
)
.await
;
assert
!
(
result
.is_ok
(),
"Client should return stream even for failing model"
);
let
(
mut
stream
,
_
context
)
=
result
.unwrap
()
.dissolve
();
if
let
Some
(
response
)
=
stream
.next
()
.await
{
assert
!
(
response
.is_err
(),
"Response should be error for failing model"
);
}
// Test context management
let
ctx
=
HttpRequestContext
::
new
();
let
request
=
async_openai
::
types
::
CreateChatCompletionRequestArgs
::
default
()
.model
(
"foo"
)
.messages
(
vec!
[
async_openai
::
types
::
ChatCompletionRequestMessage
::
User
(
async_openai
::
types
::
ChatCompletionRequestUserMessage
{
content
:
async_openai
::
types
::
ChatCompletionRequestUserMessageContent
::
Text
(
"Hi"
.to_string
(),
),
name
:
None
,
},
),
])
.stream
(
true
)
.max_tokens
(
50u32
)
.build
()
.unwrap
();
let
result
=
pure_openai_client
.chat_stream_with_context
(
request
,
ctx
.clone
())
.await
;
assert
!
(
result
.is_ok
(),
"Context-based request should succeed"
);
let
(
_
stream
,
context
)
=
result
.unwrap
()
.dissolve
();
assert_eq!
(
context
.id
(),
ctx
.id
(),
"Context ID should match"
);
cancel_token
.cancel
();
task
.await
.unwrap
()
.unwrap
();
}
#[rstest]
#[tokio::test]
async
fn
test_nv_custom_client
(
#[with(
8991
)]
service_with_engines
:
(
HttpService
,
Arc
<
CounterEngine
>
,
Arc
<
AlwaysFailEngine
>
),
#[with(
8991
)]
nv_custom_client
:
NvCustomClient
,
)
{
let
(
service
,
_
counter
,
_
failure
)
=
service_with_engines
;
let
token
=
CancellationToken
::
new
();
let
cancel_token
=
token
.clone
();
// Start the service
let
task
=
tokio
::
spawn
(
async
move
{
service
.run
(
token
)
.await
});
// Wait for service to be ready
wait_for_service_ready
(
8991
)
.await
;
// Test successful streaming request
let
inner_request
=
async_openai
::
types
::
CreateChatCompletionRequestArgs
::
default
()
.model
(
"foo"
)
.messages
(
vec!
[
async_openai
::
types
::
ChatCompletionRequestMessage
::
User
(
async_openai
::
types
::
ChatCompletionRequestUserMessage
{
content
:
async_openai
::
types
::
ChatCompletionRequestUserMessageContent
::
Text
(
"Hi"
.to_string
(),
),
name
:
None
,
},
),
])
.stream
(
true
)
.max_tokens
(
50u32
)
.build
()
.unwrap
();
let
request
=
NvCreateChatCompletionRequest
{
inner
:
inner_request
,
nvext
:
None
,
};
let
result
=
nv_custom_client
.chat_stream
(
request
)
.await
;
assert
!
(
result
.is_ok
(),
"NvCustom client should succeed"
);
let
(
mut
stream
,
_
context
)
=
result
.unwrap
()
.dissolve
();
let
mut
count
=
0
;
while
let
Some
(
response
)
=
stream
.next
()
.await
{
count
+=
1
;
assert
!
(
response
.is_ok
(),
"Response should be ok"
);
if
count
>=
3
{
break
;
// Don't consume entire stream
}
}
assert
!
(
count
>
0
,
"Should receive at least one response"
);
// Test error case with invalid model
let
inner_request
=
async_openai
::
types
::
CreateChatCompletionRequestArgs
::
default
()
.model
(
"bar"
)
// This model will fail
.messages
(
vec!
[
async_openai
::
types
::
ChatCompletionRequestMessage
::
User
(
async_openai
::
types
::
ChatCompletionRequestUserMessage
{
content
:
async_openai
::
types
::
ChatCompletionRequestUserMessageContent
::
Text
(
"Hi"
.to_string
(),
),
name
:
None
,
},
),
])
.stream
(
true
)
.max_tokens
(
50u32
)
.build
()
.unwrap
();
let
request
=
NvCreateChatCompletionRequest
{
inner
:
inner_request
,
nvext
:
None
,
};
let
result
=
nv_custom_client
.chat_stream
(
request
)
.await
;
assert
!
(
result
.is_ok
(),
"Client should return stream even for failing model"
);
let
(
mut
stream
,
_
context
)
=
result
.unwrap
()
.dissolve
();
if
let
Some
(
response
)
=
stream
.next
()
.await
{
assert
!
(
response
.is_err
(),
"Response should be error for failing model"
);
}
// Test context management
let
ctx
=
HttpRequestContext
::
new
();
let
inner_request
=
async_openai
::
types
::
CreateChatCompletionRequestArgs
::
default
()
.model
(
"foo"
)
.messages
(
vec!
[
async_openai
::
types
::
ChatCompletionRequestMessage
::
User
(
async_openai
::
types
::
ChatCompletionRequestUserMessage
{
content
:
async_openai
::
types
::
ChatCompletionRequestUserMessageContent
::
Text
(
"Hi"
.to_string
(),
),
name
:
None
,
},
),
])
.stream
(
true
)
.max_tokens
(
50u32
)
.build
()
.unwrap
();
let
request
=
NvCreateChatCompletionRequest
{
inner
:
inner_request
,
nvext
:
None
,
};
let
result
=
nv_custom_client
.chat_stream_with_context
(
request
,
ctx
.clone
())
.await
;
assert
!
(
result
.is_ok
(),
"Context-based request should succeed"
);
let
(
_
stream
,
context
)
=
result
.unwrap
()
.dissolve
();
assert_eq!
(
context
.id
(),
ctx
.id
(),
"Context ID should match"
);
cancel_token
.cancel
();
task
.await
.unwrap
()
.unwrap
();
}
#[rstest]
#[tokio::test]
async
fn
test_generic_byot_client
(
#[with(
8992
)]
service_with_engines
:
(
HttpService
,
Arc
<
CounterEngine
>
,
Arc
<
AlwaysFailEngine
>
),
#[with(
8992
)]
generic_byot_client
:
GenericBYOTClient
,
)
{
let
(
service
,
_
counter
,
_
failure
)
=
service_with_engines
;
let
token
=
CancellationToken
::
new
();
let
cancel_token
=
token
.clone
();
// Start the service
let
task
=
tokio
::
spawn
(
async
move
{
service
.run
(
token
)
.await
});
// Wait for service to be ready
wait_for_service_ready
(
8992
)
.await
;
// Test successful streaming request
let
request
=
serde_json
::
json!
({
"model"
:
"foo"
,
"messages"
:
[
{
"role"
:
"user"
,
"content"
:
"Hi"
}
],
"stream"
:
true
,
"max_tokens"
:
50
});
let
result
=
generic_byot_client
.chat_stream
(
request
)
.await
;
assert
!
(
result
.is_ok
(),
"GenericBYOT client should succeed"
);
let
(
mut
stream
,
_
context
)
=
result
.unwrap
()
.dissolve
();
let
mut
count
=
0
;
while
let
Some
(
response
)
=
stream
.next
()
.await
{
println!
(
"Response: {:?}"
,
response
);
count
+=
1
;
assert
!
(
response
.is_ok
(),
"Response should be ok"
);
if
count
>=
3
{
break
;
// Don't consume entire stream
}
}
assert
!
(
count
>
0
,
"Should receive at least one response"
);
// Test error case with invalid model
let
request
=
serde_json
::
json!
({
"model"
:
"bar"
,
// This model will fail
"messages"
:
[
{
"role"
:
"user"
,
"content"
:
"Hi"
}
],
"stream"
:
true
,
"max_tokens"
:
50
});
let
result
=
generic_byot_client
.chat_stream
(
request
)
.await
;
assert
!
(
result
.is_ok
(),
"Client should return stream even for failing model"
);
let
(
mut
stream
,
_
context
)
=
result
.unwrap
()
.dissolve
();
if
let
Some
(
response
)
=
stream
.next
()
.await
{
assert
!
(
response
.is_err
(),
"Response should be error for failing model"
);
}
// Test context management
let
ctx
=
HttpRequestContext
::
new
();
let
request
=
serde_json
::
json!
({
"model"
:
"foo"
,
"messages"
:
[
{
"role"
:
"user"
,
"content"
:
"Hi"
}
],
"stream"
:
true
,
"max_tokens"
:
50
});
let
result
=
generic_byot_client
.chat_stream_with_context
(
request
,
ctx
.clone
())
.await
;
assert
!
(
result
.is_ok
(),
"Context-based request should succeed"
);
let
(
_
stream
,
context
)
=
result
.unwrap
()
.dissolve
();
assert_eq!
(
context
.id
(),
ctx
.id
(),
"Context ID should match"
);
cancel_token
.cancel
();
task
.await
.unwrap
()
.unwrap
();
}
lib/runtime/src/engine.rs
View file @
a9e0891c
...
...
@@ -165,7 +165,7 @@ pub trait AsyncEngineContext: Send + Sync + Debug {
///
/// This trait is implemented by both unary and streaming engine results, allowing
/// uniform access to context information regardless of the operation type.
pub
trait
AsyncEngineContextProvider
:
Send
+
Sync
+
Debug
{
pub
trait
AsyncEngineContextProvider
:
Send
+
Debug
{
fn
context
(
&
self
)
->
Arc
<
dyn
AsyncEngineContext
>
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment