feat: adding http clients and recorded response stream (#1919)

a9e0891c · Ryan Olson · GitHub · 4128d583 · a9e0891c · a9e0891c
Unverified Commit a9e0891c authored Jul 15, 2025 by Ryan Olson Committed by GitHub Jul 15, 2025
7 changed files
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -45,7 +45,7 @@ dynamo-tokens = { path = "lib/tokens", version = "0.3.2" }
 # External dependencies
 anyhow = { version = "1" }
 async-nats = { version = "0.40", features = ["service"] }
-async-openai = { version = "0.29.0" }
+async-openai = { version = "0.29.0", features = ["rustls", "byot"] }
 async-stream = { version = "0.3" }
 async-trait = { version = "0.1" }
 async_zmq = { version = "0.4.0" }

--- a/lib/llm/src/http.rs
+++ b/lib/llm/src/http.rs
@@ -13,4 +13,5 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+pub mod client;
 pub mod service;
--- a/lib/llm/src/http/client.rs
+++ b/lib/llm/src/http/client.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! HTTP clients for streaming LLM responses with performance recording
+//!
+//! This module provides HTTP clients that leverage async-openai with BYOT (Bring Your Own Types)
+//! feature to work with OpenAI-compatible APIs. The clients support recording streaming responses
+//! for performance analysis.
+
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{Context, Poll};
+use std::time::Instant;
+
+use async_openai::{config::OpenAIConfig, error::OpenAIError, Client};
+use async_trait::async_trait;
+use derive_getters::Dissolve;
+use futures::Stream;
+use serde_json::Value;
+use tokio_util::sync::CancellationToken;
+use tracing;
+use uuid::Uuid;
+
+// Import our existing recording infrastructure
+use crate::protocols::openai::chat_completions::{
+    NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
+};
+use crate::protocols::Annotated;
+use dynamo_runtime::engine::{
+    AsyncEngineContext, AsyncEngineContextProvider, AsyncEngineStream, Data,
+};
+
+/// Configuration shared by the HTTP clients in this module.
+///
+/// The derived `Default` leaves `verbose` off and uses `OpenAIConfig::default()`.
+#[derive(Clone, Default)]
+pub struct HttpClientConfig {
+    /// Configuration handed to the underlying async-openai `Client`.
+    pub openai_config: OpenAIConfig,
+    /// When true, clients emit an info-level trace line as each stream is started.
+    pub verbose: bool,
+}
+
+/// Errors returned by the HTTP clients in this module.
+#[derive(Debug, thiserror::Error)]
+pub enum HttpClientError {
+    /// Failure reported by async-openai; `#[from]` lets `?` convert it automatically.
+    #[error("OpenAI API error: {0}")]
+    OpenAI(#[from] OpenAIError),
+    /// NOTE(review): not constructed anywhere in the visible code.
+    #[error("Request timeout")]
+    Timeout,
+    /// NOTE(review): not constructed anywhere in the visible code.
+    #[error("Request cancelled")]
+    Cancelled,
+    /// The request was rejected before being sent (e.g. `stream` was not set to true).
+    #[error("Invalid request: {0}")]
+    InvalidRequest(String),
+}
+
+/// Context for HTTP client requests that supports cancellation.
+/// This bridges AsyncEngineContext and reqwest cancellation.
+///
+/// Cloning shares the stop flag (it lives behind an `Arc`) and the cancellation token.
+#[derive(Clone)]
+pub struct HttpRequestContext {
+    /// Unique request identifier (a v4 UUID unless the caller supplies one)
+    id: String,
+    /// Tokio cancellation token for reqwest integration; child contexts hold a child token
+    cancel_token: CancellationToken,
+    /// When this context was created; the basis for `elapsed()`
+    created_at: Instant,
+    /// Whether the request has been stopped; set by `stop()` and `kill()`
+    stopped: Arc<std::sync::atomic::AtomicBool>,
+}
+
+impl HttpRequestContext {
+    /// Assemble a context around an identifier and a cancellation token.
+    ///
+    /// The creation instant is captured here and the stop flag starts cleared.
+    fn from_parts(id: String, cancel_token: CancellationToken) -> Self {
+        Self {
+            id,
+            cancel_token,
+            created_at: Instant::now(),
+            stopped: Arc::new(std::sync::atomic::AtomicBool::new(false)),
+        }
+    }
+
+    /// Build a root context identified by a freshly generated v4 UUID.
+    pub fn new() -> Self {
+        Self::from_parts(Uuid::new_v4().to_string(), CancellationToken::new())
+    }
+
+    /// Build a root context that uses the caller-supplied identifier.
+    pub fn with_id(id: String) -> Self {
+        Self::from_parts(id, CancellationToken::new())
+    }
+
+    /// Derive a child context with a freshly generated v4 UUID.
+    ///
+    /// The child holds a child token, so cancelling this context cancels it too.
+    /// The child gets its own stop flag and its own creation instant.
+    pub fn child(&self) -> Self {
+        Self::from_parts(Uuid::new_v4().to_string(), self.cancel_token.child_token())
+    }
+
+    /// Derive a child context that uses the caller-supplied identifier.
+    pub fn child_with_id(&self, id: String) -> Self {
+        Self::from_parts(id, self.cancel_token.child_token())
+    }
+
+    /// Return a clone of this context's cancellation token for use with reqwest.
+    pub fn cancellation_token(&self) -> CancellationToken {
+        self.cancel_token.clone()
+    }
+
+    /// Time that has passed since this context was created.
+    pub fn elapsed(&self) -> std::time::Duration {
+        self.created_at.elapsed()
+    }
+}
+
+impl Default for HttpRequestContext {
+    /// Equivalent to [`HttpRequestContext::new`]: a root context with a random ID.
+    fn default() -> Self {
+        HttpRequestContext::new()
+    }
+}
+
+impl std::fmt::Debug for HttpRequestContext {
+    /// Render the ID, creation instant, and the three status flags.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut out = f.debug_struct("HttpRequestContext");
+        out.field("id", &self.id);
+        out.field("created_at", &self.created_at);
+        out.field("is_stopped", &self.is_stopped());
+        out.field("is_killed", &self.is_killed());
+        out.field("is_cancelled", &self.cancel_token.is_cancelled());
+        out.finish()
+    }
+}
+
+#[async_trait]
+impl AsyncEngineContext for HttpRequestContext {
+    /// The request identifier.
+    fn id(&self) -> &str {
+        &self.id
+    }
+
+    /// Set the stop flag and cancel the token (which also cancels child tokens).
+    fn stop(&self) {
+        self.stopped
+            .store(true, std::sync::atomic::Ordering::Release);
+        self.cancel_token.cancel();
+    }
+
+    fn stop_generating(&self) {
+        // For HTTP clients, stop_generating is the same as stop
+        self.stop();
+    }
+
+    fn kill(&self) {
+        // For HTTP clients, kill is the same as stop
+        self.stop();
+    }
+
+    /// True once this context was stopped or its token was cancelled.
+    ///
+    /// A child context has its own stop flag, so the flag alone would still read
+    /// `false` after the parent was stopped even though `stopped().await` already
+    /// resolves. Consulting the token keeps the synchronous and asynchronous views
+    /// consistent.
+    fn is_stopped(&self) -> bool {
+        self.stopped.load(std::sync::atomic::Ordering::Acquire) || self.cancel_token.is_cancelled()
+    }
+
+    fn is_killed(&self) -> bool {
+        // For HTTP clients, killed is the same as stopped
+        self.is_stopped()
+    }
+
+    /// Resolve once the token is cancelled (directly or through a parent).
+    async fn stopped(&self) {
+        self.cancel_token.cancelled().await;
+    }
+
+    async fn killed(&self) {
+        // For HTTP clients, killed is the same as stopped
+        self.cancel_token.cancelled().await;
+    }
+}
+
+/// Base HTTP client with common functionality.
+///
+/// The three concrete clients in this module each wrap one of these.
+pub struct BaseHttpClient {
+    /// async-openai client, built from `config.openai_config`
+    client: Client<OpenAIConfig>,
+    /// Client configuration
+    config: HttpClientConfig,
+    /// Root context for this client; per-request contexts are created as its children
+    root_context: HttpRequestContext,
+}
+
+impl BaseHttpClient {
+    /// Build the async-openai client from `config` and attach a fresh root context.
+    pub fn new(config: HttpClientConfig) -> Self {
+        Self {
+            client: Client::with_config(config.openai_config.clone()),
+            root_context: HttpRequestContext::new(),
+            config,
+        }
+    }
+
+    /// Borrow the wrapped async-openai client.
+    pub fn client(&self) -> &Client<OpenAIConfig> {
+        &self.client
+    }
+
+    /// New per-request context (random ID) parented to the root context.
+    pub fn create_context(&self) -> HttpRequestContext {
+        let root = &self.root_context;
+        root.child()
+    }
+
+    /// New per-request context with the given ID, parented to the root context.
+    pub fn create_context_with_id(&self, id: String) -> HttpRequestContext {
+        let root = &self.root_context;
+        root.child_with_id(id)
+    }
+
+    /// Borrow the root context; stopping it cancels every context created from it.
+    pub fn root_context(&self) -> &HttpRequestContext {
+        &self.root_context
+    }
+
+    /// Whether the configuration asked for verbose logging.
+    pub fn is_verbose(&self) -> bool {
+        let HttpClientConfig { verbose, .. } = &self.config;
+        *verbose
+    }
+}
+
+/// Type alias for NV chat response stream: a boxed, pinned `Send + Sync` stream of
+/// `Annotated<NvCreateChatCompletionStreamResponse>` results.
+pub type NvChatResponseStream = Pin<
+    Box<
+        dyn Stream<Item = Result<Annotated<NvCreateChatCompletionStreamResponse>, OpenAIError>>
+            + Send
+            + Sync,
+    >,
+>;
+
+/// Type alias for generic BYOT response stream: items are raw `serde_json::Value` results.
+pub type ByotResponseStream = Pin<Box<dyn Stream<Item = Result<Value, OpenAIError>> + Send + Sync>>;
+
+/// Type alias for pure OpenAI chat response stream: items are async-openai's own
+/// `CreateChatCompletionStreamResponse` results.
+pub type OpenAIChatResponseStream = Pin<
+    Box<
+        dyn Stream<
+                Item = Result<async_openai::types::CreateChatCompletionStreamResponse, OpenAIError>,
+            > + Send
+            + Sync,
+    >,
+>;
+
+/// A wrapped HTTP response stream that combines a stream with its context.
+/// This provides a unified interface for HTTP client responses.
+///
+/// NOTE(review): `Dissolve` comes from `derive_getters`; presumably it generates a
+/// consuming `dissolve()` that returns the fields as a tuple — confirm against the crate.
+#[derive(Dissolve)]
+pub struct HttpResponseStream<T> {
+    /// The underlying stream of responses (only required to be `Send`)
+    pub stream: Pin<Box<dyn Stream<Item = T> + Send>>,
+    /// The context for this request
+    pub context: Arc<dyn AsyncEngineContext>,
+}
+
+impl<T> HttpResponseStream<T> {
+    /// Create a new HttpResponseStream by pairing `stream` with the request `context`.
+    /// Neither argument is inspected or modified here.
+    pub fn new(
+        stream: Pin<Box<dyn Stream<Item = T> + Send>>,
+        context: Arc<dyn AsyncEngineContext>,
+    ) -> Self {
+        Self { stream, context }
+    }
+}
+
+impl<T: Data> Stream for HttpResponseStream<T> {
+    type Item = T;
+
+    /// Forward the poll straight to the wrapped stream.
+    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let inner = self.stream.as_mut();
+        inner.poll_next(cx)
+    }
+}
+
+impl<T: Data> AsyncEngineContextProvider for HttpResponseStream<T> {
+    /// Hand out another shared handle to this request's context.
+    fn context(&self) -> Arc<dyn AsyncEngineContext> {
+        Arc::clone(&self.context)
+    }
+}
+
+impl<T: Data> HttpResponseStream<T> {
+    /// Convert this HttpResponseStream to a Pin<Box<dyn AsyncEngineStream<T>>>.
+    ///
+    /// The underlying stream is only required to be `Send`. The wrapper stores it inside
+    /// a `std::sync::Mutex`, which is `Sync` whenever its contents are `Send`, so the
+    /// result is `Send + Sync` without any `unsafe` assertion.
+    pub fn into_async_engine_stream(self) -> Pin<Box<dyn AsyncEngineStream<T>>>
+    where
+        T: 'static,
+    {
+        Box::pin(AsyncEngineStreamWrapper {
+            stream: std::sync::Mutex::new(self.stream),
+            context: self.context,
+        })
+    }
+}
+
+/// A wrapper that implements AsyncEngineStream for streams that are only `Send`
+struct AsyncEngineStreamWrapper<T> {
+    /// The wrapped stream. The mutex exists purely to make the wrapper `Sync`; it is
+    /// never locked, because polling already has exclusive access and uses `get_mut`.
+    stream: std::sync::Mutex<Pin<Box<dyn Stream<Item = T> + Send>>>,
+    /// The context for this request
+    context: Arc<dyn AsyncEngineContext>,
+}
+
+impl<T: Data> Stream for AsyncEngineStreamWrapper<T> {
+    type Item = T;
+
+    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        // `get_mut` needs no locking. The mutex is never locked, so it cannot be
+        // poisoned in practice; recover the inner value anyway rather than panic.
+        let stream = self
+            .stream
+            .get_mut()
+            .unwrap_or_else(std::sync::PoisonError::into_inner);
+        stream.as_mut().poll_next(cx)
+    }
+}
+
+impl<T: Data> AsyncEngineContextProvider for AsyncEngineStreamWrapper<T> {
+    fn context(&self) -> Arc<dyn AsyncEngineContext> {
+        self.context.clone()
+    }
+}
+
+impl<T: Data> AsyncEngineStream<T> for AsyncEngineStreamWrapper<T> {}
+
+impl<T> std::fmt::Debug for AsyncEngineStreamWrapper<T> {
+    /// Only the context is printed; the wrapped stream is omitted.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut out = f.debug_struct("AsyncEngineStreamWrapper");
+        out.field("context", &self.context);
+        out.finish()
+    }
+}
+
+impl<T: Data> std::fmt::Debug for HttpResponseStream<T> {
+    /// Only the context is printed; the wrapped stream is omitted.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut out = f.debug_struct("HttpResponseStream");
+        out.field("context", &self.context);
+        out.finish()
+    }
+}
+
+/// Type alias for HttpResponseStream with NV chat completion responses
+/// (returned by `NvCustomClient`).
+pub type NvHttpResponseStream =
+    HttpResponseStream<Result<Annotated<NvCreateChatCompletionStreamResponse>, OpenAIError>>;
+
+/// Type alias for HttpResponseStream with BYOT responses
+/// (returned by `GenericBYOTClient`).
+pub type ByotHttpResponseStream = HttpResponseStream<Result<Value, OpenAIError>>;
+
+/// Type alias for HttpResponseStream with pure OpenAI responses
+/// (returned by `PureOpenAIClient`).
+pub type OpenAIHttpResponseStream = HttpResponseStream<
+    Result<async_openai::types::CreateChatCompletionStreamResponse, OpenAIError>,
+>;
+
+/// Client that speaks the stock async-openai request and response types.
+pub struct PureOpenAIClient {
+    base: BaseHttpClient,
+}
+
+impl PureOpenAIClient {
+    /// Build a client from the given configuration.
+    pub fn new(config: HttpClientConfig) -> Self {
+        let base = BaseHttpClient::new(config);
+        Self { base }
+    }
+
+    /// Start a streaming chat completion under a context created by this client.
+    ///
+    /// See [`Self::chat_stream_with_context`] for the error conditions.
+    pub async fn chat_stream(
+        &self,
+        request: async_openai::types::CreateChatCompletionRequest,
+    ) -> Result<OpenAIHttpResponseStream, HttpClientError> {
+        let context = self.base.create_context();
+        self.chat_stream_with_context(request, context).await
+    }
+
+    /// Start a streaming chat completion under a caller-supplied context.
+    ///
+    /// # Errors
+    /// * `HttpClientError::InvalidRequest` when `request.stream` is not `Some(true)`.
+    /// * `HttpClientError::OpenAI` when async-openai fails to create the stream.
+    pub async fn chat_stream_with_context(
+        &self,
+        request: async_openai::types::CreateChatCompletionRequest,
+        context: HttpRequestContext,
+    ) -> Result<OpenAIHttpResponseStream, HttpClientError> {
+        let wants_stream = request.stream.unwrap_or(false);
+        if !wants_stream {
+            let reason = "chat_stream requires the request to have 'stream': true".to_string();
+            return Err(HttpClientError::InvalidRequest(reason));
+        }
+
+        if self.base.is_verbose() {
+            tracing::info!(
+                "Starting pure OpenAI chat stream for request {}",
+                context.id()
+            );
+        }
+
+        let chat = self.base.client().chat();
+        let stream = chat
+            .create_stream(request)
+            .await
+            .map_err(HttpClientError::OpenAI)?;
+
+        // TODO: In Phase 3, we'll add cancellation integration with reqwest
+        // Until then the stream is returned untouched, paired with its context.
+        let shared_context: Arc<dyn AsyncEngineContext> = Arc::new(context);
+        Ok(HttpResponseStream::new(stream, shared_context))
+    }
+}
+
+/// Client that sends `NvCreateChatCompletionRequest` and yields `Annotated` responses.
+pub struct NvCustomClient {
+    base: BaseHttpClient,
+}
+
+impl NvCustomClient {
+    /// Build a client from the given configuration.
+    pub fn new(config: HttpClientConfig) -> Self {
+        let base = BaseHttpClient::new(config);
+        Self { base }
+    }
+
+    /// Start a streaming chat completion under a context created by this client.
+    ///
+    /// See [`Self::chat_stream_with_context`] for the error conditions.
+    pub async fn chat_stream(
+        &self,
+        request: NvCreateChatCompletionRequest,
+    ) -> Result<NvHttpResponseStream, HttpClientError> {
+        let context = self.base.create_context();
+        self.chat_stream_with_context(request, context).await
+    }
+
+    /// Start a streaming chat completion under a caller-supplied context.
+    ///
+    /// # Errors
+    /// * `HttpClientError::InvalidRequest` when `request.inner.stream` is not `Some(true)`.
+    /// * `HttpClientError::OpenAI` when async-openai fails to create the stream.
+    pub async fn chat_stream_with_context(
+        &self,
+        request: NvCreateChatCompletionRequest,
+        context: HttpRequestContext,
+    ) -> Result<NvHttpResponseStream, HttpClientError> {
+        let wants_stream = request.inner.stream.unwrap_or(false);
+        if !wants_stream {
+            let reason = "chat_stream requires the request to have 'stream': true".to_string();
+            return Err(HttpClientError::InvalidRequest(reason));
+        }
+
+        if self.base.is_verbose() {
+            tracing::info!(
+                "Starting NV custom chat stream for request {}",
+                context.id()
+            );
+        }
+
+        // BYOT call: the item type is not named here; it is fixed by the
+        // `NvHttpResponseStream` return type of this function.
+        let chat = self.base.client().chat();
+        let stream = chat
+            .create_stream_byot(request)
+            .await
+            .map_err(HttpClientError::OpenAI)?;
+
+        let shared_context: Arc<dyn AsyncEngineContext> = Arc::new(context);
+        Ok(HttpResponseStream::new(stream, shared_context))
+    }
+}
+
+/// Generic BYOT client using serde_json::Value for maximum flexibility
+pub struct GenericBYOTClient {
+    base: BaseHttpClient,
+}
+
+impl GenericBYOTClient {
+    /// Create a new generic BYOT client
+    pub fn new(config: HttpClientConfig) -> Self {
+        Self {
+            base: BaseHttpClient::new(config),
+        }
+    }
+
+    /// Create streaming chat completions using arbitrary JSON values
+    /// Uses a client-managed context
+    ///
+    /// See [`Self::chat_stream_with_context`] for the error conditions.
+    pub async fn chat_stream(
+        &self,
+        request: Value,
+    ) -> Result<ByotHttpResponseStream, HttpClientError> {
+        let ctx = self.base.create_context();
+        self.chat_stream_with_context(request, ctx).await
+    }
+
+    /// Create streaming chat completions with a custom context
+    ///
+    /// The request is validated before anything is logged or sent, matching the other
+    /// clients in this module: a rejected request never produces a "Starting ..." line.
+    ///
+    /// # Errors
+    /// * `HttpClientError::InvalidRequest` when the JSON has no `stream` key, or when
+    ///   `stream` is anything other than the boolean `true`.
+    /// * `HttpClientError::OpenAI` when async-openai fails to create the stream.
+    pub async fn chat_stream_with_context(
+        &self,
+        request: Value,
+        context: HttpRequestContext,
+    ) -> Result<ByotHttpResponseStream, HttpClientError> {
+        // Validate that the request has stream: true
+        match request.get("stream") {
+            None => {
+                return Err(HttpClientError::InvalidRequest(
+                    "Request must include 'stream' field".to_string(),
+                ));
+            }
+            // Non-boolean values are treated like `false`.
+            Some(flag) if !flag.as_bool().unwrap_or(false) => {
+                return Err(HttpClientError::InvalidRequest(
+                    "Request must have 'stream': true for streaming".to_string(),
+                ));
+            }
+            Some(_) => {}
+        }
+
+        if self.base.is_verbose() {
+            tracing::info!(
+                "Starting generic BYOT chat stream for request {}",
+                context.id()
+            );
+        }
+
+        // Use BYOT feature with raw JSON
+        // The item type (serde_json::Value) is fixed by this function's return type
+        let stream = self
+            .base
+            .client()
+            .chat()
+            .create_stream_byot(request)
+            .await
+            .map_err(HttpClientError::OpenAI)?;
+
+        let ctx_arc: Arc<dyn AsyncEngineContext> = Arc::new(context);
+        Ok(HttpResponseStream::new(stream, ctx_arc))
+    }
+}
+
+// TODO: Implement recording integration in Phase 3:
+// - Recording wrapper functions
+// - Capacity hints from request parameters
+// - Integration with existing recording infrastructure
+
+// Unit tests for context creation/cancellation and client construction.
+// None of these tests send an HTTP request.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tokio::time::{sleep, Duration};
+
+    // A fresh context has a non-empty ID and is neither stopped nor killed.
+    #[tokio::test]
+    async fn test_http_request_context_creation() {
+        let ctx = HttpRequestContext::new();
+        assert!(!ctx.id().is_empty());
+        assert!(!ctx.is_stopped());
+        assert!(!ctx.is_killed());
+    }
+
+    // Stopping a parent cancels the token of a child created from it.
+    #[tokio::test]
+    async fn test_http_request_context_child() {
+        let parent = HttpRequestContext::new();
+        let child = parent.child();
+
+        // Child should have different ID
+        assert_ne!(parent.id(), child.id());
+
+        // Child should not be stopped initially
+        assert!(!child.is_stopped());
+
+        // When parent is stopped, child should be cancelled via token
+        parent.stop();
+        assert!(parent.is_stopped());
+        assert!(child.cancellation_token().is_cancelled());
+    }
+
+    // Same as above, but the child carries a caller-chosen ID.
+    #[tokio::test]
+    async fn test_http_request_context_child_with_id() {
+        let parent = HttpRequestContext::new();
+        let child_id = "test-child";
+        let child = parent.child_with_id(child_id.to_string());
+
+        assert_eq!(child.id(), child_id);
+        assert!(!child.is_stopped());
+
+        // Test hierarchical cancellation
+        parent.stop();
+        assert!(child.cancellation_token().is_cancelled());
+    }
+
+    // stop() is visible both through is_stopped() and through a token cloned earlier.
+    #[tokio::test]
+    async fn test_http_request_context_cancellation() {
+        let ctx = HttpRequestContext::new();
+        let cancel_token = ctx.cancellation_token();
+
+        // Test stop functionality
+        assert!(!ctx.is_stopped());
+        ctx.stop();
+        assert!(ctx.is_stopped());
+        assert!(cancel_token.is_cancelled());
+    }
+
+    // kill() marks the context as both killed and stopped.
+    #[tokio::test]
+    async fn test_http_request_context_kill() {
+        let ctx = HttpRequestContext::new();
+
+        // Test kill functionality
+        assert!(!ctx.is_killed());
+        ctx.kill();
+        assert!(ctx.is_killed());
+        assert!(ctx.is_stopped());
+    }
+
+    // A task awaiting stopped() on a clone completes once the original is stopped.
+    #[tokio::test]
+    async fn test_http_request_context_async_cancellation() {
+        let ctx = HttpRequestContext::new();
+
+        // Test async cancellation
+        let ctx_clone = ctx.clone();
+        let task = tokio::spawn(async move {
+            ctx_clone.stopped().await;
+        });
+
+        // Give a moment for the task to start waiting
+        sleep(Duration::from_millis(10)).await;
+
+        // Cancel the context
+        ctx.stop();
+
+        // The task should complete
+        task.await.unwrap();
+    }
+
+    // Default configuration yields a non-verbose client with a root context.
+    #[test]
+    fn test_base_http_client_creation() {
+        let config = HttpClientConfig::default();
+        let client = BaseHttpClient::new(config);
+        assert!(!client.is_verbose());
+
+        // Test that client has a root context
+        assert!(!client.root_context().id().is_empty());
+    }
+
+    // Contexts created by the client are distinct children of its root context.
+    #[test]
+    fn test_base_http_client_context_creation() {
+        let config = HttpClientConfig::default();
+        let client = BaseHttpClient::new(config);
+
+        // Test creating child contexts
+        let ctx1 = client.create_context();
+        let ctx2 = client.create_context();
+
+        // Should have different IDs
+        assert_ne!(ctx1.id(), ctx2.id());
+
+        // Should be children of root context
+        client.root_context().stop();
+        assert!(ctx1.cancellation_token().is_cancelled());
+        assert!(ctx2.cancellation_token().is_cancelled());
+    }
+
+    // A context created with an explicit ID keeps it and is still a child of the root.
+    #[test]
+    fn test_base_http_client_context_with_id() {
+        let config = HttpClientConfig::default();
+        let client = BaseHttpClient::new(config);
+
+        let custom_id = "custom-request-id";
+        let ctx = client.create_context_with_id(custom_id.to_string());
+
+        assert_eq!(ctx.id(), custom_id);
+
+        // Should still be child of root
+        client.root_context().stop();
+        assert!(ctx.cancellation_token().is_cancelled());
+    }
+
+    // The derived Default leaves verbose logging off.
+    #[test]
+    fn test_http_client_config_defaults() {
+        let config = HttpClientConfig::default();
+        assert!(!config.verbose);
+    }
+
+    // The three tests below only check that construction does not panic.
+    #[test]
+    fn test_pure_openai_client_creation() {
+        let config = HttpClientConfig::default();
+        let _client = PureOpenAIClient::new(config);
+        // If we get here, creation succeeded
+    }
+
+    #[test]
+    fn test_nv_custom_client_creation() {
+        let config = HttpClientConfig::default();
+        let _client = NvCustomClient::new(config);
+        // If we get here, creation succeeded
+    }
+
+    #[test]
+    fn test_generic_byot_client_creation() {
+        let config = HttpClientConfig::default();
+        let _client = GenericBYOTClient::new(config);
+        // If we get here, creation succeeded
+    }
+}
--- a/lib/llm/src/lib.rs
+++ b/lib/llm/src/lib.rs
@@ -25,6 +25,7 @@ pub mod local_model;
 pub mod mocker;
 pub mod model_card;
 pub mod model_type;
+pub mod perf;
 pub mod preprocessor;
 pub mod protocols;
 pub mod recorder;

--- a/lib/llm/src/perf.rs
+++ b/lib/llm/src/perf.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Performance recording and analysis for streaming LLM responses
+//!
+//! This module provides mechanisms to record streaming responses with minimal overhead
+//! during collection, then analyze the recorded data for performance insights.
+
+use futures::Stream;
+use std::pin::Pin;
+use std::task::{Context, Poll};
+use std::time::{Duration, Instant};
+use tokio::sync::oneshot;
+
+// Import the runtime types we need
+use dynamo_runtime::engine::{
+    AsyncEngineContext, AsyncEngineContextProvider, AsyncEngineStream, Data, DataStream,
+    EngineStream, ResponseStream,
+};
+use std::sync::Arc;
+
+/// Type alias for a receiver of recorded stream data.
+/// The recording is sent once, when the wrapped stream reports its end.
+pub type RecordedStreamReceiver<R> = oneshot::Receiver<RecordedStream<R>>;
+
+/// Type alias for the return type of recording functions:
+/// the recording stream to drive, plus the receiver for the finished recording.
+pub type RecordingResult<R> = (EngineStream<R>, RecordedStreamReceiver<R>);
+
+/// A response paired with the instant it was recorded and its position in the stream.
+#[derive(Debug, Clone)]
+pub struct TimestampedResponse<T> {
+    /// The actual response data
+    pub response: T,
+    /// `Instant` captured when this response was recorded
+    pub timestamp: Instant,
+    /// Position of this response in the stream, counted from zero
+    pub sequence_number: usize,
+}
+
+impl<T> TimestampedResponse<T> {
+    /// Wrap `response`, stamping it with the current instant.
+    pub fn new(response: T, sequence_number: usize) -> Self {
+        let timestamp = Instant::now();
+        Self {
+            response,
+            timestamp,
+            sequence_number,
+        }
+    }
+
+    /// Borrow the wrapped response.
+    pub fn data(&self) -> &T {
+        let Self { response, .. } = self;
+        response
+    }
+
+    /// Time between `start_time` and the moment this response was recorded.
+    pub fn elapsed_since(&self, start_time: Instant) -> Duration {
+        let recorded_at = self.timestamp;
+        recorded_at.duration_since(start_time)
+    }
+}
+
+/// Trait for requests that can provide hints about expected response count.
+/// This enables capacity pre-allocation for better performance.
+pub trait CapacityHint {
+    /// Estimate the number of responses this request might generate.
+    /// Returns None if estimation is not possible.
+    /// The value is only a hint; nothing visible here requires it to be exact.
+    fn estimated_response_count(&self) -> Option<usize>;
+}
+
+/// Recording mode determines how the recorder behaves with the stream.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum RecordingMode {
+    /// Pass responses through while recording (scan mode).
+    /// Each item is cloned into the recording and the original is yielded downstream.
+    Scan,
+    /// Consume responses as terminus (sink mode).
+    /// Each item is moved into the recording; the stream yields no items, only its end.
+    Sink,
+}
+
+/// The finished recording of one stream: every response with its timestamp, plus the
+/// instants at which recording began and ended. Analysis operates on this type.
+#[derive(Debug, Clone)]
+pub struct RecordedStream<T> {
+    /// Recorded responses, in arrival order
+    responses: Vec<TimestampedResponse<T>>,
+
+    /// Instant at which recording began
+    start_time: Instant,
+
+    /// Instant at which recording finished
+    end_time: Instant,
+}
+
+impl<T> RecordedStream<T> {
+    /// Bundle already-collected responses with the recording's start and end instants.
+    pub fn new(
+        responses: Vec<TimestampedResponse<T>>,
+        start_time: Instant,
+        end_time: Instant,
+    ) -> Self {
+        Self {
+            start_time,
+            end_time,
+            responses,
+        }
+    }
+
+    /// How many responses were recorded.
+    pub fn response_count(&self) -> usize {
+        self.responses().len()
+    }
+
+    /// Time between the start and the end of the recording.
+    pub fn total_duration(&self) -> Duration {
+        let Self {
+            start_time,
+            end_time,
+            ..
+        } = self;
+        end_time.duration_since(*start_time)
+    }
+
+    /// The recorded responses as a slice.
+    pub fn responses(&self) -> &[TimestampedResponse<T>] {
+        self.responses.as_slice()
+    }
+
+    /// Instant at which recording began.
+    pub fn start_time(&self) -> &Instant {
+        &self.start_time
+    }
+
+    /// Instant at which recording finished.
+    pub fn end_time(&self) -> &Instant {
+        &self.end_time
+    }
+}
+
+/// Recording stream that wraps an AsyncEngineStream and records responses.
+/// Following the pattern of ResponseStream for AsyncEngine compatibility.
+///
+/// The finished recording is sent exactly once, when the wrapped stream ends. If this
+/// stream is dropped before that, the sender is dropped without sending.
+pub struct RecordingStream<R: Data> {
+    /// The wrapped stream
+    stream: DataStream<R>,
+    /// Context from the original stream
+    ctx: Arc<dyn AsyncEngineContext>,
+    /// Recording mode
+    mode: RecordingMode,
+    /// Recorded responses
+    responses: Vec<TimestampedResponse<R>>,
+    /// When recording started (taken at construction, not at first poll)
+    start_time: Instant,
+    /// Channel to send recorded data when stream completes; `None` once it has been used
+    recorded_tx: Option<oneshot::Sender<RecordedStream<R>>>,
+}
+
+// `poll_next` takes `&mut Self` out of the pin via `get_mut`, which requires `Unpin`.
+impl<R: Data> Unpin for RecordingStream<R> {}
+
+impl<R: Data + Clone> RecordingStream<R> {
+    /// Wrap a raw stream and its context in a recorder.
+    ///
+    /// `capacity`, when given, pre-reserves room for that many responses. The start
+    /// time of the recording is taken here. `recorded_tx` receives the finished
+    /// recording when the wrapped stream ends.
+    pub fn from_stream_and_context(
+        stream: DataStream<R>,
+        ctx: Arc<dyn AsyncEngineContext>,
+        mode: RecordingMode,
+        capacity: Option<usize>,
+        recorded_tx: oneshot::Sender<RecordedStream<R>>,
+    ) -> Self {
+        let mut responses = Vec::new();
+        // Reserving zero is a no-op, so a missing hint allocates nothing.
+        responses.reserve(capacity.unwrap_or(0));
+        let start_time = Instant::now();
+
+        Self {
+            stream,
+            ctx,
+            mode,
+            responses,
+            start_time,
+            recorded_tx: Some(recorded_tx),
+        }
+    }
+
+    /// Wrap an `EngineStream`, reusing the context it already carries (private constructor).
+    fn from_async_engine_stream(
+        stream: EngineStream<R>,
+        mode: RecordingMode,
+        capacity: Option<usize>,
+        recorded_tx: oneshot::Sender<RecordedStream<R>>,
+    ) -> Self {
+        let context = stream.context();
+        Self::from_stream_and_context(stream, context, mode, capacity, recorded_tx)
+    }
+
+    /// Box and pin this recorder as an `EngineStream<R>`.
+    pub fn into_async_engine_stream(self) -> EngineStream<R> {
+        let boxed: EngineStream<R> = Box::pin(self);
+        boxed
+    }
+}
+
+impl<R: Data + Clone> Stream for RecordingStream<R> {
+    type Item = R;
+
+    /// Poll the wrapped stream, recording every item it produces.
+    ///
+    /// * `Scan`: each item is cloned into the recording and the original is yielded.
+    /// * `Sink`: each item is moved into the recording and the inner stream is polled
+    ///   again immediately, so nothing is yielded until the inner stream is pending
+    ///   or finished.
+    ///
+    /// When the inner stream ends, the recording is sent through the oneshot channel
+    /// (at most once) and `None` is returned.
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let this = self.get_mut();
+
+        loop {
+            match Pin::new(&mut this.stream).poll_next(cx) {
+                Poll::Ready(Some(item)) => {
+                    // Always capture timestamp first (cheap operation)
+                    let timestamp = Instant::now();
+                    let sequence_number = this.responses.len();
+
+                    match this.mode {
+                        RecordingMode::Scan => {
+                            // Clone for recording, pass original through
+                            this.responses.push(TimestampedResponse {
+                                response: item.clone(),
+                                timestamp,
+                                sequence_number,
+                            });
+                            return Poll::Ready(Some(item));
+                        }
+                        RecordingMode::Sink => {
+                            // Move item directly into recording (no clone needed)
+                            this.responses.push(TimestampedResponse {
+                                response: item,
+                                timestamp,
+                                sequence_number,
+                            });
+                            // Keep draining here rather than waking ourselves and
+                            // returning Pending: that would add a scheduler round trip
+                            // per item and delay the next item's timestamp.
+                        }
+                    }
+                }
+                Poll::Ready(None) => {
+                    // Stream ended - send recorded data
+                    if let Some(tx) = this.recorded_tx.take() {
+                        let recorded = RecordedStream::new(
+                            std::mem::take(&mut this.responses),
+                            this.start_time,
+                            Instant::now(),
+                        );
+                        let _ = tx.send(recorded); // Ignore if receiver dropped
+                    }
+
+                    return Poll::Ready(None);
+                }
+                Poll::Pending => return Poll::Pending,
+            }
+        }
+    }
+}
+
+// Marker impl: lets a RecordingStream be boxed as an `EngineStream<R>`.
+impl<R: Data + Clone> AsyncEngineStream<R> for RecordingStream<R> {}
+
+impl<R: Data + Clone> AsyncEngineContextProvider for RecordingStream<R> {
+    /// Hand out another shared handle to the context this recorder was built with.
+    fn context(&self) -> Arc<dyn AsyncEngineContext> {
+        Arc::clone(&self.ctx)
+    }
+}
+
+impl<R: Data + Clone> std::fmt::Debug for RecordingStream<R> {
+    /// Prints the mode, the number of responses held so far and the context; the recorded
+    /// payloads themselves are not formatted.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let recorded_so_far = self.responses.len();
+
+        let mut repr = f.debug_struct("RecordingStream");
+        repr.field("mode", &self.mode);
+        repr.field("responses_count", &recorded_so_far);
+        repr.field("ctx", &self.ctx);
+        repr.finish()
+    }
+}
+
+/// Wrap an `EngineStream` in a `RecordingStream` without a capacity hint.
+///
+/// Returns the pinned, boxed recording stream together with the oneshot receiver that is
+/// handed the recorded data.
+pub fn record_stream<R: Data + Clone>(
+    stream: EngineStream<R>,
+    mode: RecordingMode,
+) -> RecordingResult<R> {
+    let (recorded_tx, recorded_rx) = oneshot::channel();
+    let wrapped = Box::pin(RecordingStream::from_async_engine_stream(
+        stream,
+        mode,
+        None,
+        recorded_tx,
+    ));
+    (wrapped, recorded_rx)
+}
+
+/// Wrap a raw `DataStream` and an explicit context in a `RecordingStream` without a
+/// capacity hint.
+///
+/// Returns the pinned, boxed recording stream together with the oneshot receiver that is
+/// handed the recorded data.
+pub fn record_stream_with_context<R: Data + Clone>(
+    stream: DataStream<R>,
+    ctx: Arc<dyn AsyncEngineContext>,
+    mode: RecordingMode,
+) -> RecordingResult<R> {
+    let (recorded_tx, recorded_rx) = oneshot::channel();
+    let wrapped = Box::pin(RecordingStream::from_stream_and_context(
+        stream,
+        ctx,
+        mode,
+        None,
+        recorded_tx,
+    ));
+    (wrapped, recorded_rx)
+}
+
+/// Wrap an `EngineStream` in a `RecordingStream`, forwarding `capacity` as the capacity hint.
+///
+/// Returns the pinned, boxed recording stream together with the oneshot receiver that is
+/// handed the recorded data.
+pub fn record_stream_with_capacity<R: Data + Clone>(
+    stream: EngineStream<R>,
+    mode: RecordingMode,
+    capacity: usize,
+) -> RecordingResult<R> {
+    let (recorded_tx, recorded_rx) = oneshot::channel();
+    let wrapped = Box::pin(RecordingStream::from_async_engine_stream(
+        stream,
+        mode,
+        Some(capacity),
+        recorded_tx,
+    ));
+    (wrapped, recorded_rx)
+}
+
+/// Wrap an `EngineStream` in a `RecordingStream`, taking the capacity hint from `request`.
+///
+/// When `request.estimated_response_count()` yields a value it is forwarded to
+/// `record_stream_with_capacity`; otherwise this falls back to `record_stream`.
+pub fn record_stream_with_request_hint<R: Data + Clone, Req: CapacityHint>(
+    stream: EngineStream<R>,
+    mode: RecordingMode,
+    request: &Req,
+) -> RecordingResult<R> {
+    if let Some(cap) = request.estimated_response_count() {
+        record_stream_with_capacity(stream, mode, cap)
+    } else {
+        record_stream(stream, mode)
+    }
+}
+
+/// Wrap a raw `DataStream` and an explicit context in a `RecordingStream`, forwarding
+/// `capacity` as the capacity hint.
+///
+/// Returns the pinned, boxed recording stream together with the oneshot receiver that is
+/// handed the recorded data.
+pub fn record_stream_with_context_and_capacity<R: Data + Clone>(
+    stream: DataStream<R>,
+    ctx: Arc<dyn AsyncEngineContext>,
+    mode: RecordingMode,
+    capacity: usize,
+) -> RecordingResult<R> {
+    let (recorded_tx, recorded_rx) = oneshot::channel();
+    let wrapped = Box::pin(RecordingStream::from_stream_and_context(
+        stream,
+        ctx,
+        mode,
+        Some(capacity),
+        recorded_tx,
+    ));
+    (wrapped, recorded_rx)
+}
+
+/// Create a recording stream from a pinned, boxed `ResponseStream` (convenience wrapper).
+///
+/// Simply forwards `response_stream` and `mode` to `record_stream`, so no capacity hint is
+/// supplied and the return value is whatever `record_stream` returns.
+pub fn record_response_stream<R: Data + Clone>(
+    response_stream: Pin<Box<ResponseStream<R>>>,
+    mode: RecordingMode,
+) -> RecordingResult<R> {
+    record_stream(response_stream, mode)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use dynamo_runtime::engine::ResponseStream;
+    use futures::stream;
+    use std::time::Duration;
+
+    #[test]
+    fn test_timestamped_response_creation() {
+        // `new` should keep the payload and sequence number it was given
+        let payload = "test response";
+        let stamped = TimestampedResponse::new(payload, 0);
+
+        assert_eq!(stamped.data(), &payload);
+        assert_eq!(stamped.response, payload);
+        assert_eq!(stamped.sequence_number, 0);
+    }
+
+    #[test]
+    fn test_recorded_stream_analysis() {
+        let origin = Instant::now();
+        // Mock responses are placed at known millisecond offsets from `origin`
+        let at = |offset_ms: u64| origin + Duration::from_millis(offset_ms);
+
+        let first = TimestampedResponse {
+            response: "response1",
+            timestamp: origin,
+            sequence_number: 0,
+        };
+        let second = TimestampedResponse {
+            response: "response2",
+            timestamp: at(100),
+            sequence_number: 1,
+        };
+        let third = TimestampedResponse {
+            response: "response3",
+            timestamp: at(250),
+            sequence_number: 2,
+        };
+
+        // The recording ends exactly when the last response arrived
+        let recorded = RecordedStream::new(vec![first, second, third], origin, at(250));
+
+        assert_eq!(recorded.response_count(), 3);
+        assert_eq!(recorded.total_duration(), Duration::from_millis(250));
+    }
+
+    #[test]
+    fn test_performance_metrics_conversion() {
+        let origin = Instant::now();
+        let at = |offset_ms: u64| origin + Duration::from_millis(offset_ms);
+
+        // Two identical payloads, 50 ms and 150 ms after the start of the recording
+        let early = TimestampedResponse {
+            response: "test",
+            timestamp: at(50),
+            sequence_number: 0,
+        };
+        let late = TimestampedResponse {
+            response: "test",
+            timestamp: at(150),
+            sequence_number: 1,
+        };
+
+        let recorded = RecordedStream::new(vec![early, late], origin, at(150));
+
+        assert_eq!(recorded.response_count(), 2);
+        assert_eq!(recorded.total_duration(), Duration::from_millis(150));
+    }
+
+    #[tokio::test]
+    async fn test_recording_stream_scan_mode() {
+        use futures::StreamExt;
+
+        let expected = vec!["token1", "token2", "token3"];
+
+        // Scan mode: wrap an iterator-backed stream and a mock context in a recorder
+        let mock_ctx = Arc::new(MockContext::new());
+        let (passthrough, recording_rx) = record_stream_with_context(
+            Box::pin(stream::iter(expected.clone())),
+            mock_ctx,
+            RecordingMode::Scan,
+        );
+
+        // Every item must come out of the wrapper unchanged
+        let seen: Vec<_> = passthrough.collect().await;
+        assert_eq!(seen, expected);
+
+        // The recording must hold the same items, in order
+        let recorded = recording_rx.await.unwrap();
+        assert_eq!(recorded.response_count(), 3);
+        for (idx, token) in expected.iter().enumerate() {
+            assert_eq!(recorded.responses[idx].response, *token);
+        }
+
+        // Some non-zero amount of time must have been measured
+        assert!(recorded.total_duration() > Duration::from_nanos(0));
+    }
+
+    #[tokio::test]
+    async fn test_recording_stream_sink_mode() {
+        use futures::StreamExt;
+
+        let expected = vec!["token1", "token2", "token3"];
+
+        // Sink mode: wrap an iterator-backed stream and a mock context in a recorder
+        let mock_ctx = Arc::new(MockContext::new());
+        let (sink, recording_rx) = record_stream_with_context(
+            Box::pin(stream::iter(expected.clone())),
+            mock_ctx,
+            RecordingMode::Sink,
+        );
+
+        // Nothing may be emitted downstream; the wrapper just runs to completion
+        let emitted: Vec<_> = sink.collect().await;
+        assert_eq!(emitted, Vec::<&str>::new());
+
+        // All original items must nevertheless show up in the recording, in order
+        let recorded = recording_rx.await.unwrap();
+        assert_eq!(recorded.response_count(), 3);
+        for (idx, token) in expected.iter().enumerate() {
+            assert_eq!(recorded.responses[idx].response, *token);
+        }
+
+        // Some non-zero amount of time must have been measured
+        assert!(recorded.total_duration() > Duration::from_nanos(0));
+    }
+
+    #[tokio::test]
+    async fn test_recording_stream_from_response_stream() {
+        use futures::StreamExt;
+
+        let expected = vec!["token1", "token2", "token3"];
+
+        // Build a ResponseStream first, then hand it to the convenience wrapper
+        let mock_ctx = Arc::new(MockContext::new());
+        let response_stream =
+            ResponseStream::new(Box::pin(stream::iter(expected.clone())), mock_ctx);
+        let (passthrough, recording_rx) =
+            record_response_stream(response_stream, RecordingMode::Scan);
+
+        // Every item must come out of the wrapper unchanged
+        let seen: Vec<_> = passthrough.collect().await;
+        assert_eq!(seen, expected);
+
+        // The recording must hold the same items, in order
+        let recorded = recording_rx.await.unwrap();
+        assert_eq!(recorded.response_count(), 3);
+        for (idx, token) in expected.iter().enumerate() {
+            assert_eq!(recorded.responses[idx].response, *token);
+        }
+
+        // Some non-zero amount of time must have been measured
+        assert!(recorded.total_duration() > Duration::from_nanos(0));
+    }
+
+    // Mock context for testing: carries only the identifier that its `AsyncEngineContext`
+    // impl reports through `id()`.
+    #[derive(Debug)]
+    struct MockContext {
+        id: String,
+    }
+
+    impl MockContext {
+        // Every mock shares the same fixed identifier
+        fn new() -> Self {
+            let id = String::from("test-context");
+            Self { id }
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl AsyncEngineContext for MockContext {
+        fn id(&self) -> &str {
+            self.id.as_str()
+        }
+
+        // The control hooks below are intentionally inert for testing
+        fn stop(&self) {}
+
+        fn stop_generating(&self) {}
+
+        fn kill(&self) {}
+
+        // The mock never reports itself as stopped or killed
+        fn is_stopped(&self) -> bool {
+            false
+        }
+
+        fn is_killed(&self) -> bool {
+            false
+        }
+
+        // Both futures complete without doing any work
+        async fn stopped(&self) {}
+
+        async fn killed(&self) {}
+    }
+}
--- a/lib/llm/tests/http-service.rs
+++ b/lib/llm/tests/http-service.rs
@@ -14,12 +14,18 @@
 // limitations under the License.

 use anyhow::Error;
+use async_openai::config::OpenAIConfig;
 use async_stream::stream;
-use dynamo_llm::http::service::{
-    error::HttpError,
-    metrics::{Endpoint, RequestType, Status},
-    service_v2::HttpService,
-    Metrics,
+use dynamo_llm::http::{
+    client::{
+        GenericBYOTClient, HttpClientConfig, HttpRequestContext, NvCustomClient, PureOpenAIClient,
+    },
+    service::{
+        error::HttpError,
+        metrics::{Endpoint, RequestType, Status},
+        service_v2::HttpService,
+        Metrics,
+    },
 };
 use dynamo_llm::protocols::{
    openai::{
@@ -29,13 +35,16 @@ use dynamo_llm::protocols::{
    Annotated,
 };
 use dynamo_runtime::{
+    engine::AsyncEngineContext,
    pipeline::{
        async_trait, AsyncEngine, AsyncEngineContextProvider, ManyOut, ResponseStream, SingleIn,
    },
    CancellationToken,
 };
+use futures::StreamExt;
 use prometheus::{proto::MetricType, Registry};
 use reqwest::StatusCode;
+use rstest::*;
 use std::sync::Arc;

 struct CounterEngine {}
@@ -470,3 +479,404 @@ async fn test_http_service() {
    cancel_token.cancel();
    task.await.unwrap().unwrap();
 }
+
+// === HTTP Client Tests ===
+
+/// Poll the service's `/health` URL on `localhost:port` until a request gets any response.
+///
+/// Failed requests are retried every 10 ms; a request that still fails once 5 seconds have
+/// passed since the first attempt makes the function panic with the last error.
+async fn wait_for_service_ready(port: u16) {
+    let health_url = format!("http://localhost:{}/health", port);
+    let deadline = tokio::time::Instant::now() + tokio::time::Duration::from_secs(5);
+
+    while let Err(e) = reqwest::get(&health_url).await {
+        if tokio::time::Instant::now() >= deadline {
+            panic!("Service failed to start within timeout: {}", e);
+        }
+        tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
+    }
+}
+
+/// Fixture: an `HttpService` on `port` with model "foo" (chat) served by a `CounterEngine`
+/// and model "bar" (chat and completions) served by an `AlwaysFailEngine`.
+#[fixture]
+fn service_with_engines(
+    #[default(8990)] port: u16,
+) -> (HttpService, Arc<CounterEngine>, Arc<AlwaysFailEngine>) {
+    let counter = Arc::new(CounterEngine {});
+    let failure = Arc::new(AlwaysFailEngine {});
+    let service = HttpService::builder().port(port).build().unwrap();
+
+    {
+        // Register the engines under the model names the tests request
+        let models = service.model_manager();
+        models
+            .add_chat_completions_model("foo", counter.clone())
+            .unwrap();
+        models
+            .add_chat_completions_model("bar", failure.clone())
+            .unwrap();
+        models
+            .add_completions_model("bar", failure.clone())
+            .unwrap();
+    }
+
+    (service, counter, failure)
+}
+
+/// Fixture: a non-verbose `PureOpenAIClient` whose API base is `localhost:port/v1`.
+#[fixture]
+fn pure_openai_client(#[default(8990)] port: u16) -> PureOpenAIClient {
+    let api_base = format!("http://localhost:{}/v1", port);
+    let openai_config = OpenAIConfig::new().with_api_base(api_base);
+    PureOpenAIClient::new(HttpClientConfig {
+        openai_config,
+        verbose: false,
+    })
+}
+
+/// Fixture: a non-verbose `NvCustomClient` whose API base is `localhost:port/v1`.
+#[fixture]
+fn nv_custom_client(#[default(8991)] port: u16) -> NvCustomClient {
+    let api_base = format!("http://localhost:{}/v1", port);
+    let openai_config = OpenAIConfig::new().with_api_base(api_base);
+    NvCustomClient::new(HttpClientConfig {
+        openai_config,
+        verbose: false,
+    })
+}
+
+/// Fixture: a non-verbose `GenericBYOTClient` whose API base is `localhost:port/v1`.
+#[fixture]
+fn generic_byot_client(#[default(8992)] port: u16) -> GenericBYOTClient {
+    let api_base = format!("http://localhost:{}/v1", port);
+    let openai_config = OpenAIConfig::new().with_api_base(api_base);
+    GenericBYOTClient::new(HttpClientConfig {
+        openai_config,
+        verbose: false,
+    })
+}
+
+/// Exercises `PureOpenAIClient` against an in-process `HttpService` on port 8990.
+///
+/// Three scenarios run in sequence: a streaming request to model "foo" that must yield ok
+/// items, a streaming request to model "bar" whose first item must be an error, and a
+/// request with a caller-supplied `HttpRequestContext` whose id must be preserved.
+#[rstest]
+#[tokio::test]
+async fn test_pure_openai_client(
+    #[with(8990)] service_with_engines: (HttpService, Arc<CounterEngine>, Arc<AlwaysFailEngine>),
+    #[with(8990)] pure_openai_client: PureOpenAIClient,
+) {
+    let (service, _counter, _failure) = service_with_engines;
+    let token = CancellationToken::new();
+    let cancel_token = token.clone();
+
+    // Start the service
+    let task = tokio::spawn(async move { service.run(token).await });
+
+    // Wait for service to be ready
+    wait_for_service_ready(8990).await;
+
+    // Test successful streaming request
+    let request = async_openai::types::CreateChatCompletionRequestArgs::default()
+        .model("foo")
+        .messages(vec![
+            async_openai::types::ChatCompletionRequestMessage::User(
+                async_openai::types::ChatCompletionRequestUserMessage {
+                    content: async_openai::types::ChatCompletionRequestUserMessageContent::Text(
+                        "Hi".to_string(),
+                    ),
+                    name: None,
+                },
+            ),
+        ])
+        .stream(true)
+        .max_tokens(50u32)
+        .build()
+        .unwrap();
+
+    let result = pure_openai_client.chat_stream(request).await;
+    assert!(result.is_ok(), "PureOpenAI client should succeed");
+
+    // Split the result into the response stream and its context; only the stream is used here
+    let (mut stream, _context) = result.unwrap().dissolve();
+    let mut count = 0;
+    while let Some(response) = stream.next().await {
+        count += 1;
+        assert!(response.is_ok(), "Response should be ok");
+        if count >= 3 {
+            break; // Don't consume entire stream
+        }
+    }
+    assert!(count > 0, "Should receive at least one response");
+
+    // Test error case with invalid model
+    let request = async_openai::types::CreateChatCompletionRequestArgs::default()
+        .model("bar") // This model will fail
+        .messages(vec![
+            async_openai::types::ChatCompletionRequestMessage::User(
+                async_openai::types::ChatCompletionRequestUserMessage {
+                    content: async_openai::types::ChatCompletionRequestUserMessageContent::Text(
+                        "Hi".to_string(),
+                    ),
+                    name: None,
+                },
+            ),
+        ])
+        .stream(true)
+        .max_tokens(50u32)
+        .build()
+        .unwrap();
+
+    let result = pure_openai_client.chat_stream(request).await;
+    assert!(
+        result.is_ok(),
+        "Client should return stream even for failing model"
+    );
+
+    // NOTE(review): if the stream ends without yielding any item, the `if let` below is
+    // skipped and the error path is not actually verified - confirm that is acceptable.
+    let (mut stream, _context) = result.unwrap().dissolve();
+    if let Some(response) = stream.next().await {
+        assert!(
+            response.is_err(),
+            "Response should be error for failing model"
+        );
+    }
+
+    // Test context management
+    let ctx = HttpRequestContext::new();
+    let request = async_openai::types::CreateChatCompletionRequestArgs::default()
+        .model("foo")
+        .messages(vec![
+            async_openai::types::ChatCompletionRequestMessage::User(
+                async_openai::types::ChatCompletionRequestUserMessage {
+                    content: async_openai::types::ChatCompletionRequestUserMessageContent::Text(
+                        "Hi".to_string(),
+                    ),
+                    name: None,
+                },
+            ),
+        ])
+        .stream(true)
+        .max_tokens(50u32)
+        .build()
+        .unwrap();
+
+    let result = pure_openai_client
+        .chat_stream_with_context(request, ctx.clone())
+        .await;
+    assert!(result.is_ok(), "Context-based request should succeed");
+
+    // The stream is not consumed here; only the returned context is inspected
+    let (_stream, context) = result.unwrap().dissolve();
+    assert_eq!(context.id(), ctx.id(), "Context ID should match");
+
+    // Shut the service down and surface any error from its task
+    cancel_token.cancel();
+    task.await.unwrap().unwrap();
+}
+
+/// Exercises `NvCustomClient` against an in-process `HttpService` on port 8991.
+///
+/// Three scenarios run in sequence, each wrapping an OpenAI request in
+/// `NvCreateChatCompletionRequest` with `nvext: None`: a streaming request to model "foo"
+/// that must yield ok items, a streaming request to model "bar" whose first item must be an
+/// error, and a request with a caller-supplied `HttpRequestContext` whose id must be preserved.
+#[rstest]
+#[tokio::test]
+async fn test_nv_custom_client(
+    #[with(8991)] service_with_engines: (HttpService, Arc<CounterEngine>, Arc<AlwaysFailEngine>),
+    #[with(8991)] nv_custom_client: NvCustomClient,
+) {
+    let (service, _counter, _failure) = service_with_engines;
+    let token = CancellationToken::new();
+    let cancel_token = token.clone();
+
+    // Start the service
+    let task = tokio::spawn(async move { service.run(token).await });
+
+    // Wait for service to be ready
+    wait_for_service_ready(8991).await;
+
+    // Test successful streaming request
+    let inner_request = async_openai::types::CreateChatCompletionRequestArgs::default()
+        .model("foo")
+        .messages(vec![
+            async_openai::types::ChatCompletionRequestMessage::User(
+                async_openai::types::ChatCompletionRequestUserMessage {
+                    content: async_openai::types::ChatCompletionRequestUserMessageContent::Text(
+                        "Hi".to_string(),
+                    ),
+                    name: None,
+                },
+            ),
+        ])
+        .stream(true)
+        .max_tokens(50u32)
+        .build()
+        .unwrap();
+
+    let request = NvCreateChatCompletionRequest {
+        inner: inner_request,
+        nvext: None,
+    };
+
+    let result = nv_custom_client.chat_stream(request).await;
+    assert!(result.is_ok(), "NvCustom client should succeed");
+
+    // Split the result into the response stream and its context; only the stream is used here
+    let (mut stream, _context) = result.unwrap().dissolve();
+    let mut count = 0;
+    while let Some(response) = stream.next().await {
+        count += 1;
+        assert!(response.is_ok(), "Response should be ok");
+        if count >= 3 {
+            break; // Don't consume entire stream
+        }
+    }
+    assert!(count > 0, "Should receive at least one response");
+
+    // Test error case with invalid model
+    let inner_request = async_openai::types::CreateChatCompletionRequestArgs::default()
+        .model("bar") // This model will fail
+        .messages(vec![
+            async_openai::types::ChatCompletionRequestMessage::User(
+                async_openai::types::ChatCompletionRequestUserMessage {
+                    content: async_openai::types::ChatCompletionRequestUserMessageContent::Text(
+                        "Hi".to_string(),
+                    ),
+                    name: None,
+                },
+            ),
+        ])
+        .stream(true)
+        .max_tokens(50u32)
+        .build()
+        .unwrap();
+
+    let request = NvCreateChatCompletionRequest {
+        inner: inner_request,
+        nvext: None,
+    };
+
+    let result = nv_custom_client.chat_stream(request).await;
+    assert!(
+        result.is_ok(),
+        "Client should return stream even for failing model"
+    );
+
+    // NOTE(review): if the stream ends without yielding any item, the `if let` below is
+    // skipped and the error path is not actually verified - confirm that is acceptable.
+    let (mut stream, _context) = result.unwrap().dissolve();
+    if let Some(response) = stream.next().await {
+        assert!(
+            response.is_err(),
+            "Response should be error for failing model"
+        );
+    }
+
+    // Test context management
+    let ctx = HttpRequestContext::new();
+    let inner_request = async_openai::types::CreateChatCompletionRequestArgs::default()
+        .model("foo")
+        .messages(vec![
+            async_openai::types::ChatCompletionRequestMessage::User(
+                async_openai::types::ChatCompletionRequestUserMessage {
+                    content: async_openai::types::ChatCompletionRequestUserMessageContent::Text(
+                        "Hi".to_string(),
+                    ),
+                    name: None,
+                },
+            ),
+        ])
+        .stream(true)
+        .max_tokens(50u32)
+        .build()
+        .unwrap();
+
+    let request = NvCreateChatCompletionRequest {
+        inner: inner_request,
+        nvext: None,
+    };
+
+    let result = nv_custom_client
+        .chat_stream_with_context(request, ctx.clone())
+        .await;
+    assert!(result.is_ok(), "Context-based request should succeed");
+
+    // The stream is not consumed here; only the returned context is inspected
+    let (_stream, context) = result.unwrap().dissolve();
+    assert_eq!(context.id(), ctx.id(), "Context ID should match");
+
+    // Shut the service down and surface any error from its task
+    cancel_token.cancel();
+    task.await.unwrap().unwrap();
+}
+
+/// Exercises `GenericBYOTClient` against an in-process `HttpService` on port 8992.
+///
+/// Requests are built as raw `serde_json` values. Three scenarios run in sequence: a
+/// streaming request to model "foo" that must yield ok items, a streaming request to model
+/// "bar" whose first item must be an error, and a request with a caller-supplied
+/// `HttpRequestContext` whose id must be preserved.
+#[rstest]
+#[tokio::test]
+async fn test_generic_byot_client(
+    #[with(8992)] service_with_engines: (HttpService, Arc<CounterEngine>, Arc<AlwaysFailEngine>),
+    #[with(8992)] generic_byot_client: GenericBYOTClient,
+) {
+    let (service, _counter, _failure) = service_with_engines;
+    let token = CancellationToken::new();
+    let cancel_token = token.clone();
+
+    // Start the service
+    let task = tokio::spawn(async move { service.run(token).await });
+
+    // Wait for service to be ready
+    wait_for_service_ready(8992).await;
+
+    // Test successful streaming request
+    let request = serde_json::json!({
+        "model": "foo",
+        "messages": [
+            {
+                "role": "user",
+                "content": "Hi"
+            }
+        ],
+        "stream": true,
+        "max_tokens": 50
+    });
+
+    let result = generic_byot_client.chat_stream(request).await;
+    assert!(result.is_ok(), "GenericBYOT client should succeed");
+
+    // Split the result into the response stream and its context; only the stream is used here
+    let (mut stream, _context) = result.unwrap().dissolve();
+    let mut count = 0;
+    while let Some(response) = stream.next().await {
+        // Each item is printed for inspection before it is checked
+        println!("Response: {:?}", response);
+        count += 1;
+        assert!(response.is_ok(), "Response should be ok");
+        if count >= 3 {
+            break; // Don't consume entire stream
+        }
+    }
+    assert!(count > 0, "Should receive at least one response");
+
+    // Test error case with invalid model
+    let request = serde_json::json!({
+        "model": "bar", // This model will fail
+        "messages": [
+            {
+                "role": "user",
+                "content": "Hi"
+            }
+        ],
+        "stream": true,
+        "max_tokens": 50
+    });
+
+    let result = generic_byot_client.chat_stream(request).await;
+    assert!(
+        result.is_ok(),
+        "Client should return stream even for failing model"
+    );
+
+    // NOTE(review): if the stream ends without yielding any item, the `if let` below is
+    // skipped and the error path is not actually verified - confirm that is acceptable.
+    let (mut stream, _context) = result.unwrap().dissolve();
+    if let Some(response) = stream.next().await {
+        assert!(
+            response.is_err(),
+            "Response should be error for failing model"
+        );
+    }
+
+    // Test context management
+    let ctx = HttpRequestContext::new();
+    let request = serde_json::json!({
+        "model": "foo",
+        "messages": [
+            {
+                "role": "user",
+                "content": "Hi"
+            }
+        ],
+        "stream": true,
+        "max_tokens": 50
+    });
+
+    let result = generic_byot_client
+        .chat_stream_with_context(request, ctx.clone())
+        .await;
+    assert!(result.is_ok(), "Context-based request should succeed");
+
+    // The stream is not consumed here; only the returned context is inspected
+    let (_stream, context) = result.unwrap().dissolve();
+    assert_eq!(context.id(), ctx.id(), "Context ID should match");
+
+    // Shut the service down and surface any error from its task
+    cancel_token.cancel();
+    task.await.unwrap().unwrap();
+}
--- a/lib/runtime/src/engine.rs
+++ b/lib/runtime/src/engine.rs
@@ -165,7 +165,7 @@ pub trait AsyncEngineContext: Send + Sync + Debug {
 ///
 /// This trait is implemented by both unary and streaming engine results, allowing
 /// uniform access to context information regardless of the operation type.
-pub trait AsyncEngineContextProvider: Send + Sync + Debug {
+pub trait AsyncEngineContextProvider: Send + Debug {
    fn context(&self) -> Arc<dyn AsyncEngineContext>;
 }