Commit 0a3f2c69 authored by Ryan McCormick's avatar Ryan McCormick Committed by GitHub
Browse files

feat: Add configurable DYN_TOKEN_ECHO_DELAY_MS for echo engine testing (#81)

parent 11e3e188
......@@ -278,3 +278,34 @@ sudo docker run --gpus all -it -v /home/user:/outside-home gitlab-master.nvidia.
```
Copy the trt-llm engine, the model's `.json` files (for the model deployment card) and the `nio` binary built for the correct glibc (container is Ubuntu 22.04 currently) into that container.
## Echo Engines
Dynamo includes two echo engines for testing and debugging purposes:
### echo_core
The `echo_core` engine accepts pre-processed requests and echoes the tokens back as the response. This is useful for testing pre-processing functionality as the response will include the full prompt template.
```
dynamo-run in=http out=echo_core --model-path <hf-repo-checkout>
```
### echo_full
The `echo_full` engine accepts un-processed requests and echoes the prompt back as the response.
```
dynamo-run in=http out=echo_full
```
### Configuration
Both echo engines use a configurable delay between tokens to simulate generation speed. You can adjust this using the `DYN_TOKEN_ECHO_DELAY_MS` environment variable:
```
# Set token echo delay to 1ms (1000 tokens per second)
DYN_TOKEN_ECHO_DELAY_MS=1 dynamo-run in=http out=echo_full
```
The default delay is 10ms, which produces approximately 100 tokens per second.
......@@ -13,5 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod common;
pub mod echo_core;
pub mod echo_full;
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::env;
use std::sync::LazyLock;
use std::time::Duration;
/// Delay inserted between successive echoed tokens.
/// Defaults to 10ms (~100 tok/s); override via the
/// `DYN_TOKEN_ECHO_DELAY_MS` environment variable (value in milliseconds).
/// An unset or unparsable variable falls back to the default.
pub static TOKEN_ECHO_DELAY: LazyLock<Duration> = LazyLock::new(|| {
    const DEFAULT_DELAY_MS: u64 = 10;
    let ms = match env::var("DYN_TOKEN_ECHO_DELAY_MS") {
        // Variable present: use it if it parses as u64, else keep the default.
        Ok(raw) => raw.parse::<u64>().unwrap_or(DEFAULT_DELAY_MS),
        // Variable absent (or not valid unicode): use the default.
        Err(_) => DEFAULT_DELAY_MS,
    };
    Duration::from_millis(ms)
});
......@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::{sync::Arc, time::Duration};
use std::sync::Arc;
use async_stream::stream;
use async_trait::async_trait;
......@@ -25,9 +25,7 @@ use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseSt
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
/// How long to sleep between echoed tokens.
/// 50ms gives us 20 tok/s.
const TOKEN_ECHO_DELAY: Duration = Duration::from_millis(50);
use super::common::TOKEN_ECHO_DELAY;
/// Engine that accepts pre-processed requests and echoes the tokens back as the response
/// The response will include the full prompt template.
......@@ -50,7 +48,7 @@ impl AsyncEngine<SingleIn<BackendInput>, ManyOut<Annotated<LLMEngineOutput>>, Er
let output = stream! {
for tok in request.token_ids {
tokio::time::sleep(TOKEN_ECHO_DELAY).await;
tokio::time::sleep(*TOKEN_ECHO_DELAY).await;
yield delta_core(tok);
}
yield Annotated::from_data(LLMEngineOutput::stop());
......
......@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::{sync::Arc, time::Duration};
use std::sync::Arc;
use async_stream::stream;
use async_trait::async_trait;
......@@ -26,9 +26,7 @@ use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseSt
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
/// How long to sleep between echoed tokens.
/// 50ms gives us 20 tok/s.
const TOKEN_ECHO_DELAY: Duration = Duration::from_millis(50);
use super::common::TOKEN_ECHO_DELAY;
/// Engine that accepts un-preprocessed requests and echoes the prompt back as the response
/// Useful for testing ingress such as service-http.
......@@ -69,8 +67,8 @@ impl
let output = stream! {
let mut id = 1;
for c in prompt.chars() {
// we are returning characters not tokens, so speed up some
tokio::time::sleep(TOKEN_ECHO_DELAY/2).await;
// we are returning characters not tokens, so there will be some postprocessing overhead
tokio::time::sleep(*TOKEN_ECHO_DELAY).await;
let inner = deltas.create_choice(0, Some(c.to_string()), None, None);
let response = NvCreateChatCompletionStreamResponse {
inner,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment