Commit 0a3f2c69 authored by Ryan McCormick's avatar Ryan McCormick Committed by GitHub
Browse files

feat: Add configurable DYN_TOKEN_ECHO_DELAY_MS for echo engine testing (#81)

parent 11e3e188
......@@ -278,3 +278,34 @@ sudo docker run --gpus all -it -v /home/user:/outside-home gitlab-master.nvidia.
```
Copy the trt-llm engine, the model's `.json` files (for the model deployment card) and the `nio` binary built for the correct glibc (container is Ubuntu 22.04 currently) into that container.
## Echo Engines
Dynamo includes two echo engines for testing and debugging purposes:
### echo_core
The `echo_core` engine accepts pre-processed requests and echoes the tokens back as the response. This is useful for testing pre-processing functionality as the response will include the full prompt template.
```
dynamo-run in=http out=echo_core --model-path <hf-repo-checkout>
```
### echo_full
The `echo_full` engine accepts un-processed requests and echoes the prompt back as the response.
```
dynamo-run in=http out=echo_full
```
### Configuration
Both echo engines use a configurable delay between tokens to simulate generation speed. You can adjust this using the `DYN_TOKEN_ECHO_DELAY_MS` environment variable:
```
# Set token echo delay to 1ms (1000 tokens per second)
DYN_TOKEN_ECHO_DELAY_MS=1 dynamo-run in=http out=echo_full
```
The default delay is 10ms, which produces approximately 100 tokens per second.
......@@ -13,5 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod common;
pub mod echo_core;
pub mod echo_full;
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::env;
use std::sync::LazyLock;
use std::time::Duration;
/// Delay inserted between successive echoed tokens.
/// Defaults to 10ms (~100 tok/s); override via the
/// `DYN_TOKEN_ECHO_DELAY_MS` environment variable (value in milliseconds).
/// An unset or unparsable variable falls back to the default.
pub static TOKEN_ECHO_DELAY: LazyLock<Duration> = LazyLock::new(|| {
    const DEFAULT_DELAY_MS: u64 = 10;
    let ms = match env::var("DYN_TOKEN_ECHO_DELAY_MS") {
        // Variable present: use it if it parses as u64, else keep the default.
        Ok(raw) => raw.parse::<u64>().unwrap_or(DEFAULT_DELAY_MS),
        // Variable absent (or not valid unicode): use the default.
        Err(_) => DEFAULT_DELAY_MS,
    };
    Duration::from_millis(ms)
});
......@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::{sync::Arc, time::Duration};
use std::sync::Arc;
use async_stream::stream;
use async_trait::async_trait;
......@@ -25,9 +25,7 @@ use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseSt
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
/// How long to sleep between echoed tokens.
/// 50ms gives us 20 tok/s.
const TOKEN_ECHO_DELAY: Duration = Duration::from_millis(50);
use super::common::TOKEN_ECHO_DELAY;
/// Engine that accepts pre-processed requests and echoes the tokens back as the response
/// The response will include the full prompt template.
......@@ -50,7 +48,7 @@ impl AsyncEngine<SingleIn<BackendInput>, ManyOut<Annotated<LLMEngineOutput>>, Er
let output = stream! {
for tok in request.token_ids {
tokio::time::sleep(TOKEN_ECHO_DELAY).await;
tokio::time::sleep(*TOKEN_ECHO_DELAY).await;
yield delta_core(tok);
}
yield Annotated::from_data(LLMEngineOutput::stop());
......
......@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::{sync::Arc, time::Duration};
use std::sync::Arc;
use async_stream::stream;
use async_trait::async_trait;
......@@ -26,9 +26,7 @@ use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseSt
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
/// How long to sleep between echoed tokens.
/// 50ms gives us 20 tok/s.
const TOKEN_ECHO_DELAY: Duration = Duration::from_millis(50);
use super::common::TOKEN_ECHO_DELAY;
/// Engine that accepts un-preprocessed requests and echoes the prompt back as the response
/// Useful for testing ingress such as service-http.
......@@ -69,8 +67,8 @@ impl
let output = stream! {
let mut id = 1;
for c in prompt.chars() {
// we are returning characters not tokens, so speed up some
tokio::time::sleep(TOKEN_ECHO_DELAY/2).await;
// we are returning characters not tokens, so there will be some postprocessing overhead
tokio::time::sleep(*TOKEN_ECHO_DELAY).await;
let inner = deltas.create_choice(0, Some(c.to_string()), None, None);
let response = NvCreateChatCompletionStreamResponse {
inner,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment