Unverified Commit 4e75b04b authored by Graham King, committed by GitHub

chore(dynamo-run): Fix echo_core for EOS tokens (#759)

"echo_core" is an engine that echoes the post-processed request back to you so you can see the template. Good for testing. It needed an extra flag set to work correctly.
parent ee865ca0
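In short, the fix attaches an `NvExt` with `ignore_eos` enabled to the echoed request so the post-processor does not stop at the template's EOS token. Below is a minimal sketch of that pattern using the types from the diff; the helper name is hypothetical, and the inner request is assumed to be async-openai's `CreateChatCompletionRequest` (the builder the diff uses).

```rust
use dynamo_llm::protocols::openai::nvext::NvExt;
use dynamo_llm::types::openai::chat_completions::NvCreateChatCompletionRequest;

// Hypothetical helper for illustration: wrap an already-built chat request
// and ask the post-processor to keep generating past EOS, so the echoed
// template comes back in full. Field names follow the diff below.
fn with_ignore_eos(
    inner: async_openai::types::CreateChatCompletionRequest, // assumed inner type
) -> NvCreateChatCompletionRequest {
    let nvext = NvExt {
        ignore_eos: Some(true),
        ..Default::default()
    };
    NvCreateChatCompletionRequest {
        inner,
        nvext: Some(nvext),
    }
}
```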
@@ -398,6 +398,13 @@ The `echo_core` engine accepts pre-processed requests and echoes the tokens back
 dynamo-run in=http out=echo_core --model-path <hf-repo-checkout>
 ```
+Note that to use it with `in=http` you need to tell the post processor to ignore stop tokens from the template by adding `nvext.ignore_eos` like this:
+```
+curl -N -d '{"nvext": {"ignore_eos": true}, "stream": true, "model": "Qwen2.5-3B-Instruct", "max_completion_tokens": 4096, "messages":[{"role":"user", "content": "Tell me a story" }]}' ...
+```
+The default `in=text` sets that for you.
 #### echo_full
 The `echo_full` engine accepts un-processed requests and echoes the prompt back as the response.
...
@@ -13,6 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+use dynamo_llm::protocols::openai::nvext::NvExt;
 use dynamo_llm::types::openai::chat_completions::{
     NvCreateChatCompletionRequest, OpenAIChatCompletionsStreamingEngine,
 };
@@ -110,6 +111,10 @@ async fn main_loop(
         .temperature(0.7)
         .n(1) // only generate one response
         .build()?;
+    let nvext = NvExt {
+        ignore_eos: Some(true),
+        ..Default::default()
+    };
     // TODO We cannot set min_tokens with async-openai
     // if inspect_template {
@@ -117,7 +122,10 @@ async fn main_loop(
     // req_builder.min_tokens(8192);
     // }
-    let req = NvCreateChatCompletionRequest { inner, nvext: None };
+    let req = NvCreateChatCompletionRequest {
+        inner,
+        nvext: Some(nvext),
+    };
     // Call the model
     let mut stream = engine.generate(Context::new(req)).await?;
...
@@ -148,7 +148,10 @@ async fn completions(
         ..request.inner
     };
-    let request = CompletionRequest { inner, nvext: None };
+    let request = CompletionRequest {
+        inner,
+        nvext: request.nvext,
+    };
     // todo - make the protocols be optional for model name
     // todo - when optional, if none, apply a default
@@ -233,9 +236,10 @@ async fn chat_completions(
         stream: Some(true),
         ..request.inner
     };
     let request = NvCreateChatCompletionRequest {
         inner: inner_request,
-        nvext: None,
+        nvext: request.nvext,
     };
     // todo - make the protocols be optional for model name
...
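The HTTP-service side of the change is the mirror image: instead of overwriting `nvext` with `None`, the handlers forward whatever the client sent, so a body carrying `"nvext": {"ignore_eos": true}` (as in the curl example above) reaches the engine. A sketch of that forwarding, under the same assumption about the inner async-openai request type and with a hypothetical helper name:

```rust
use dynamo_llm::types::openai::chat_completions::NvCreateChatCompletionRequest;

// Hypothetical helper for illustration: force streaming on the inner OpenAI
// request while preserving any client-supplied nvext (e.g. ignore_eos),
// mirroring the chat_completions hunk above.
fn force_streaming(request: NvCreateChatCompletionRequest) -> NvCreateChatCompletionRequest {
    let inner = async_openai::types::CreateChatCompletionRequest {
        stream: Some(true),
        ..request.inner
    };
    NvCreateChatCompletionRequest {
        inner,
        nvext: request.nvext, // previously hard-coded to None, which dropped the flag
    }
}
```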