Unverified Commit 4e75b04b authored by Graham King, committed by GitHub

chore(dynamo-run): Fix echo_core for EOS tokens (#759)

"echo_core" is an engine that echoes the post-processed request back to you so you can see the template. Good for testing. It needed an extra flag set to work correctly.
parent ee865ca0
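In short, the fix attaches an `NvExt` with `ignore_eos` enabled to the echoed request so the post-processor does not stop at the template's EOS token. Below is a minimal sketch of that pattern using the types from the diff; the helper name is hypothetical, and the inner request is assumed to be async-openai's `CreateChatCompletionRequest` (the builder the diff uses).

```rust
use dynamo_llm::protocols::openai::nvext::NvExt;
use dynamo_llm::types::openai::chat_completions::NvCreateChatCompletionRequest;

// Hypothetical helper for illustration: wrap an already-built chat request
// and ask the post-processor to keep generating past EOS, so the echoed
// template comes back in full. Field names follow the diff below.
fn with_ignore_eos(
    inner: async_openai::types::CreateChatCompletionRequest, // assumed inner type
) -> NvCreateChatCompletionRequest {
    let nvext = NvExt {
        ignore_eos: Some(true),
        ..Default::default()
    };
    NvCreateChatCompletionRequest {
        inner,
        nvext: Some(nvext),
    }
}
```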
@@ -398,6 +398,13 @@ The `echo_core` engine accepts pre-processed requests and echoes the tokens back
 dynamo-run in=http out=echo_core --model-path <hf-repo-checkout>
 ```
+Note that to use it with `in=http` you need to tell the post processor to ignore stop tokens from the template by adding `nvext.ignore_eos` like this:
+```
+curl -N -d '{"nvext": {"ignore_eos": true}, "stream": true, "model": "Qwen2.5-3B-Instruct", "max_completion_tokens": 4096, "messages":[{"role":"user", "content": "Tell me a story" }]}' ...
+```
+The default `in=text` sets that for you.
 #### echo_full
 The `echo_full` engine accepts un-processed requests and echoes the prompt back as the response.
...
@@ -13,6 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+use dynamo_llm::protocols::openai::nvext::NvExt;
 use dynamo_llm::types::openai::chat_completions::{
     NvCreateChatCompletionRequest, OpenAIChatCompletionsStreamingEngine,
 };
@@ -110,6 +111,10 @@ async fn main_loop(
         .temperature(0.7)
         .n(1) // only generate one response
         .build()?;
+    let nvext = NvExt {
+        ignore_eos: Some(true),
+        ..Default::default()
+    };
     // TODO We cannot set min_tokens with async-openai
     // if inspect_template {
@@ -117,7 +122,10 @@ async fn main_loop(
     // req_builder.min_tokens(8192);
     // }
-    let req = NvCreateChatCompletionRequest { inner, nvext: None };
+    let req = NvCreateChatCompletionRequest {
+        inner,
+        nvext: Some(nvext),
+    };
     // Call the model
     let mut stream = engine.generate(Context::new(req)).await?;
...
@@ -148,7 +148,10 @@ async fn completions(
         ..request.inner
     };
-    let request = CompletionRequest { inner, nvext: None };
+    let request = CompletionRequest {
+        inner,
+        nvext: request.nvext,
+    };
     // todo - make the protocols be optional for model name
     // todo - when optional, if none, apply a default
@@ -233,9 +236,10 @@ async fn chat_completions(
         stream: Some(true),
         ..request.inner
     };
     let request = NvCreateChatCompletionRequest {
         inner: inner_request,
-        nvext: None,
+        nvext: request.nvext,
     };
     // todo - make the protocols be optional for model name
...
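The HTTP-service side of the change is the mirror image: instead of overwriting `nvext` with `None`, the handlers forward whatever the client sent, so a body carrying `"nvext": {"ignore_eos": true}` (as in the curl example above) reaches the engine. A sketch of that forwarding, under the same assumption about the inner async-openai request type and with a hypothetical helper name:

```rust
use dynamo_llm::types::openai::chat_completions::NvCreateChatCompletionRequest;

// Hypothetical helper for illustration: force streaming on the inner OpenAI
// request while preserving any client-supplied nvext (e.g. ignore_eos),
// mirroring the chat_completions hunk above.
fn force_streaming(request: NvCreateChatCompletionRequest) -> NvCreateChatCompletionRequest {
    let inner = async_openai::types::CreateChatCompletionRequest {
        stream: Some(true),
        ..request.inner
    };
    NvCreateChatCompletionRequest {
        inner,
        nvext: request.nvext, // previously hard-coded to None, which dropped the flag
    }
}
```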