Unverified commit a2a2753d, authored by atchernych, committed by GitHub
Browse files

fix: Remove double tokenization in EPP Integration fixes [DYN-2076] (#8093)


Signed-off-by: Anna Tchernych <atchernych@nvidia.com>
parent 4d5db80a
...@@ -564,7 +564,13 @@ impl OpenAIPreprocessor { ...@@ -564,7 +564,13 @@ impl OpenAIPreprocessor {
// Completions will use raw_prompt, no template // Completions will use raw_prompt, no template
let prompt = formatted_prompt.unwrap_or(raw_prompt); let prompt = formatted_prompt.unwrap_or(raw_prompt);
// Check if backend_instance_id is present and token_data is provided // If nvext.token_data is present, use the pre-computed tokens
// directly and skip tokenization. This avoids redundant
// tokenization when an external component (e.g. the GAIE EPP
// KV-router) has already tokenized the prompt.
// When backend_instance_id is set without token_data, warn
// but fall back to tokenization (backward compat for non-GAIE
// routers that set the header without providing tokens).
let has_backend_instance_id = request let has_backend_instance_id = request
.nvext() .nvext()
.and_then(|ext| ext.backend_instance_id) .and_then(|ext| ext.backend_instance_id)
...@@ -573,23 +579,22 @@ impl OpenAIPreprocessor { ...@@ -573,23 +579,22 @@ impl OpenAIPreprocessor {
let token_data = let token_data =
request.nvext().and_then(|ext| ext.token_data.as_ref()); request.nvext().and_then(|ext| ext.token_data.as_ref());
let (tokens_vec, skip_token_annotation) = if has_backend_instance_id { let (tokens_vec, skip_token_annotation) = if let Some(tokens) =
if let Some(tokens) = token_data { token_data
tracing::trace!( {
"Using provided tokens from EPP: {} ids", tracing::info!(
tokens.len() token_count = tokens.len(),
); first_tokens = ?&tokens[..std::cmp::min(5, tokens.len())],
// need ownership for the builder, so clone. "[SIDECAR-SKIP-TOKENIZE] Found nvext.token_data — using pre-computed tokens, SKIPPING tokenization"
(tokens.clone(), true) );
} else { (tokens.clone(), true)
tracing::warn!( } else if has_backend_instance_id {
"backend_instance_id provided but no token_data; tokenizing prompt" tracing::warn!(
); "backend_instance_id provided but no token_data; tokenizing prompt"
let encoding = self.encode_with_timing(&prompt, tracker)?; );
(encoding.token_ids().to_vec(), false) let encoding = self.encode_with_timing(&prompt, tracker)?;
} (encoding.token_ids().to_vec(), false)
} else { } else {
// No backend_instance_id provided, continue the normal flow.
let encoding = self.encode_with_timing(&prompt, tracker)?; let encoding = self.encode_with_timing(&prompt, tracker)?;
(encoding.token_ids().to_vec(), false) (encoding.token_ids().to_vec(), false)
}; };
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment