fix: Remove double tokenization in EPP Integration fixes [DYN-2076] (#8093)

Signed-off-by: Anna Tchernych <atchernych@nvidia.com>

fix: Remove double tokenization in EPP Integration fixes [DYN-2076] (#8093)
Signed-off-by: Anna Tchernych <atchernych@nvidia.com>
a2a2753d · atchernych · GitHub · 4d5db80a · a2a2753d
Unverified · Commit a2a2753d, authored Apr 22, 2026 by atchernych; committed by GitHub on Apr 22, 2026
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 22 additions and 17 deletions

lib/llm/src/preprocessor.rs lib/llm/src/preprocessor.rs +22 -17

No files found.
--- a/lib/llm/src/preprocessor.rs
+++ b/lib/llm/src/preprocessor.rs
@@ -564,7 +564,13 @@ impl OpenAIPreprocessor {
                            // Completions will use raw_prompt, no template
                            let prompt = formatted_prompt.unwrap_or(raw_prompt);
-                            // Check if backend_instance_id is present and token_data is provided
+                            // If nvext.token_data is present, use the pre-computed tokens
+                            // directly and skip tokenization.  This avoids redundant
+                            // tokenization when an external component (e.g. the GAIE EPP
+                            // KV-router) has already tokenized the prompt.
+                            // When backend_instance_id is set without token_data, warn
+                            // but fall back to tokenization (backward compat for non-GAIE
+                            // routers that set the header without providing tokens).
                            let has_backend_instance_id = request
                                .nvext()
                                .and_then(|ext| ext.backend_instance_id)
@@ -573,23 +579,22 @@ impl OpenAIPreprocessor {
                            let token_data =
                                request.nvext().and_then(|ext| ext.token_data.as_ref());
-                            let (tokens_vec, skip_token_annotation) = if has_backend_instance_id {
+                            let (tokens_vec, skip_token_annotation) = if let Some(tokens) =
-                                if let Some(tokens) = token_data {
+                                token_data
-                                    tracing::trace!(
+                            {
-                                        "Using provided tokens from EPP: {} ids",
+                                tracing::info!(
-                                        tokens.len()
+                                    token_count = tokens.len(),
-                                    );
+                                    first_tokens = ?&tokens[..std::cmp::min(5, tokens.len())],
-                                    // need ownership for the builder, so clone.
+                                    "[SIDECAR-SKIP-TOKENIZE] Found nvext.token_data — using pre-computed tokens, SKIPPING tokenization"
-                                    (tokens.clone(), true)
+                                );
-                                } else {
+                                (tokens.clone(), true)
-                                    tracing::warn!(
+                            } else if has_backend_instance_id {
-                                        "backend_instance_id provided but no token_data; tokenizing prompt"
+                                tracing::warn!(
-                                    );
+                                    "backend_instance_id provided but no token_data; tokenizing prompt"
-                                    let encoding = self.encode_with_timing(&prompt, tracker)?;
+                                );
-                                    (encoding.token_ids().to_vec(), false)
+                                let encoding = self.encode_with_timing(&prompt, tracker)?;
-                                }
+                                (encoding.token_ids().to_vec(), false)
                            } else {
-                                // No backend_instance_id provided, continue the normal flow.
                                let encoding = self.encode_with_timing(&prompt, tracker)?;
                                (encoding.token_ids().to_vec(), false)
                            };