fix: allow having no rust tokenizer when using dyn-chat-processor vllm (#7697)

76c70f41 · Neal Vaidya · GitHub · 75c16393 · 76c70f41 · 76c70f41
Unverified Commit 76c70f41 authored Mar 31, 2026 by Neal Vaidya Committed by GitHub Mar 31, 2026
5 changed files
--- a/lib/llm/src/discovery/watcher.rs
+++ b/lib/llm/src/discovery/watcher.rs
@@ -474,8 +474,19 @@ impl ModelWatcher {
                None
            };
-            // This is expensive, we are loading ~10MiB JSON, so only do it once
+            // Loading the tokenizer is expensive (~10 MiB JSON), so only do it
-            let tokenizer = card.tokenizer().context("tokenizer")?;
+            // once and only when a local pipeline actually needs it.  Models with
+            // no supported tokenizer file (e.g. Qwen3-Omni) have tokenizer = None;
+            // they rely on a Python chat_engine_factory for tokenization.
+            // When a chat_engine_factory handles chat and no completions are
+            // needed, skip tokenizer loading entirely — even if the file exists.
+            let needs_rust_tokenizer =
+                needs_local_chat_pipeline || needs_local_completions_pipeline;
+            let tokenizer = if needs_rust_tokenizer && card.has_tokenizer() {
+                Some(card.tokenizer().context("tokenizer")?)
+            } else {
+                None
+            };
            // Create prefill chooser once if we're building pipelines
            // Both chat and completions will share the same prefill chooser instance
@@ -538,6 +549,13 @@ impl ModelWatcher {
                let chat_engine = if let Some(engine) = factory_engine {
                    engine
                } else {
+                    let tk = tokenizer.clone().ok_or_else(|| {
+                        anyhow::anyhow!(
+                            "Model has no supported Rust tokenizer and no chat_engine_factory. \
+                             Use --dyn-chat-processor vllm/sglang or provide a supported \
+                             tokenizer file (tokenizer.json, tiktoken.model, or *.tiktoken)."
+                        )
+                    })?;
                    entrypoint::build_routed_pipeline::<
                        NvCreateChatCompletionRequest,
                        NvCreateChatCompletionStreamResponse,
@@ -548,7 +566,7 @@ impl ModelWatcher {
                        self.router_config.router_mode,
                        worker_monitor.clone(),
                        kv_chooser.clone(),
-                        tokenizer.clone(),
+                        tk,
                        prefill_chooser.clone(),
                        self.router_config.enforce_disagg,
                        self.migration_limit,
@@ -561,34 +579,54 @@ impl ModelWatcher {
                tracing::info!("Chat completions is ready");
            }
-            // Add completions engine only if the model supports completions.
+            // Add completions engine only if the model supports completions
+            // and we have a tokenizer (completions always uses the Rust preprocessor).
            if card.model_type.supports_completions() {
-                let formatter = PromptFormatter::no_op();
+                if let Some(tk) = tokenizer {
-                let PromptFormatter::OAI(formatter) = formatter;
+                    let formatter = PromptFormatter::no_op();
-                let preprocessor =
+                    let PromptFormatter::OAI(formatter) = formatter;
-                    OpenAIPreprocessor::new_with_parts(card.clone(), formatter, tokenizer.clone())
+                    let preprocessor =
-                        .context("OpenAIPreprocessor::new_with_parts")?;
+                        OpenAIPreprocessor::new_with_parts(card.clone(), formatter, tk.clone())
-                let completions_engine = entrypoint::build_routed_pipeline_with_preprocessor::<
+                            .context("OpenAIPreprocessor::new_with_parts")?;
-                    NvCreateCompletionRequest,
+                    let completions_engine = entrypoint::build_routed_pipeline_with_preprocessor::<
-                    NvCreateCompletionResponse,
+                        NvCreateCompletionRequest,
-                >(
+                        NvCreateCompletionResponse,
-                    card,
+                    >(
-                    &client,
+                        card,
-                    self.manager.clone(),
+                        &client,
-                    self.router_config.router_mode,
+                        self.manager.clone(),
-                    worker_monitor,
+                        self.router_config.router_mode,
-                    kv_chooser,
+                        worker_monitor,
-                    preprocessor,
+                        kv_chooser,
-                    tokenizer,
+                        preprocessor,
-                    prefill_chooser,
+                        tk,
-                    self.router_config.enforce_disagg,
+                        prefill_chooser,
-                    self.migration_limit,
+                        self.router_config.enforce_disagg,
-                    self.metrics.clone(),
+                        self.migration_limit,
-                )
+                        self.metrics.clone(),
-                .await
+                    )
-                .context("build_routed_pipeline_with_preprocessor")?;
+                    .await
-                worker_set.completions_engine = Some(completions_engine);
+                    .context("build_routed_pipeline_with_preprocessor")?;
-                tracing::info!("Completions is ready");
+                    worker_set.completions_engine = Some(completions_engine);
+                    tracing::info!("Completions is ready");
+                } else {
+                    tracing::warn!(
+                        "Skipping completions engine: no Rust tokenizer available for this model"
+                    );
+                }
+            }
+            // Verify we built at least one serving engine. A Tokens model that
+            // ends up with no chat AND no completions engine (e.g. completions-only
+            // model with no tokenizer) should fail fast rather than register an
+            // empty WorkerSet that can't serve any requests.
+            if !worker_set.has_decode_engine() {
+                anyhow::bail!(
+                    "Model '{}' requires frontend tokenization/preprocessing (ModelInput::Tokens) \
+                     but no serving engine could be built. Provide a working tokenizer config or \
+                     perform tokenization in the backend (ModelInput::Text).",
+                    card.name()
+                );
            }
        } else if card.model_input == ModelInput::Text && card.model_type.supports_embedding() {
            // Case: Text + Embeddings

--- a/lib/llm/src/model_card.rs
+++ b/lib/llm/src/model_card.rs
@@ -452,7 +452,12 @@ impl ModelDeploymentCard {
            }
            None => {
                anyhow::bail!(
-                    "Blank ModelDeploymentCard does not have a tokenizer. Is this a mistral model? If so, the `--use-<framework>-tokenizer` flag in the engine command is required."
+                    "ModelDeploymentCard for '{}' does not have a tokenizer. \
+                     Provide a supported tokenizer file (tokenizer.json, tiktoken.model, \
+                     or *.tiktoken), use --use-<framework>-tokenizer to delegate \
+                     tokenization to the backend, or use a non-Rust chat processor \
+                     (e.g. --dyn-chat-processor vllm).",
+                    self.display_name
                );
            }
        }
@@ -676,7 +681,7 @@ impl ModelDeploymentCard {
        let (model_info, tokenizer, gen_config, prompt_formatter) = if !is_mistral_model {
            (
                Some(ModelInfoType::from_disk(local_path)?),
-                Some(TokenizerKind::from_disk(local_path)?),
+                TokenizerKind::from_disk(local_path)?,
                GenerationConfig::from_disk(local_path).ok(),
                PromptFormatterArtifact::from_disk(local_path)?,
            )
@@ -1007,29 +1012,45 @@ impl PromptFormatterArtifact {
 }
 impl TokenizerKind {
-    pub fn from_disk(directory: &Path) -> Result<Self> {
+    /// Try to discover a tokenizer in the given directory.
+    ///
+    /// Returns `Ok(Some(..))` when a supported tokenizer is found,
+    /// `Ok(None)` when no tokenizer files are present (e.g. models that
+    /// ship only `vocab.json` + `merges.txt`), and `Err` for ambiguous
+    /// layouts or filesystem failures that should be treated as hard errors.
+    pub fn from_disk(directory: &Path) -> Result<Option<Self>> {
+        // Helper: probe a single well-known file.  Returns Ok(None) when
+        // `Path::exists()` is false (file missing or its metadata inaccessible),
+        // Ok(Some(..)) on success, and Err otherwise (unreadable file, checksum failure, etc.).
+        fn probe(path: std::path::PathBuf) -> Result<Option<CheckedFile>> {
+            if !path.exists() {
+                return Ok(None);
+            }
+            Ok(Some(CheckedFile::from_disk(path)?))
+        }
        // 1. Try tokenizer.json (HuggingFace)
-        if let Ok(f) = CheckedFile::from_disk(directory.join("tokenizer.json")) {
+        if let Some(f) = probe(directory.join("tokenizer.json"))? {
-            return Ok(Self::HfTokenizerJson(f));
+            return Ok(Some(Self::HfTokenizerJson(f)));
        }
        // 2. Try tiktoken.model
-        if let Ok(f) = CheckedFile::from_disk(directory.join("tiktoken.model")) {
+        if let Some(f) = probe(directory.join("tiktoken.model"))? {
-            return Ok(Self::TikTokenModel(f));
+            return Ok(Some(Self::TikTokenModel(f)));
        }
        // 3. Search for any *.tiktoken file
        let tiktoken_files: Vec<_> = std::fs::read_dir(directory)
+            .with_context(|| format!("Failed to read directory {}", directory.display()))?
+            .collect::<std::io::Result<Vec<_>>>()
+            .with_context(|| format!("Failed to iterate directory {}", directory.display()))?
            .into_iter()
-            .flatten()
-            .flatten()
            .filter(|entry| entry.path().extension().is_some_and(|e| e == "tiktoken"))
            .collect();
        if tiktoken_files.len() == 1 {
-            if let Ok(f) = CheckedFile::from_disk(tiktoken_files[0].path()) {
+            let f = CheckedFile::from_disk(tiktoken_files[0].path())?;
-                return Ok(Self::TikTokenModel(f));
+            return Ok(Some(Self::TikTokenModel(f)));
-            }
        } else if tiktoken_files.len() > 1 {
            let names: Vec<_> = tiktoken_files
                .iter()
@@ -1042,10 +1063,13 @@ impl TokenizerKind {
            );
        }
-        anyhow::bail!(
+        tracing::warn!(
-            "No tokenizer.json or tiktoken model file found in {}",
+            "No supported tokenizer found in {} \
+             (expected tokenizer.json or a tiktoken file). \
+             Features that depend on the Rust tokenizer will not be available.",
            directory.display()
-        )
+        );
+        Ok(None)
    }
 }

--- a/lib/llm/tests/data/sample-models/mock-no-tokenizer-json/config.json
+++ b/lib/llm/tests/data/sample-models/mock-no-tokenizer-json/config.json
+{
+  "architectures": ["Qwen3OmniForConditionalGeneration"],
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "max_position_embeddings": 32768,
+  "model_type": "qwen3_omni",
+  "vocab_size": 151936
+}
--- a/lib/llm/tests/data/sample-models/mock-no-tokenizer-json/tokenizer_config.json
+++ b/lib/llm/tests/data/sample-models/mock-no-tokenizer-json/tokenizer_config.json
+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|im_end|>",
+  "model_max_length": 32768,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "chat_template": "{% for message in messages %}{{ message.content }}{% endfor %}"
+}
--- a/lib/llm/tests/model_card.rs
+++ b/lib/llm/tests/model_card.rs
@@ -54,3 +54,19 @@ async fn test_missing_required_files() {
    // Should fail because config.json is missing
    assert!(err.contains("unable to extract"));
 }
+/// Models without tokenizer.json (e.g. Qwen3-Omni which ships vocab.json + merges.txt)
+/// should load successfully with tokenizer set to None. The frontend must use a
+/// non-Rust chat processor for these models (e.g. --dyn-chat-processor vllm).
+#[tokio::test]
+async fn test_model_loads_without_tokenizer_json() {
+    let path = "tests/data/sample-models/mock-no-tokenizer-json";
+    let mdc = ModelDeploymentCard::load_from_disk(path, None).unwrap();
+    assert!(
+        mdc.tokenizer.is_none(),
+        "Expected tokenizer to be None for model without tokenizer.json"
+    );
+    assert!(!mdc.has_tokenizer(), "has_tokenizer() should be false");
+    // Model info should still be loaded
+    assert!(mdc.model_info.is_some());
+}