Unverified Commit 76c70f41 authored by Neal Vaidya's avatar Neal Vaidya Committed by GitHub
Browse files

fix: allow having no rust tokenizer when using dyn-chat-processor vllm (#7697)

parent 75c16393
...@@ -474,8 +474,19 @@ impl ModelWatcher { ...@@ -474,8 +474,19 @@ impl ModelWatcher {
None None
}; };
// This is expensive, we are loading ~10MiB JSON, so only do it once // Loading the tokenizer is expensive (~10 MiB JSON), so only do it
let tokenizer = card.tokenizer().context("tokenizer")?; // once and only when a local pipeline actually needs it. Models
// without tokenizer.json (e.g. Qwen3-Omni) set tokenizer = None;
// they rely on a Python chat_engine_factory for tokenization.
// When a chat_engine_factory handles chat and no completions are
// needed, skip tokenizer loading entirely — even if the file exists.
let needs_rust_tokenizer =
needs_local_chat_pipeline || needs_local_completions_pipeline;
let tokenizer = if needs_rust_tokenizer && card.has_tokenizer() {
Some(card.tokenizer().context("tokenizer")?)
} else {
None
};
// Create prefill chooser once if we're building pipelines // Create prefill chooser once if we're building pipelines
// Both chat and completions will share the same prefill chooser instance // Both chat and completions will share the same prefill chooser instance
...@@ -538,6 +549,13 @@ impl ModelWatcher { ...@@ -538,6 +549,13 @@ impl ModelWatcher {
let chat_engine = if let Some(engine) = factory_engine { let chat_engine = if let Some(engine) = factory_engine {
engine engine
} else { } else {
let tk = tokenizer.clone().ok_or_else(|| {
anyhow::anyhow!(
"Model has no supported Rust tokenizer and no chat_engine_factory. \
Use --dyn-chat-processor vllm/sglang or provide a supported \
tokenizer file (tokenizer.json, tiktoken.model, or *.tiktoken)."
)
})?;
entrypoint::build_routed_pipeline::< entrypoint::build_routed_pipeline::<
NvCreateChatCompletionRequest, NvCreateChatCompletionRequest,
NvCreateChatCompletionStreamResponse, NvCreateChatCompletionStreamResponse,
...@@ -548,7 +566,7 @@ impl ModelWatcher { ...@@ -548,7 +566,7 @@ impl ModelWatcher {
self.router_config.router_mode, self.router_config.router_mode,
worker_monitor.clone(), worker_monitor.clone(),
kv_chooser.clone(), kv_chooser.clone(),
tokenizer.clone(), tk,
prefill_chooser.clone(), prefill_chooser.clone(),
self.router_config.enforce_disagg, self.router_config.enforce_disagg,
self.migration_limit, self.migration_limit,
...@@ -561,34 +579,54 @@ impl ModelWatcher { ...@@ -561,34 +579,54 @@ impl ModelWatcher {
tracing::info!("Chat completions is ready"); tracing::info!("Chat completions is ready");
} }
// Add completions engine only if the model supports completions. // Add completions engine only if the model supports completions
// and we have a tokenizer (completions always uses the Rust preprocessor).
if card.model_type.supports_completions() { if card.model_type.supports_completions() {
let formatter = PromptFormatter::no_op(); if let Some(tk) = tokenizer {
let PromptFormatter::OAI(formatter) = formatter; let formatter = PromptFormatter::no_op();
let preprocessor = let PromptFormatter::OAI(formatter) = formatter;
OpenAIPreprocessor::new_with_parts(card.clone(), formatter, tokenizer.clone()) let preprocessor =
.context("OpenAIPreprocessor::new_with_parts")?; OpenAIPreprocessor::new_with_parts(card.clone(), formatter, tk.clone())
let completions_engine = entrypoint::build_routed_pipeline_with_preprocessor::< .context("OpenAIPreprocessor::new_with_parts")?;
NvCreateCompletionRequest, let completions_engine = entrypoint::build_routed_pipeline_with_preprocessor::<
NvCreateCompletionResponse, NvCreateCompletionRequest,
>( NvCreateCompletionResponse,
card, >(
&client, card,
self.manager.clone(), &client,
self.router_config.router_mode, self.manager.clone(),
worker_monitor, self.router_config.router_mode,
kv_chooser, worker_monitor,
preprocessor, kv_chooser,
tokenizer, preprocessor,
prefill_chooser, tk,
self.router_config.enforce_disagg, prefill_chooser,
self.migration_limit, self.router_config.enforce_disagg,
self.metrics.clone(), self.migration_limit,
) self.metrics.clone(),
.await )
.context("build_routed_pipeline_with_preprocessor")?; .await
worker_set.completions_engine = Some(completions_engine); .context("build_routed_pipeline_with_preprocessor")?;
tracing::info!("Completions is ready"); worker_set.completions_engine = Some(completions_engine);
tracing::info!("Completions is ready");
} else {
tracing::warn!(
"Skipping completions engine: no Rust tokenizer available for this model"
);
}
}
// Verify we built at least one serving engine. A Tokens model that
// ends up with no chat AND no completions engine (e.g. completions-only
// model with no tokenizer) should fail fast rather than register an
// empty WorkerSet that can't serve any requests.
if !worker_set.has_decode_engine() {
anyhow::bail!(
"Model '{}' requires frontend tokenization/preprocessing (ModelInput::Tokens) \
but no serving engine could be built. Provide a working tokenizer config or \
perform tokenization in the backend (ModelInput::Text).",
card.name()
);
} }
} else if card.model_input == ModelInput::Text && card.model_type.supports_embedding() { } else if card.model_input == ModelInput::Text && card.model_type.supports_embedding() {
// Case: Text + Embeddings // Case: Text + Embeddings
......
...@@ -452,7 +452,12 @@ impl ModelDeploymentCard { ...@@ -452,7 +452,12 @@ impl ModelDeploymentCard {
} }
None => { None => {
anyhow::bail!( anyhow::bail!(
"Blank ModelDeploymentCard does not have a tokenizer. Is this a mistral model? If so, the `--use-<framework>-tokenizer` flag in the engine command is required." "ModelDeploymentCard for '{}' does not have a tokenizer. \
Provide a supported tokenizer file (tokenizer.json, tiktoken.model, \
or *.tiktoken), use --use-<framework>-tokenizer to delegate \
tokenization to the backend, or use a non-Rust chat processor \
(e.g. --dyn-chat-processor vllm).",
self.display_name
); );
} }
} }
...@@ -676,7 +681,7 @@ impl ModelDeploymentCard { ...@@ -676,7 +681,7 @@ impl ModelDeploymentCard {
let (model_info, tokenizer, gen_config, prompt_formatter) = if !is_mistral_model { let (model_info, tokenizer, gen_config, prompt_formatter) = if !is_mistral_model {
( (
Some(ModelInfoType::from_disk(local_path)?), Some(ModelInfoType::from_disk(local_path)?),
Some(TokenizerKind::from_disk(local_path)?), TokenizerKind::from_disk(local_path)?,
GenerationConfig::from_disk(local_path).ok(), GenerationConfig::from_disk(local_path).ok(),
PromptFormatterArtifact::from_disk(local_path)?, PromptFormatterArtifact::from_disk(local_path)?,
) )
...@@ -1007,29 +1012,45 @@ impl PromptFormatterArtifact { ...@@ -1007,29 +1012,45 @@ impl PromptFormatterArtifact {
} }
impl TokenizerKind { impl TokenizerKind {
pub fn from_disk(directory: &Path) -> Result<Self> { /// Try to discover a tokenizer in the given directory.
///
/// Returns `Ok(Some(..))` when a supported tokenizer is found,
/// `Ok(None)` when no tokenizer files are present (e.g. models that
/// ship only `vocab.json` + `merges.txt`), and `Err` for ambiguous
/// layouts or filesystem failures that should be treated as hard errors.
pub fn from_disk(directory: &Path) -> Result<Option<Self>> {
// Helper: probe a single well-known file. Returns Ok(None) when the
// file simply does not exist, Ok(Some(..)) on success, and Err for
// anything else (unreadable file, checksum failure, etc.).
fn probe(path: std::path::PathBuf) -> Result<Option<CheckedFile>> {
if !path.exists() {
return Ok(None);
}
Ok(Some(CheckedFile::from_disk(path)?))
}
// 1. Try tokenizer.json (HuggingFace) // 1. Try tokenizer.json (HuggingFace)
if let Ok(f) = CheckedFile::from_disk(directory.join("tokenizer.json")) { if let Some(f) = probe(directory.join("tokenizer.json"))? {
return Ok(Self::HfTokenizerJson(f)); return Ok(Some(Self::HfTokenizerJson(f)));
} }
// 2. Try tiktoken.model // 2. Try tiktoken.model
if let Ok(f) = CheckedFile::from_disk(directory.join("tiktoken.model")) { if let Some(f) = probe(directory.join("tiktoken.model"))? {
return Ok(Self::TikTokenModel(f)); return Ok(Some(Self::TikTokenModel(f)));
} }
// 3. Search for any *.tiktoken file // 3. Search for any *.tiktoken file
let tiktoken_files: Vec<_> = std::fs::read_dir(directory) let tiktoken_files: Vec<_> = std::fs::read_dir(directory)
.with_context(|| format!("Failed to read directory {}", directory.display()))?
.collect::<std::io::Result<Vec<_>>>()
.with_context(|| format!("Failed to iterate directory {}", directory.display()))?
.into_iter() .into_iter()
.flatten()
.flatten()
.filter(|entry| entry.path().extension().is_some_and(|e| e == "tiktoken")) .filter(|entry| entry.path().extension().is_some_and(|e| e == "tiktoken"))
.collect(); .collect();
if tiktoken_files.len() == 1 { if tiktoken_files.len() == 1 {
if let Ok(f) = CheckedFile::from_disk(tiktoken_files[0].path()) { let f = CheckedFile::from_disk(tiktoken_files[0].path())?;
return Ok(Self::TikTokenModel(f)); return Ok(Some(Self::TikTokenModel(f)));
}
} else if tiktoken_files.len() > 1 { } else if tiktoken_files.len() > 1 {
let names: Vec<_> = tiktoken_files let names: Vec<_> = tiktoken_files
.iter() .iter()
...@@ -1042,10 +1063,13 @@ impl TokenizerKind { ...@@ -1042,10 +1063,13 @@ impl TokenizerKind {
); );
} }
anyhow::bail!( tracing::warn!(
"No tokenizer.json or tiktoken model file found in {}", "No supported tokenizer found in {} \
(expected tokenizer.json or a tiktoken file). \
Features that depend on the Rust tokenizer will not be available.",
directory.display() directory.display()
) );
Ok(None)
} }
} }
......
{
"architectures": ["Qwen3OmniForConditionalGeneration"],
"bos_token_id": 151643,
"eos_token_id": 151645,
"max_position_embeddings": 32768,
"model_type": "qwen3_omni",
"vocab_size": 151936
}
{
"bos_token": "<|endoftext|>",
"eos_token": "<|im_end|>",
"model_max_length": 32768,
"tokenizer_class": "Qwen2Tokenizer",
"chat_template": "{% for message in messages %}{{ message.content }}{% endfor %}"
}
...@@ -54,3 +54,19 @@ async fn test_missing_required_files() { ...@@ -54,3 +54,19 @@ async fn test_missing_required_files() {
// Should fail because config.json is missing // Should fail because config.json is missing
assert!(err.contains("unable to extract")); assert!(err.contains("unable to extract"));
} }
/// A model directory that ships no tokenizer.json (e.g. Qwen3-Omni, which
/// provides vocab.json + merges.txt instead) must still load from disk, with
/// the card's tokenizer left unset. Serving such a model requires a non-Rust
/// chat processor on the frontend (e.g. --dyn-chat-processor vllm).
#[tokio::test]
async fn test_model_loads_without_tokenizer_json() {
    let card = ModelDeploymentCard::load_from_disk(
        "tests/data/sample-models/mock-no-tokenizer-json",
        None,
    )
    .unwrap();

    // The raw field and the convenience accessor must agree that no
    // tokenizer was discovered.
    assert!(
        card.tokenizer.is_none(),
        "Expected tokenizer to be None for model without tokenizer.json"
    );
    assert!(!card.has_tokenizer(), "has_tokenizer() should be false");

    // Everything else about the model (here: model info) is still populated.
    assert!(card.model_info.is_some());
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment