"vscode:/vscode.git/clone" did not exist on "7567adfc454845f9901f9f824c3cbf7edb4a0b29"
Unverified commit 24247b41, authored by Simo Lin and committed by GitHub

[router] add tokenizer metrics (#9307)


Co-authored-by: Chang Su <chang.s.su@oracle.com>
parent 4c0bb411
@@ -148,6 +148,94 @@ pub fn init_metrics() {
"sgl_router_running_requests", "sgl_router_running_requests",
"Number of running requests per worker" "Number of running requests per worker"
); );
// Tokenizer metrics
describe_histogram!(
"sgl_tokenizer_encode_duration_seconds",
"Time to encode text to tokens"
);
describe_histogram!(
"sgl_tokenizer_decode_duration_seconds",
"Time to decode tokens to text"
);
describe_histogram!(
"sgl_tokenizer_encode_batch_duration_seconds",
"Time to encode a batch of texts"
);
describe_counter!(
"sgl_tokenizer_encode_requests_total",
"Total number of encode requests by tokenizer type"
);
describe_counter!(
"sgl_tokenizer_decode_requests_total",
"Total number of decode requests by tokenizer type"
);
describe_counter!(
"sgl_tokenizer_encode_errors_total",
"Total number of encode errors by error type"
);
describe_counter!(
"sgl_tokenizer_decode_errors_total",
"Total number of decode errors by error type"
);
describe_histogram!(
"sgl_tokenizer_tokens_per_encode",
"Number of tokens produced per encode operation"
);
describe_histogram!(
"sgl_tokenizer_chars_per_encode",
"Number of characters in input text per encode"
);
describe_histogram!(
"sgl_tokenizer_tokens_per_decode",
"Number of tokens decoded per operation"
);
describe_gauge!(
"sgl_tokenizer_vocab_size",
"Vocabulary size of the loaded tokenizer"
);
// Stop sequence detection metrics
describe_counter!(
"sgl_tokenizer_stop_sequences_detected_total",
"Total stop sequences detected by type"
);
describe_counter!(
"sgl_tokenizer_partial_matches_total",
"Total partial stop sequence matches (jailed text)"
);
describe_histogram!(
"sgl_tokenizer_stop_detection_duration_seconds",
"Time to check for stop sequences per token"
);
// Streaming decode metrics
describe_counter!(
"sgl_tokenizer_stream_tokens_total",
"Total tokens processed in streaming decode"
);
describe_counter!(
"sgl_tokenizer_stream_incomplete_utf8_total",
"Total incomplete UTF-8 sequences detected"
);
describe_histogram!(
"sgl_tokenizer_stream_step_duration_seconds",
"Time per streaming decode step"
);
// Factory metrics
describe_counter!(
"sgl_tokenizer_factory_loads_total",
"Total tokenizer loads by file type"
);
describe_counter!(
"sgl_tokenizer_factory_errors_total",
"Total tokenizer loading errors by type"
);
describe_histogram!(
"sgl_tokenizer_factory_load_duration_seconds",
"Time to load and initialize tokenizer"
);
}
pub fn start_prometheus(config: PrometheusConfig) {
@@ -177,6 +265,8 @@ pub fn start_prometheus(config: PrometheusConfig) {
pub struct RouterMetrics;
pub struct TokenizerMetrics;
impl RouterMetrics {
// Request metrics
pub fn record_request(route: &str) {
@@ -384,6 +474,122 @@ impl RouterMetrics {
}
}
impl TokenizerMetrics {
// Encoding metrics
pub fn record_encode_request(tokenizer_type: &str) {
counter!("sgl_tokenizer_encode_requests_total",
"tokenizer_type" => tokenizer_type.to_string()
)
.increment(1);
}
pub fn record_encode_duration(duration: Duration) {
histogram!("sgl_tokenizer_encode_duration_seconds").record(duration.as_secs_f64());
}
pub fn record_encode_error(error_type: &str) {
counter!("sgl_tokenizer_encode_errors_total",
"error_type" => error_type.to_string()
)
.increment(1);
}
pub fn record_tokens_per_encode(token_count: usize) {
histogram!("sgl_tokenizer_tokens_per_encode").record(token_count as f64);
}
pub fn record_chars_per_encode(char_count: usize) {
histogram!("sgl_tokenizer_chars_per_encode").record(char_count as f64);
}
// Decoding metrics
pub fn record_decode_request(tokenizer_type: &str) {
counter!("sgl_tokenizer_decode_requests_total",
"tokenizer_type" => tokenizer_type.to_string()
)
.increment(1);
}
pub fn record_decode_duration(duration: Duration) {
histogram!("sgl_tokenizer_decode_duration_seconds").record(duration.as_secs_f64());
}
pub fn record_decode_error(error_type: &str) {
counter!("sgl_tokenizer_decode_errors_total",
"error_type" => error_type.to_string()
)
.increment(1);
}
pub fn record_tokens_per_decode(token_count: usize) {
histogram!("sgl_tokenizer_tokens_per_decode").record(token_count as f64);
}
// Batch encoding metrics
pub fn record_encode_batch_duration(duration: Duration, batch_size: usize) {
histogram!("sgl_tokenizer_encode_batch_duration_seconds",
"batch_size" => batch_size.to_string()
)
.record(duration.as_secs_f64());
}
// Stop sequence detection metrics
pub fn record_stop_sequence_detected(stop_type: &str) {
counter!("sgl_tokenizer_stop_sequences_detected_total",
"type" => stop_type.to_string()
)
.increment(1);
}
pub fn record_partial_match() {
counter!("sgl_tokenizer_partial_matches_total").increment(1);
}
pub fn record_stop_detection_duration(duration: Duration) {
histogram!("sgl_tokenizer_stop_detection_duration_seconds").record(duration.as_secs_f64());
}
// Streaming decode metrics
pub fn record_stream_token() {
counter!("sgl_tokenizer_stream_tokens_total").increment(1);
}
pub fn record_incomplete_utf8() {
counter!("sgl_tokenizer_stream_incomplete_utf8_total").increment(1);
}
pub fn record_stream_step_duration(duration: Duration) {
histogram!("sgl_tokenizer_stream_step_duration_seconds").record(duration.as_secs_f64());
}
// Factory metrics
pub fn record_factory_load(file_type: &str) {
counter!("sgl_tokenizer_factory_loads_total",
"file_type" => file_type.to_string()
)
.increment(1);
}
pub fn record_factory_error(error_type: &str) {
counter!("sgl_tokenizer_factory_errors_total",
"error_type" => error_type.to_string()
)
.increment(1);
}
pub fn record_factory_load_duration(duration: Duration) {
histogram!("sgl_tokenizer_factory_load_duration_seconds").record(duration.as_secs_f64());
}
// Vocabulary metrics
pub fn set_vocab_size(tokenizer_type: &str, size: usize) {
gauge!("sgl_tokenizer_vocab_size",
"tokenizer_type" => tokenizer_type.to_string()
)
.set(size as f64);
}
}
#[cfg(test)]
mod tests {
use super::*;
@@ -646,6 +852,46 @@ mod tests {
RouterMetrics::set_running_requests("http://worker1", 15);
}
#[test]
fn test_tokenizer_metrics_static_methods() {
// Test that all tokenizer metric methods can be called without panic
// Encoding metrics
TokenizerMetrics::record_encode_request("huggingface");
TokenizerMetrics::record_encode_duration(Duration::from_millis(10));
TokenizerMetrics::record_encode_error("invalid_input");
TokenizerMetrics::record_tokens_per_encode(100);
TokenizerMetrics::record_chars_per_encode(500);
// Decoding metrics
TokenizerMetrics::record_decode_request("huggingface");
TokenizerMetrics::record_decode_duration(Duration::from_millis(5));
TokenizerMetrics::record_decode_error("invalid_tokens");
TokenizerMetrics::record_tokens_per_decode(50);
// Batch encoding
TokenizerMetrics::record_encode_batch_duration(Duration::from_millis(100), 10);
// Stop sequence detection
TokenizerMetrics::record_stop_sequence_detected("token");
TokenizerMetrics::record_stop_sequence_detected("string");
TokenizerMetrics::record_partial_match();
TokenizerMetrics::record_stop_detection_duration(Duration::from_micros(100));
// Streaming decode
TokenizerMetrics::record_stream_token();
TokenizerMetrics::record_incomplete_utf8();
TokenizerMetrics::record_stream_step_duration(Duration::from_micros(50));
// Factory metrics
TokenizerMetrics::record_factory_load("json");
TokenizerMetrics::record_factory_error("unsupported_format");
TokenizerMetrics::record_factory_load_duration(Duration::from_millis(200));
// Vocabulary metrics
TokenizerMetrics::set_vocab_size("huggingface", 50000);
}
// ============= Port Availability Tests =============
#[test]
...
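For orientation before the instrumented files below, a minimal sketch of the recording pattern they all follow: count the request up front, time the call, then record either the success histograms or an error counter. Illustrative only; run_encode stands in for the real tokenizer call and is not part of this change.
use std::time::Instant;

fn instrumented_encode(input: &str) -> anyhow::Result<Vec<u32>> {
    let start = Instant::now();
    TokenizerMetrics::record_encode_request("huggingface");
    TokenizerMetrics::record_chars_per_encode(input.len());
    match run_encode(input) {            // hypothetical tokenizer call
        Ok(ids) => {
            TokenizerMetrics::record_tokens_per_encode(ids.len());
            TokenizerMetrics::record_encode_duration(start.elapsed());
            Ok(ids)
        }
        Err(e) => {
            TokenizerMetrics::record_encode_error("encoding_failed");
            Err(e)
        }
    }
}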
use super::{traits, TokenizerTrait};
use crate::metrics::TokenizerMetrics;
use anyhow::{Error, Result};
use std::fs::File;
use std::io::Read;
use std::path::Path;
use std::sync::Arc;
use std::time::Instant;
#[cfg(feature = "huggingface")] #[cfg(feature = "huggingface")]
use super::huggingface::HuggingFaceTokenizer; use super::huggingface::HuggingFaceTokenizer;
...@@ -22,6 +24,8 @@ pub enum TokenizerType { ...@@ -22,6 +24,8 @@ pub enum TokenizerType {
/// - json: HuggingFace tokenizer /// - json: HuggingFace tokenizer
/// - For testing: can return mock tokenizer /// - For testing: can return mock tokenizer
pub fn create_tokenizer_from_file(file_path: &str) -> Result<Arc<dyn traits::Tokenizer>> { pub fn create_tokenizer_from_file(file_path: &str) -> Result<Arc<dyn traits::Tokenizer>> {
let start_time = Instant::now();
// Special case for testing
if file_path == "mock" || file_path == "test" {
return Ok(Arc::new(super::mock::MockTokenizer::new()));
@@ -31,6 +35,7 @@ pub fn create_tokenizer_from_file(file_path: &str) -> Result<Arc<dyn traits::Tok
// Check if file exists
if !path.exists() {
TokenizerMetrics::record_factory_error("file_not_found");
return Err(Error::msg(format!("File not found: {}", file_path))); return Err(Error::msg(format!("File not found: {}", file_path)));
} }
...@@ -40,15 +45,20 @@ pub fn create_tokenizer_from_file(file_path: &str) -> Result<Arc<dyn traits::Tok ...@@ -40,15 +45,20 @@ pub fn create_tokenizer_from_file(file_path: &str) -> Result<Arc<dyn traits::Tok
.and_then(std::ffi::OsStr::to_str) .and_then(std::ffi::OsStr::to_str)
.map(|s| s.to_lowercase()); .map(|s| s.to_lowercase());
let result = match extension.as_deref() {
Some("json") => {
#[cfg(feature = "huggingface")]
{
let tokenizer = HuggingFaceTokenizer::from_file(file_path)?;
TokenizerMetrics::record_factory_load("json");
TokenizerMetrics::set_vocab_size("huggingface", tokenizer.vocab_size());
Ok(Arc::new(tokenizer) as Arc<dyn traits::Tokenizer>)
}
#[cfg(not(feature = "huggingface"))]
{
TokenizerMetrics::record_factory_error("huggingface_disabled");
Err(Error::msg(
"HuggingFace support not enabled. Enable the 'huggingface' feature.",
))
@@ -56,17 +66,27 @@ pub fn create_tokenizer_from_file(file_path: &str) -> Result<Arc<dyn traits::Tok
}
Some("model") => {
// SentencePiece model file
TokenizerMetrics::record_factory_error("unsupported_sentencepiece");
Err(Error::msg("SentencePiece models not yet supported")) Err(Error::msg("SentencePiece models not yet supported"))
} }
Some("gguf") => { Some("gguf") => {
// GGUF format // GGUF format
TokenizerMetrics::record_factory_error("unsupported_gguf");
Err(Error::msg("GGUF format not yet supported")) Err(Error::msg("GGUF format not yet supported"))
} }
_ => { _ => {
// Try to auto-detect by reading file content // Try to auto-detect by reading file content
auto_detect_tokenizer(file_path) auto_detect_tokenizer(file_path).inspect(|tokenizer| {
TokenizerMetrics::record_factory_load("auto_detected");
TokenizerMetrics::set_vocab_size("auto_detected", tokenizer.vocab_size());
})
}
};
if result.is_ok() {
TokenizerMetrics::record_factory_load_duration(start_time.elapsed());
}
result
}
/// Auto-detect tokenizer type by examining file content
...
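A sketch of the factory instrumentation above from a caller's point of view; illustrative only, and the path is a placeholder.
// Illustrative only; "tokenizer.json" is a placeholder path.
let tokenizer = create_tokenizer_from_file("tokenizer.json")?;
// On success this now increments sgl_tokenizer_factory_loads_total{file_type="json"},
// sets sgl_tokenizer_vocab_size{tokenizer_type="huggingface"}, and records
// sgl_tokenizer_factory_load_duration_seconds. The explicitly handled error cases
// (missing file, disabled feature, unsupported formats) increment
// sgl_tokenizer_factory_errors_total with an error_type label instead.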
use super::traits::{Decoder, Encoder, Encoding, SpecialTokens, Tokenizer as TokenizerTrait};
use crate::metrics::TokenizerMetrics;
use anyhow::{Error, Result};
use std::collections::HashMap;
use std::time::Instant;
use tokenizers::tokenizer::Tokenizer as HfTokenizer;
/// HuggingFace tokenizer wrapper
@@ -92,19 +94,36 @@ impl HuggingFaceTokenizer {
impl Encoder for HuggingFaceTokenizer {
fn encode(&self, input: &str) -> Result<Encoding> {
let start = Instant::now();
TokenizerMetrics::record_encode_request("huggingface");
TokenizerMetrics::record_chars_per_encode(input.len());
self.tokenizer
.encode(input, false)
.map_err(|e| {
TokenizerMetrics::record_encode_error("encoding_failed");
Error::msg(format!("Encoding failed: {}", e))
})
.map(|encoding| {
TokenizerMetrics::record_tokens_per_encode(encoding.get_ids().len());
TokenizerMetrics::record_encode_duration(start.elapsed());
Encoding::Hf(Box::new(encoding))
})
}
fn encode_batch(&self, inputs: &[&str]) -> Result<Vec<Encoding>> {
let start = Instant::now();
let encodings = self
.tokenizer
.encode_batch(inputs.to_vec(), false)
.map_err(|e| {
TokenizerMetrics::record_encode_error("batch_encoding_failed");
Error::msg(format!("Batch encoding failed: {}", e))
})?;
TokenizerMetrics::record_encode_batch_duration(start.elapsed(), inputs.len());
Ok(encodings
.into_iter()
@@ -115,9 +134,20 @@ impl Encoder for HuggingFaceTokenizer {
impl Decoder for HuggingFaceTokenizer {
fn decode(&self, token_ids: &[u32], skip_special_tokens: bool) -> Result<String> {
let start = Instant::now();
TokenizerMetrics::record_decode_request("huggingface");
TokenizerMetrics::record_tokens_per_decode(token_ids.len());
self.tokenizer
.decode(token_ids, skip_special_tokens)
.map_err(|e| {
TokenizerMetrics::record_decode_error("decoding_failed");
Error::msg(format!("Decoding failed: {}", e))
})
.inspect(|_| {
TokenizerMetrics::record_decode_duration(start.elapsed());
})
}
}
...
use super::traits;
use crate::metrics::TokenizerMetrics;
use anyhow::Result;
use std::collections::HashSet;
use std::sync::Arc;
use std::time::Instant;
/// Output from the sequence decoder
#[derive(Debug, Clone, PartialEq)]
@@ -93,6 +95,8 @@ impl StopSequenceDecoder {
/// Process a single token
pub fn process_token(&mut self, token_id: u32) -> Result<SequenceDecoderOutput> {
let start = Instant::now();
if self.stopped {
return Ok(SequenceDecoderOutput::Stopped);
}
@@ -100,23 +104,30 @@ impl StopSequenceDecoder {
// Check for token-level stops first
if self.config.stop_tokens.contains(&token_id) {
self.stopped = true;
TokenizerMetrics::record_stop_sequence_detected("token");
// Flush any jailed text before stopping
if !self.jail_buffer.is_empty() {
let output = self.jail_buffer.clone();
self.jail_buffer.clear();
TokenizerMetrics::record_stop_detection_duration(start.elapsed());
return Ok(SequenceDecoderOutput::StoppedWithText(output));
}
TokenizerMetrics::record_stop_detection_duration(start.elapsed());
return Ok(SequenceDecoderOutput::Stopped);
}
if self.config.visible_stop_tokens.contains(&token_id) {
self.stopped = true;
TokenizerMetrics::record_stop_sequence_detected("visible_token");
// Include jailed text plus the stop token
let stop_text = self
.tokenizer
.decode(&[token_id], self.skip_special_tokens)?;
let output = format!("{}{}", self.jail_buffer, stop_text);
self.jail_buffer.clear();
TokenizerMetrics::record_stop_detection_duration(start.elapsed());
return Ok(SequenceDecoderOutput::StoppedWithText(output));
}
@@ -161,9 +172,12 @@ impl StopSequenceDecoder {
for stop_seq in &self.config.stop_sequences {
if let Some(pos) = check_text.find(stop_seq) {
self.stopped = true;
TokenizerMetrics::record_stop_sequence_detected("string");
// Output text before the stop sequence
let output = check_text[..pos].to_string();
self.jail_buffer.clear();
TokenizerMetrics::record_stop_detection_duration(start.elapsed());
return Ok(if output.is_empty() {
SequenceDecoderOutput::Stopped
} else {
@@ -176,10 +190,13 @@ impl StopSequenceDecoder {
for stop_seq in &self.config.visible_stop_sequences {
if let Some(pos) = check_text.find(stop_seq) {
self.stopped = true;
TokenizerMetrics::record_stop_sequence_detected("visible_string");
// Include the stop sequence in output
let end_pos = pos + stop_seq.len();
let output = check_text[..end_pos].to_string();
self.jail_buffer.clear();
TokenizerMetrics::record_stop_detection_duration(start.elapsed());
return Ok(SequenceDecoderOutput::StoppedWithText(output));
}
}
@@ -202,6 +219,8 @@ impl StopSequenceDecoder {
}
if partial_match_len > 0 {
TokenizerMetrics::record_partial_match();
// Split: output safe text, jail the potential match
let safe_end = check_text.len() - partial_match_len;
let safe_text = &check_text[..safe_end];
@@ -211,6 +230,8 @@ impl StopSequenceDecoder {
self.prefix_offset = self.read_offset;
self.read_offset = self.token_buffer.len();
TokenizerMetrics::record_stop_detection_duration(start.elapsed());
if safe_text.is_empty() {
Ok(SequenceDecoderOutput::Held)
} else {
@@ -224,6 +245,8 @@ impl StopSequenceDecoder {
self.prefix_offset = self.read_offset;
self.read_offset = self.token_buffer.len();
TokenizerMetrics::record_stop_detection_duration(start.elapsed());
Ok(SequenceDecoderOutput::Text(check_text))
}
}
...
// src/tokenizer/stream.rs
use super::traits;
use crate::metrics::TokenizerMetrics;
use anyhow::Result;
use std::sync::Arc;
use std::time::Instant;
const INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET: usize = 5;
@@ -43,8 +45,12 @@ impl DecodeStream {
/// Step appends a token_id to the internal state and tries to produce a text chunk.
/// Returning `None` means the given id is not enough to produce a chunk.
pub fn step(&mut self, id: u32) -> Result<Option<String>> {
let start = Instant::now();
self.all_token_ids.push(id);
TokenizerMetrics::record_stream_token();
let prefix_text = self.tokenizer.decode(
&self.all_token_ids[self.prefix_offset..self.read_offset],
self.skip_special_tokens,
@@ -61,8 +67,16 @@ impl DecodeStream {
self.prefix_offset = self.read_offset;
self.read_offset = self.all_token_ids.len();
TokenizerMetrics::record_stream_step_duration(start.elapsed());
Ok(Some(new_text))
} else {
if new_text.ends_with("�") {
TokenizerMetrics::record_incomplete_utf8();
}
TokenizerMetrics::record_stream_step_duration(start.elapsed());
Ok(None)
}
}
...
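Finally, a sketch of what the streaming instrumentation means for a consumer of DecodeStream::step; illustrative only, assuming a stream and a source of token ids already exist in the surrounding code.
// Illustrative only: `stream` is an already-constructed DecodeStream and
// `token_ids` is an iterator of generated token ids.
for id in token_ids {
    // Each step() now bumps sgl_tokenizer_stream_tokens_total and records
    // sgl_tokenizer_stream_step_duration_seconds; a chunk ending in an
    // incomplete UTF-8 sequence bumps sgl_tokenizer_stream_incomplete_utf8_total
    // and yields None until more tokens arrive.
    if let Some(chunk) = stream.step(id)? {
        print!("{chunk}");
    }
}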