chore: deprecate sentencepiece tokenizer in lib/llm (#2439)

Signed-off-by: lyang24 <lanqingy93@gmail.com>

chore: deprecate sentencepiece tokenizer in lib/llm (#2439)
Signed-off-by: lyang24 <lanqingy93@gmail.com>
e71f71f4 · Lanqing Yang · GitHub · f476fd74 · e71f71f4 · e71f71f4
Unverified Commit e71f71f4 authored Aug 14, 2025 by Lanqing Yang Committed by GitHub Aug 14, 2025
5 changed files
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1110,7 +1110,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8030735ecb0d128428b64cd379809817e620a40e5001c54465b99ec5feec2857"
 dependencies = [
 "futures-core",
- "prost 0.13.5",
+ "prost",
 "prost-types",
 "tonic 0.12.3",
 "tracing-core",
@@ -1129,7 +1129,7 @@ dependencies = [
 "hdrhistogram",
 "humantime",
 "hyper-util",
- "prost 0.13.5",
+ "prost",
 "prost-types",
 "serde",
 "serde_json",
@@ -1899,7 +1899,6 @@ dependencies = [
 "rmp-serde",
 "rstest 0.18.2",
 "rstest_reuse",
- "sentencepiece",
 "serde",
 "serde_json",
 "serial_test",
@@ -2225,7 +2224,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "88365f1a5671eb2f7fc240adb216786bc6494b38ce15f1d26ad6eaa303d5e822"
 dependencies = [
 "http 1.3.1",
- "prost 0.13.5",
+ "prost",
 "tokio",
 "tokio-stream",
 "tonic 0.13.1",
@@ -4692,17 +4691,6 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"

-[[package]]
-name = "num-derive"
-version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.100",
-]
-
 [[package]]
 name = "num-integer"
 version = "0.1.46"
@@ -5322,16 +5310,6 @@ dependencies = [
 "unarray",
 ]

-[[package]]
-name = "prost"
-version = "0.11.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd"
-dependencies = [
- "bytes",
- "prost-derive 0.11.9",
-]
-
 [[package]]
 name = "prost"
 version = "0.13.5"
@@ -5339,7 +5317,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5"
 dependencies = [
 "bytes",
- "prost-derive 0.13.5",
+ "prost-derive",
 ]

 [[package]]
@@ -5355,26 +5333,13 @@ dependencies = [
 "once_cell",
 "petgraph",
 "prettyplease",
- "prost 0.13.5",
+ "prost",
 "prost-types",
 "regex",
 "syn 2.0.100",
 "tempfile",
 ]

-[[package]]
-name = "prost-derive"
-version = "0.11.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4"
-dependencies = [
- "anyhow",
- "itertools 0.10.5",
- "proc-macro2",
- "quote",
- "syn 1.0.109",
-]
-
 [[package]]
 name = "prost-derive"
 version = "0.13.5"
@@ -5394,7 +5359,7 @@ version = "0.13.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16"
 dependencies = [
- "prost 0.13.5",
+ "prost",
 ]

 [[package]]
@@ -6402,32 +6367,6 @@ version = "1.0.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0"

-[[package]]
-name = "sentencepiece"
-version = "0.11.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "286451da14703923eeb9d5e9d7717a15cbf236c037923fb7a6ff911ca45f4124"
-dependencies = [
- "libc",
- "num-derive",
- "num-traits",
- "prost 0.11.9",
- "prost-derive 0.11.9",
- "sentencepiece-sys",
- "thiserror 1.0.69",
-]
-
-[[package]]
-name = "sentencepiece-sys"
-version = "0.11.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a721500103a50c284cd3908cca6c435fcc6a260a1cead830a040f904a89234fb"
-dependencies = [
- "cc",
- "cmake",
- "pkg-config",
-]
-
 [[package]]
 name = "seq-macro"
 version = "0.3.6"
@@ -7641,7 +7580,7 @@ dependencies = [
 "hyper-util",
 "percent-encoding",
 "pin-project",
- "prost 0.13.5",
+ "prost",
 "socket2",
 "tokio",
 "tokio-stream",
@@ -7670,7 +7609,7 @@ dependencies = [
 "hyper-util",
 "percent-encoding",
 "pin-project",
- "prost 0.13.5",
+ "prost",
 "socket2",
 "tokio",
 "tokio-rustls",

--- a/lib/llm/Cargo.toml
+++ b/lib/llm/Cargo.toml
@@ -34,7 +34,6 @@ testing-full  = ["testing-cuda", "testing-nixl"]
 testing-cuda  = ["dep:cudarc"]
 testing-nixl  = ["dep:nixl-sys"]
 block-manager = ["dep:nixl-sys", "dep:cudarc", "dep:ndarray", "dep:nix"]
-sentencepiece = ["dep:sentencepiece"]
 cuda          = ["dep:cudarc"]
 integration = []

@@ -111,7 +110,6 @@ tokenizers = { version = "0.21.4", default-features = false, features = [
  "esaxx_fast",
  "rustls-tls",
 ] }
-sentencepiece = { version = "0.11.2", optional = true }

 # backend
 galil-seiferas = { version = "0.1" }

--- a/lib/llm/src/tokenizers.rs
+++ b/lib/llm/src/tokenizers.rs
@@ -15,9 +15,6 @@

 pub mod hf;

-#[cfg(feature = "sentencepiece")]
-pub mod sp;
-
 // TODO: Add tokenizer benchmarks
 // TODO: Enable README.md as a module doc
 // #[doc = include_str!("../README.md")]
@@ -31,15 +28,10 @@ pub use anyhow::{Error, Result};

 pub use hf::HuggingFaceTokenizer;

-#[cfg(feature = "sentencepiece")]
-pub use sp::SentencePieceTokenizer;
-
 /// Represents the type of tokenizer being used
 #[derive(Debug)]
 pub enum TokenizerType {
    HuggingFace(String),
-    #[cfg(feature = "sentencepiece")]
-    SentencePiece(String),
 }

 /// character offsets in the original text
@@ -141,7 +133,6 @@ where
 /// The file extension is used to determine the tokenizer type.
 /// Supported file types are:
 /// - json: HuggingFace tokenizer
-/// - model: SentencePiece tokenizer
 pub fn create_tokenizer_from_file(file_path: &str) -> Result<Arc<dyn traits::Tokenizer>> {
    let path = Path::new(file_path);
    let extension = path
@@ -154,19 +145,6 @@ pub fn create_tokenizer_from_file(file_path: &str) -> Result<Arc<dyn traits::Tok
            let tokenizer = HuggingFaceTokenizer::from_file(file_path)?;
            Ok(Arc::new(tokenizer))
        }
-        "model" => {
-            #[cfg(feature = "sentencepiece")]
-            {
-                let tokenizer = SentencePieceTokenizer::from_file(file_path)?;
-                Ok(Arc::new(tokenizer))
-            }
-            #[cfg(not(feature = "sentencepiece"))]
-            {
-                Err(Error::msg(
-                    "SentencePiece tokenizer not supported".to_string(),
-                ))
-            }
-        }
        _ => Err(Error::msg("Unsupported file type".to_string())),
    }
 }

--- a/lib/llm/src/tokenizers/README.md
+++ b/lib/llm/src/tokenizers/README.md
 # Tokenizers

 ## Introduction
-`tokenizers` is designed for efficient and versatile tokenization in natural language processing. It supports both HuggingFace and SentencePiece models, offering a streamlined API for text encoding and decoding.
+`tokenizers` is designed for efficient and versatile tokenization in natural language processing. It supports HuggingFace models, offering a streamlined API for text encoding and decoding.

 ## Features
- **Support for HuggingFace and SentencePiece Tokenizers**: Easily integrate popular tokenization models into your NLP projects.
 - **Hash Verification**: Ensures tokenization consistency and accuracy across different models.
 - **Simple Encoding and Decoding**: Facilitates the conversion of text to token IDs and back.
 - **Sequence Management**: Manage sequences of tokens for complex NLP tasks effectively.

--- a/lib/llm/src/tokenizers/sp.rs
+++ b/lib/llm/src/tokenizers/sp.rs
-// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use crate::tokenizers::{
-    traits::{Decoder, Encoder, Tokenizer},
-    Encoding, Error, Result, TokenIdType,
-};
-
-use sentencepiece::SentencePieceProcessor;
-
-/// A tokenizer implementation using the SentencePiece tokenization algorithm.
-/// This tokenizer can encode text into tokens and decode tokens back into text.
-pub struct SentencePieceTokenizer {
-    /// The underlying SentencePiece processor instance
-    spp: SentencePieceProcessor,
-}
-
-impl SentencePieceTokenizer {
-    /// Creates a new SentencePieceTokenizer from a model file.
-    ///
-    /// # Arguments
-    /// * `tokenizer_name` - Path to the SentencePiece model file
-    ///
-    /// # Returns
-    /// * `Result<Self>` - A new tokenizer instance or an error if loading fails
-    pub fn from_file(tokenizer_name: &str) -> Result<Self> {
-        let spp = SentencePieceProcessor::open(tokenizer_name)
-            .map_err(|err| Error::msg(format!("Error loading tokenizer: {}", err)))?;
-
-        Ok(Self { spp })
-    }
-}
-
-impl Encoder for SentencePieceTokenizer {
-    /// Encodes a string input into tokens using the SentencePiece model.
-    ///
-    /// # Arguments
-    /// * `input` - The text to encode
-    ///
-    /// # Returns
-    /// * `Result<Encoding>` - The encoded tokens, including IDs, text, and character spans
-    fn encode(&self, input: &str) -> Result<Encoding> {
-        let encoding = self
-            .spp
-            .encode(input)
-            .map_err(|err| Error::msg(format!("Error encoding input: {}", err)))?;
-
-        let token_ids = encoding.into_iter().map(|piece| piece.id).collect();
-        Ok(Encoding::Sp(token_ids))
-    }
-
-    /// Encodes multiple string inputs into tokens using the SentencePiece model.
-    ///
-    /// # Arguments
-    /// * `inputs` - The texts to encode
-    ///
-    /// # Returns
-    /// * `Result<Vec<Encoding>>` - The encoded tokens for each input
-    fn encode_batch(&self, inputs: &[&str]) -> Result<Vec<Encoding>> {
-        inputs.iter().map(|input| self.encode(input)).collect()
-    }
-}
-
-impl Decoder for SentencePieceTokenizer {
-    /// Decodes a sequence of token IDs back into text.
-    ///
-    /// # Arguments
-    /// * `token_ids` - The sequence of token IDs to decode
-    /// * `skip_special_tokens` - Currently unsupported in SentencePieceTokenizer and
-    ///   it will return an error if true
-    ///
-    /// # Returns
-    /// * `Result<String>` - The decoded text
-    ///
-    /// # Errors
-    /// * Returns an error if skip_special_tokens is true
-    /// * Returns an error if the decoding process fails
-    fn decode(&self, token_ids: &[TokenIdType], skip_special_tokens: bool) -> Result<String> {
-        if skip_special_tokens {
-            return Err(Error::msg(
-                "SentencePieceTokenizer does not support skip_special_tokens=true.",
-            ));
-        }
-
-        let text = self
-            .spp
-            .decode_piece_ids(token_ids)
-            .map_err(|err| Error::msg(format!("Error decoding input: {}", err)))?;
-
-        Ok(text)
-    }
-}
-
-/// Implement the Tokenizer trait for SentencePieceTokenizer
-impl Tokenizer for SentencePieceTokenizer {}