Unverified Commit e71f71f4 authored by Lanqing Yang's avatar Lanqing Yang Committed by GitHub
Browse files

chore: deprecate sentencepiece tokenizer in lib/llm (#2439)


Signed-off-by: default avatarlyang24 <lanqingy93@gmail.com>
parent f476fd74
......@@ -1110,7 +1110,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8030735ecb0d128428b64cd379809817e620a40e5001c54465b99ec5feec2857"
dependencies = [
"futures-core",
"prost 0.13.5",
"prost",
"prost-types",
"tonic 0.12.3",
"tracing-core",
......@@ -1129,7 +1129,7 @@ dependencies = [
"hdrhistogram",
"humantime",
"hyper-util",
"prost 0.13.5",
"prost",
"prost-types",
"serde",
"serde_json",
......@@ -1899,7 +1899,6 @@ dependencies = [
"rmp-serde",
"rstest 0.18.2",
"rstest_reuse",
"sentencepiece",
"serde",
"serde_json",
"serial_test",
......@@ -2225,7 +2224,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88365f1a5671eb2f7fc240adb216786bc6494b38ce15f1d26ad6eaa303d5e822"
dependencies = [
"http 1.3.1",
"prost 0.13.5",
"prost",
"tokio",
"tokio-stream",
"tonic 0.13.1",
......@@ -4692,17 +4691,6 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
[[package]]
name = "num-derive"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]]
name = "num-integer"
version = "0.1.46"
......@@ -5322,16 +5310,6 @@ dependencies = [
"unarray",
]
[[package]]
name = "prost"
version = "0.11.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd"
dependencies = [
"bytes",
"prost-derive 0.11.9",
]
[[package]]
name = "prost"
version = "0.13.5"
......@@ -5339,7 +5317,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5"
dependencies = [
"bytes",
"prost-derive 0.13.5",
"prost-derive",
]
[[package]]
......@@ -5355,26 +5333,13 @@ dependencies = [
"once_cell",
"petgraph",
"prettyplease",
"prost 0.13.5",
"prost",
"prost-types",
"regex",
"syn 2.0.100",
"tempfile",
]
[[package]]
name = "prost-derive"
version = "0.11.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4"
dependencies = [
"anyhow",
"itertools 0.10.5",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "prost-derive"
version = "0.13.5"
......@@ -5394,7 +5359,7 @@ version = "0.13.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16"
dependencies = [
"prost 0.13.5",
"prost",
]
[[package]]
......@@ -6402,32 +6367,6 @@ version = "1.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0"
[[package]]
name = "sentencepiece"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "286451da14703923eeb9d5e9d7717a15cbf236c037923fb7a6ff911ca45f4124"
dependencies = [
"libc",
"num-derive",
"num-traits",
"prost 0.11.9",
"prost-derive 0.11.9",
"sentencepiece-sys",
"thiserror 1.0.69",
]
[[package]]
name = "sentencepiece-sys"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a721500103a50c284cd3908cca6c435fcc6a260a1cead830a040f904a89234fb"
dependencies = [
"cc",
"cmake",
"pkg-config",
]
[[package]]
name = "seq-macro"
version = "0.3.6"
......@@ -7641,7 +7580,7 @@ dependencies = [
"hyper-util",
"percent-encoding",
"pin-project",
"prost 0.13.5",
"prost",
"socket2",
"tokio",
"tokio-stream",
......@@ -7670,7 +7609,7 @@ dependencies = [
"hyper-util",
"percent-encoding",
"pin-project",
"prost 0.13.5",
"prost",
"socket2",
"tokio",
"tokio-rustls",
......
......@@ -34,7 +34,6 @@ testing-full = ["testing-cuda", "testing-nixl"]
testing-cuda = ["dep:cudarc"]
testing-nixl = ["dep:nixl-sys"]
block-manager = ["dep:nixl-sys", "dep:cudarc", "dep:ndarray", "dep:nix"]
sentencepiece = ["dep:sentencepiece"]
cuda = ["dep:cudarc"]
integration = []
......@@ -111,7 +110,6 @@ tokenizers = { version = "0.21.4", default-features = false, features = [
"esaxx_fast",
"rustls-tls",
] }
sentencepiece = { version = "0.11.2", optional = true }
# backend
galil-seiferas = { version = "0.1" }
......
......@@ -15,9 +15,6 @@
pub mod hf;
#[cfg(feature = "sentencepiece")]
pub mod sp;
// TODO: Add tokenizer benchmarks
// TODO: Enable README.md as a module doc
// #[doc = include_str!("../README.md")]
......@@ -31,15 +28,10 @@ pub use anyhow::{Error, Result};
pub use hf::HuggingFaceTokenizer;
#[cfg(feature = "sentencepiece")]
pub use sp::SentencePieceTokenizer;
/// Represents the type of tokenizer being used.
///
/// Each variant carries a `String` payload.
// NOTE(review): the payload is presumably the tokenizer file path or name — confirm against callers.
#[derive(Debug)]
pub enum TokenizerType {
// HuggingFace tokenizer backend.
HuggingFace(String),
// This variant is only compiled in when the `sentencepiece` cargo feature is enabled.
#[cfg(feature = "sentencepiece")]
SentencePiece(String),
}
/// character offsets in the original text
......@@ -141,7 +133,6 @@ where
/// The file extension is used to determine the tokenizer type.
/// Supported file types are:
/// - json: HuggingFace tokenizer
/// - model: SentencePiece tokenizer
pub fn create_tokenizer_from_file(file_path: &str) -> Result<Arc<dyn traits::Tokenizer>> {
let path = Path::new(file_path);
let extension = path
......@@ -154,19 +145,6 @@ pub fn create_tokenizer_from_file(file_path: &str) -> Result<Arc<dyn traits::Tok
let tokenizer = HuggingFaceTokenizer::from_file(file_path)?;
Ok(Arc::new(tokenizer))
}
"model" => {
#[cfg(feature = "sentencepiece")]
{
let tokenizer = SentencePieceTokenizer::from_file(file_path)?;
Ok(Arc::new(tokenizer))
}
#[cfg(not(feature = "sentencepiece"))]
{
Err(Error::msg(
"SentencePiece tokenizer not supported".to_string(),
))
}
}
_ => Err(Error::msg("Unsupported file type".to_string())),
}
}
......
# Tokenizers
## Introduction
`tokenizers` is designed for efficient and versatile tokenization in natural language processing. It supports both HuggingFace and SentencePiece models, offering a streamlined API for text encoding and decoding.
`tokenizers` is designed for efficient and versatile tokenization in natural language processing. It supports HuggingFace models, offering a streamlined API for text encoding and decoding.
## Features
- **Support for HuggingFace and SentencePiece Tokenizers**: Easily integrate popular tokenization models into your NLP projects.
- **Hash Verification**: Ensures tokenization consistency and accuracy across different models.
- **Simple Encoding and Decoding**: Facilitates the conversion of text to token IDs and back.
- **Sequence Management**: Manage sequences of tokens for complex NLP tasks effectively.
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::tokenizers::{
traits::{Decoder, Encoder, Tokenizer},
Encoding, Error, Result, TokenIdType,
};
use sentencepiece::SentencePieceProcessor;
/// A tokenizer implementation using the SentencePiece tokenization algorithm.
///
/// This is a thin wrapper around a `sentencepiece::SentencePieceProcessor`.
/// Instances are created with `SentencePieceTokenizer::from_file`, and the
/// `Encoder` / `Decoder` impls in this file forward to the wrapped processor
/// to turn text into token IDs and token IDs back into text.
pub struct SentencePieceTokenizer {
/// The underlying SentencePiece processor instance; every encode and decode
/// call in this file goes through it.
spp: SentencePieceProcessor,
}
impl SentencePieceTokenizer {
/// Creates a new SentencePieceTokenizer from a model file.
///
/// # Arguments
/// * `tokenizer_name` - Path to the SentencePiece model file
///
/// # Returns
/// * `Result<Self>` - A new tokenizer instance or an error if loading fails
pub fn from_file(tokenizer_name: &str) -> Result<Self> {
let spp = SentencePieceProcessor::open(tokenizer_name)
.map_err(|err| Error::msg(format!("Error loading tokenizer: {}", err)))?;
Ok(Self { spp })
}
}
impl Encoder for SentencePieceTokenizer {
/// Encodes a string input into tokens using the SentencePiece model.
///
/// # Arguments
/// * `input` - The text to encode
///
/// # Returns
/// * `Result<Encoding>` - The encoded tokens, including IDs, text, and character spans
fn encode(&self, input: &str) -> Result<Encoding> {
let encoding = self
.spp
.encode(input)
.map_err(|err| Error::msg(format!("Error encoding input: {}", err)))?;
let token_ids = encoding.into_iter().map(|piece| piece.id).collect();
Ok(Encoding::Sp(token_ids))
}
/// Encodes multiple string inputs into tokens using the SentencePiece model.
///
/// # Arguments
/// * `inputs` - The texts to encode
///
/// # Returns
/// * `Result<Vec<Encoding>>` - The encoded tokens for each input
fn encode_batch(&self, inputs: &[&str]) -> Result<Vec<Encoding>> {
inputs.iter().map(|input| self.encode(input)).collect()
}
}
impl Decoder for SentencePieceTokenizer {
/// Decodes a sequence of token IDs back into text.
///
/// # Arguments
/// * `token_ids` - The sequence of token IDs to decode
/// * `skip_special_tokens` - Currently unsupported in SentencePieceTokenizer and
/// it will return an error if true
///
/// # Returns
/// * `Result<String>` - The decoded text
///
/// # Errors
/// * Returns an error if skip_special_tokens is true
/// * Returns an error if the decoding process fails
fn decode(&self, token_ids: &[TokenIdType], skip_special_tokens: bool) -> Result<String> {
if skip_special_tokens {
return Err(Error::msg(
"SentencePieceTokenizer does not support skip_special_tokens=true.",
));
}
let text = self
.spp
.decode_piece_ids(token_ids)
.map_err(|err| Error::msg(format!("Error decoding input: {}", err)))?;
Ok(text)
}
}
/// Implement the Tokenizer trait for SentencePieceTokenizer.
///
/// The impl body is empty: no methods are defined or overridden here.
// NOTE(review): presumably `Tokenizer` only requires `Encoder` + `Decoder`,
// both implemented in this file — confirm against the trait definition in `traits`.
impl Tokenizer for SentencePieceTokenizer {}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment