Unverified Commit b7cf9f43 authored by Anthony MOI, committed by GitHub

Update tokenizers to 0.7.0-rc5 (#3705)

parent 551b4505
@@ -178,7 +178,7 @@
 "from tokenizers.pre_tokenizers import ByteLevel\n",
 "\n",
 "# First we create an empty Byte-Pair Encoding model (i.e. not trained model)\n",
-"tokenizer = Tokenizer(BPE.empty())\n",
+"tokenizer = Tokenizer(BPE())\n",
 "\n",
 "# Then we enable lower-casing and unicode-normalization\n",
 "# The Sequence normalizer allows us to combine multiple Normalizer that will be\n",
@@ -307,7 +307,7 @@
 ],
 "source": [
 "# Let's tokenizer a simple input\n",
-"tokenizer.model = BPE.from_files('vocab.json', 'merges.txt')\n",
+"tokenizer.model = BPE('vocab.json', 'merges.txt')\n",
 "encoding = tokenizer.encode(\"This is a simple input to be tokenized\")\n",
 "\n",
 "print(\"Encoded string: {}\".format(encoding.tokens))\n",
...
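Both notebook hunks above reflect the same change in tokenizers 0.7.0: the `BPE.empty()` and `BPE.from_files()` classmethods are replaced by the plain `BPE()` constructor, which builds an empty model when called without arguments and loads vocab/merges files when they are passed in. A minimal sketch of that flow is below; the `vocab.json`/`merges.txt` paths and the exact normalizer chain are assumptions inferred from the cell comments, not guaranteed by this diff.

```python
# Sketch of the notebook flow against tokenizers 0.7.0 (paths and normalizers assumed).
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC, Lowercase, Sequence
from tokenizers.pre_tokenizers import ByteLevel

# An empty (untrained) BPE model now comes from the plain constructor
tokenizer = Tokenizer(BPE())

# Lower-casing plus unicode normalization, combined with the Sequence normalizer
tokenizer.normalizer = Sequence([NFKC(), Lowercase()])
tokenizer.pre_tokenizer = ByteLevel()

# A trained model is loaded by passing the files directly to the constructor
tokenizer.model = BPE("vocab.json", "merges.txt")  # assumed to exist locally
encoding = tokenizer.encode("This is a simple input to be tokenized")
print("Encoded string: {}".format(encoding.tokens))
```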
@@ -96,7 +96,7 @@ setup(
     packages=find_packages("src"),
     install_requires=[
         "numpy",
-        "tokenizers == 0.7.0rc3",
+        "tokenizers == 0.7.0rc5",
         # dataclasses for Python versions that don't have it
         "dataclasses;python_version<'3.7'",
         # accessing files from S3 directly
...
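Because setup.py now pins a specific release candidate, a quick sanity check of the installed wheel can catch a stale environment. This snippet is illustrative only and is not part of the commit.

```python
# Illustrative check that the installed tokenizers build matches the new pin.
import tokenizers

assert tokenizers.__version__ == "0.7.0rc5", (
    "expected tokenizers 0.7.0rc5, found {}".format(tokenizers.__version__)
)
```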
@@ -265,12 +265,10 @@ class _OpenAIGPTCharBPETokenizer(BaseTokenizer):
     ):
         if vocab_file is not None and merges_file is not None:
             tokenizer = Tokenizer(
-                BPE.from_files(
-                    vocab_file, merges_file, dropout=dropout, unk_token=unk_token, end_of_word_suffix=suffix
-                )
+                BPE(vocab_file, merges_file, dropout=dropout, unk_token=unk_token, end_of_word_suffix=suffix)
             )
         else:
-            tokenizer = Tokenizer(BPE.empty())
+            tokenizer = Tokenizer(BPE())
         # Check for Unicode normalization first (before everything else)
         normalizers = []
...
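The character-level BPE used by _OpenAIGPTCharBPETokenizer follows the same pattern: file paths and keyword options move into the `BPE()` constructor, and the no-argument form replaces `BPE.empty()`. A rough sketch under assumed file paths and assumed default token values (the real ones come from the tokenizer's own arguments):

```python
# Sketch of the updated branch, with hypothetical paths and assumed defaults.
from tokenizers import Tokenizer
from tokenizers.models import BPE

vocab_file = "openai-gpt-vocab.json"   # placeholder
merges_file = "openai-gpt-merges.txt"  # placeholder

if vocab_file is not None and merges_file is not None:
    tokenizer = Tokenizer(
        BPE(vocab_file, merges_file, dropout=None, unk_token="<unk>", end_of_word_suffix="</w>")
    )
else:
    # Untrained fallback, mirroring the BPE.empty() -> BPE() change
    tokenizer = Tokenizer(BPE())
```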
@@ -362,7 +362,7 @@ class _TransfoXLDelimiterLookupTokenizer(BaseTokenizer):
     ):
         try:
-            tokenizer = WordLevel.from_files(vocab_file, unk_token=unk_token)
+            tokenizer = WordLevel(vocab_file, unk_token=unk_token)
             tokenizer = Tokenizer(tokenizer)
         except Exception:
             raise ValueError(
...
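WordLevel gets the same treatment as BPE: the vocab file goes straight into the constructor rather than a `from_files` classmethod. A small sketch, with a placeholder vocab path and unk token:

```python
# Sketch of the updated WordLevel construction (path and unk token are placeholders).
from tokenizers import Tokenizer
from tokenizers.models import WordLevel

vocab_file = "transfo-xl-vocab.json"  # placeholder

try:
    tokenizer = Tokenizer(WordLevel(vocab_file, unk_token="<unk>"))
except Exception:
    raise ValueError("Unable to parse the vocabulary file {}".format(vocab_file))
```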