chenpangpang / transformers · Commits · 734d29b0

Unverified commit 734d29b0, authored Dec 24, 2019 by Anthony MOI
Parent: 2818e505

Commit message: tokenizers is now a real dependency
Showing 3 changed files with 55 additions and 67 deletions:

  setup.py                                  +1   -0
  src/transformers/tokenization_bert.py    +35  -41
  src/transformers/tokenization_gpt2.py    +19  -26
setup.py
@@ -86,6 +86,7 @@ setup(
     packages=find_packages("src"),
     install_requires=[
         "numpy",
+        "tokenizers",
         # accessing files from S3 directly
         "boto3",
         # filesystem locks e.g. to prevent parallel downloads
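With `tokenizers` declared in install_requires, importing it at module level is always safe, which is what lets the two tokenizer files below drop their try/except import guards. A minimal sketch of the before/after pattern (illustrative only; the error handling here is paraphrased, not copied from this commit):

# Before this commit: `tokenizers` was optional, so code imported it lazily
# and converted a missing package into an install hint (pattern paraphrased).
try:
    import tokenizers as tk
except ImportError as e:
    raise ImportError(
        "Make sure you installed `tokenizers` with `pip install tokenizers`"
    ) from e

# After this commit: the package is a hard dependency, so a plain
# module-level import is enough and the guard disappears.
import tokenizers as tk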
src/transformers/tokenization_bert.py
@@ -20,6 +20,8 @@ import logging
 import os
 import unicodedata
 
+import tokenizers as tk
+
 from .tokenization_utils import FastPreTrainedTokenizer, PreTrainedTokenizer
@@ -552,49 +554,41 @@ class BertTokenizerFast(FastPreTrainedTokenizer):
         add_special_tokens=True,
         **kwargs
     ):
-        try:
-            from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
-            super(BertTokenizerFast, self).__init__(
-                unk_token=unk_token,
-                sep_token=sep_token,
-                pad_token=pad_token,
-                cls_token=cls_token,
-                mask_token=mask_token,
-                **kwargs
-            )
-            self._tokenizer = Tokenizer(models.WordPiece.from_files(vocab_file, unk_token=unk_token))
-            self._update_special_tokens()
-            self._tokenizer.with_pre_tokenizer(
-                pre_tokenizers.BertPreTokenizer.new(
-                    do_basic_tokenize=do_basic_tokenize,
-                    do_lower_case=do_lower_case,
-                    tokenize_chinese_chars=tokenize_chinese_chars,
-                    never_split=never_split if never_split is not None else [],
-                )
-            )
-            self._tokenizer.with_decoder(decoders.WordPiece.new())
-            if add_special_tokens:
-                self._tokenizer.with_post_processor(
-                    processors.BertProcessing.new(
-                        (sep_token, self._tokenizer.token_to_id(sep_token)),
-                        (cls_token, self._tokenizer.token_to_id(cls_token)),
-                    )
-                )
-            if max_length is not None:
-                self._tokenizer.with_truncation(max_length, stride, truncation_strategy)
-            self._tokenizer.with_padding(
-                max_length if pad_to_max_length else None,
-                self.padding_side,
-                self.pad_token_id,
-                self.pad_token_type_id,
-                self.pad_token,
-            )
-            self._decoder = decoders.WordPiece.new()
-        except (AttributeError, ImportError) as e:
-            logger.error("Make sure you installed `tokenizers` with `pip install tokenizers==0.0.8`")
-            raise e
+        super(BertTokenizerFast, self).__init__(
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            **kwargs
+        )
+
+        self._tokenizer = tk.Tokenizer(tk.models.WordPiece.from_files(vocab_file, unk_token=unk_token))
+        self._update_special_tokens()
+        self._tokenizer.with_pre_tokenizer(
+            tk.pre_tokenizers.BertPreTokenizer.new(
+                do_basic_tokenize=do_basic_tokenize,
+                do_lower_case=do_lower_case,
+                tokenize_chinese_chars=tokenize_chinese_chars,
+                never_split=never_split if never_split is not None else [],
+            )
+        )
+        self._tokenizer.with_decoder(tk.decoders.WordPiece.new())
+
+        if add_special_tokens:
+            self._tokenizer.with_post_processor(
+                tk.processors.BertProcessing.new(
+                    (sep_token, self._tokenizer.token_to_id(sep_token)),
+                    (cls_token, self._tokenizer.token_to_id(cls_token)),
+                )
+            )
+        if max_length is not None:
+            self._tokenizer.with_truncation(max_length, stride, truncation_strategy)
+        self._tokenizer.with_padding(
+            max_length if pad_to_max_length else None,
+            self.padding_side,
+            self.pad_token_id,
+            self.pad_token_type_id,
+            self.pad_token,
+        )
+        self._decoder = tk.decoders.WordPiece.new()
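For orientation, here is a hypothetical usage sketch of the refactored class. The vocab path is a placeholder, and the encode() call assumes the usual interface inherited from the base tokenizer class; neither detail comes from this diff.

from transformers.tokenization_bert import BertTokenizerFast

# Placeholder path to a BERT WordPiece vocabulary file (one token per line).
tokenizer = BertTokenizerFast("bert-base-uncased-vocab.txt", do_lower_case=True)

# Assumes the standard encode() method provided by the base class.
ids = tokenizer.encode("Hello, world!")
print(ids)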
src/transformers/tokenization_gpt2.py
@@ -21,6 +21,7 @@ import os
 from functools import lru_cache
 
 import regex as re
+import tokenizers as tk
 
 from .tokenization_utils import FastPreTrainedTokenizer, PreTrainedTokenizer
@@ -267,29 +268,21 @@ class GPT2TokenizerFast(FastPreTrainedTokenizer):
         truncation_strategy="longest_first",
         **kwargs
     ):
-        try:
-            from tokenizers import Tokenizer, models, pre_tokenizers, decoders
-
-            super(GPT2TokenizerFast, self).__init__(
-                bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs
-            )
-
-            self._tokenizer = Tokenizer(models.BPE.from_files(vocab_file, merges_file))
-            self._update_special_tokens()
-            self._tokenizer.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(add_prefix_space))
-            self._tokenizer.with_decoder(decoders.ByteLevel.new())
-            if max_length:
-                self._tokenizer.with_truncation(max_length, stride, truncation_strategy)
-            self._tokenizer.with_padding(
-                max_length if pad_to_max_length else None,
-                self.padding_side,
-                self.pad_token_id if self.pad_token_id is not None else 0,
-                self.pad_token_type_id,
-                self.pad_token if self.pad_token is not None else "",
-            )
-            self._decoder = decoders.ByteLevel.new()
-        except (AttributeError, ImportError) as e:
-            logger.error("Make sure you installed `tokenizers` with `pip install tokenizers==0.0.8`")
-            raise e
+        super(GPT2TokenizerFast, self).__init__(
+            bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs
+        )
+
+        self._tokenizer = tk.Tokenizer(tk.models.BPE.from_files(vocab_file, merges_file))
+        self._update_special_tokens()
+        self._tokenizer.with_pre_tokenizer(tk.pre_tokenizers.ByteLevel.new(add_prefix_space))
+        self._tokenizer.with_decoder(tk.decoders.ByteLevel.new())
+        if max_length:
+            self._tokenizer.with_truncation(max_length, stride, truncation_strategy)
+        self._tokenizer.with_padding(
+            max_length if pad_to_max_length else None,
+            self.padding_side,
+            self.pad_token_id if self.pad_token_id is not None else 0,
+            self.pad_token_type_id,
+            self.pad_token if self.pad_token is not None else "",
+        )
+        self._decoder = tk.decoders.ByteLevel.new()
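The same pattern applies to the GPT-2 fast tokenizer; a hypothetical usage sketch follows, with placeholder vocab/merges paths and the same assumption about the inherited encode() interface.

from transformers.tokenization_gpt2 import GPT2TokenizerFast

# Placeholder paths to GPT-2 BPE vocabulary and merges files.
tokenizer = GPT2TokenizerFast("gpt2-vocab.json", "gpt2-merges.txt")

# Assumes the standard encode() method provided by the base class.
ids = tokenizer.encode("Hello, world!")
print(ids)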