Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
734d29b0
Unverified
Commit
734d29b0
authored
Dec 24, 2019
by
Anthony MOI
Browse files
tokenizers is now a real dependency
parent
2818e505
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
55 additions
and
67 deletions
+55
-67
setup.py
setup.py
+1
-0
src/transformers/tokenization_bert.py
src/transformers/tokenization_bert.py
+35
-41
src/transformers/tokenization_gpt2.py
src/transformers/tokenization_gpt2.py
+19
-26
No files found.
setup.py
View file @
734d29b0
...
@@ -86,6 +86,7 @@ setup(
...
@@ -86,6 +86,7 @@ setup(
packages
=
find_packages
(
"src"
),
packages
=
find_packages
(
"src"
),
install_requires
=
[
install_requires
=
[
"numpy"
,
"numpy"
,
"tokenizers"
,
# accessing files from S3 directly
# accessing files from S3 directly
"boto3"
,
"boto3"
,
# filesystem locks e.g. to prevent parallel downloads
# filesystem locks e.g. to prevent parallel downloads
...
...
src/transformers/tokenization_bert.py
View file @
734d29b0
...
@@ -20,6 +20,8 @@ import logging
...
@@ -20,6 +20,8 @@ import logging
import
os
import
os
import
unicodedata
import
unicodedata
import
tokenizers
as
tk
from
.tokenization_utils
import
FastPreTrainedTokenizer
,
PreTrainedTokenizer
from
.tokenization_utils
import
FastPreTrainedTokenizer
,
PreTrainedTokenizer
...
@@ -552,49 +554,41 @@ class BertTokenizerFast(FastPreTrainedTokenizer):
...
@@ -552,49 +554,41 @@ class BertTokenizerFast(FastPreTrainedTokenizer):
add_special_tokens
=
True
,
add_special_tokens
=
True
,
**
kwargs
**
kwargs
):
):
super
(
BertTokenizerFast
,
self
).
__init__
(
unk_token
=
unk_token
,
sep_token
=
sep_token
,
pad_token
=
pad_token
,
cls_token
=
cls_token
,
mask_token
=
mask_token
,
**
kwargs
)
try
:
self
.
_tokenizer
=
tk
.
Tokenizer
(
tk
.
models
.
WordPiece
.
from_files
(
vocab_file
,
unk_token
=
unk_token
))
from
tokenizers
import
Tokenizer
,
models
,
pre_tokenizers
,
decoders
,
processors
self
.
_update_special_tokens
()
self
.
_tokenizer
.
with_pre_tokenizer
(
super
(
BertTokenizerFast
,
self
).
__init__
(
tk
.
pre_tokenizers
.
BertPreTokenizer
.
new
(
unk_token
=
unk_token
,
do_basic_tokenize
=
do_basic_tokenize
,
sep_token
=
sep_token
,
do_lower_case
=
do_lower_case
,
pad_token
=
pad_token
,
tokenize_chinese_chars
=
tokenize_chinese_chars
,
cls_token
=
cls_token
,
never_split
=
never_split
if
never_split
is
not
None
else
[],
mask_token
=
mask_token
,
**
kwargs
)
self
.
_tokenizer
=
Tokenizer
(
models
.
WordPiece
.
from_files
(
vocab_file
,
unk_token
=
unk_token
))
self
.
_update_special_tokens
()
self
.
_tokenizer
.
with_pre_tokenizer
(
pre_tokenizers
.
BertPreTokenizer
.
new
(
do_basic_tokenize
=
do_basic_tokenize
,
do_lower_case
=
do_lower_case
,
tokenize_chinese_chars
=
tokenize_chinese_chars
,
never_split
=
never_split
if
never_split
is
not
None
else
[],
)
)
)
self
.
_tokenizer
.
with_decoder
(
decoders
.
WordPiece
.
new
())
)
self
.
_tokenizer
.
with_decoder
(
tk
.
decoders
.
WordPiece
.
new
())
if
add_special_tokens
:
if
add_special_tokens
:
self
.
_tokenizer
.
with_post_processor
(
self
.
_tokenizer
.
with_post_processor
(
processors
.
BertProcessing
.
new
(
tk
.
processors
.
BertProcessing
.
new
(
(
sep_token
,
self
.
_tokenizer
.
token_to_id
(
sep_token
)),
(
sep_token
,
self
.
_tokenizer
.
token_to_id
(
sep_token
)),
(
cls_token
,
self
.
_tokenizer
.
token_to_id
(
cls_token
)),
(
cls_token
,
self
.
_tokenizer
.
token_to_id
(
cls_token
)),
)
)
)
if
max_length
is
not
None
:
self
.
_tokenizer
.
with_truncation
(
max_length
,
stride
,
truncation_strategy
)
self
.
_tokenizer
.
with_padding
(
max_length
if
pad_to_max_length
else
None
,
self
.
padding_side
,
self
.
pad_token_id
,
self
.
pad_token_type_id
,
self
.
pad_token
,
)
)
self
.
_decoder
=
decoders
.
WordPiece
.
new
()
if
max_length
is
not
None
:
self
.
_tokenizer
.
with_truncation
(
max_length
,
stride
,
truncation_strategy
)
except
(
AttributeError
,
ImportError
)
as
e
:
self
.
_tokenizer
.
with_padding
(
logger
.
error
(
"Make sure you installed `tokenizers` with `pip install tokenizers==0.0.8`"
)
max_length
if
pad_to_max_length
else
None
,
raise
e
self
.
padding_side
,
self
.
pad_token_id
,
self
.
pad_token_type_id
,
self
.
pad_token
,
)
self
.
_decoder
=
tk
.
decoders
.
WordPiece
.
new
()
src/transformers/tokenization_gpt2.py
View file @
734d29b0
...
@@ -21,6 +21,7 @@ import os
...
@@ -21,6 +21,7 @@ import os
from
functools
import
lru_cache
from
functools
import
lru_cache
import
regex
as
re
import
regex
as
re
import
tokenizers
as
tk
from
.tokenization_utils
import
FastPreTrainedTokenizer
,
PreTrainedTokenizer
from
.tokenization_utils
import
FastPreTrainedTokenizer
,
PreTrainedTokenizer
...
@@ -267,29 +268,21 @@ class GPT2TokenizerFast(FastPreTrainedTokenizer):
...
@@ -267,29 +268,21 @@ class GPT2TokenizerFast(FastPreTrainedTokenizer):
truncation_strategy
=
"longest_first"
,
truncation_strategy
=
"longest_first"
,
**
kwargs
**
kwargs
):
):
super
(
GPT2TokenizerFast
,
self
).
__init__
(
try
:
bos_token
=
bos_token
,
eos_token
=
eos_token
,
unk_token
=
unk_token
,
**
kwargs
from
tokenizers
import
Tokenizer
,
models
,
pre_tokenizers
,
decoders
)
super
(
GPT2TokenizerFast
,
self
).
__init__
(
self
.
_tokenizer
=
tk
.
Tokenizer
(
tk
.
models
.
BPE
.
from_files
(
vocab_file
,
merges_file
))
bos_token
=
bos_token
,
eos_token
=
eos_token
,
unk_token
=
unk_token
,
**
kwargs
self
.
_update_special_tokens
()
)
self
.
_tokenizer
.
with_pre_tokenizer
(
tk
.
pre_tokenizers
.
ByteLevel
.
new
(
add_prefix_space
))
self
.
_tokenizer
.
with_decoder
(
tk
.
decoders
.
ByteLevel
.
new
())
self
.
_tokenizer
=
Tokenizer
(
models
.
BPE
.
from_files
(
vocab_file
,
merges_file
))
if
max_length
:
self
.
_update_special_tokens
()
self
.
_tokenizer
.
with_truncation
(
max_length
,
stride
,
truncation_strategy
)
self
.
_tokenizer
.
with_pre_tokenizer
(
pre_tokenizers
.
ByteLevel
.
new
(
add_prefix_space
))
self
.
_tokenizer
.
with_padding
(
self
.
_tokenizer
.
with_decoder
(
decoders
.
ByteLevel
.
new
())
max_length
if
pad_to_max_length
else
None
,
if
max_length
:
self
.
padding_side
,
self
.
_tokenizer
.
with_truncation
(
max_length
,
stride
,
truncation_strategy
)
self
.
pad_token_id
if
self
.
pad_token_id
is
not
None
else
0
,
self
.
_tokenizer
.
with_padding
(
self
.
pad_token_type_id
,
max_length
if
pad_to_max_length
else
None
,
self
.
pad_token
if
self
.
pad_token
is
not
None
else
""
,
self
.
padding_side
,
)
self
.
pad_token_id
if
self
.
pad_token_id
is
not
None
else
0
,
self
.
_decoder
=
tk
.
decoders
.
ByteLevel
.
new
()
self
.
pad_token_type_id
,
self
.
pad_token
if
self
.
pad_token
is
not
None
else
""
,
)
self
.
_decoder
=
decoders
.
ByteLevel
.
new
()
except
(
AttributeError
,
ImportError
)
as
e
:
logger
.
error
(
"Make sure you installed `tokenizers` with `pip install tokenizers==0.0.8`"
)
raise
e
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment