chenpangpang / transformers

Commit 1f82a5d9 (unverified)
Authored Dec 26, 2019 by Anthony MOI

Update for changes in tokenizers API

Parent: 734d29b0
Showing 4 changed files with 20 additions and 16 deletions.
setup.py (+1, -1)
src/transformers/tokenization_bert.py (+8, -6)
src/transformers/tokenization_gpt2.py (+9, -7)
src/transformers/tokenization_utils.py (+2, -2)
setup.py

```diff
@@ -86,7 +86,7 @@ setup(
     packages=find_packages("src"),
     install_requires=[
         "numpy",
-        "tokenizers",
+        "tokenizers == 0.0.10",
         # accessing files from S3 directly
         "boto3",
         # filesystem locks e.g. to prevent parallel downloads
```
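Pinning to an exact release is what keeps the keyword-argument calls in the other three files working. A minimal sanity check one could run after installation (a sketch, not part of this commit, using setuptools' `pkg_resources`):

```python
import pkg_resources

# The keyword arguments used in the fast tokenizers below (stride=, strategy=,
# pad_id=, pad_type_id=, ...) follow the tokenizers 0.0.10 API, so fail fast
# if a different release is installed.
version = pkg_resources.get_distribution("tokenizers").version
assert version == "0.0.10", f"expected tokenizers 0.0.10, got {version}"
```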
src/transformers/tokenization_bert.py

```diff
@@ -583,12 +583,14 @@ class BertTokenizerFast(FastPreTrainedTokenizer):
             )
         )
 
         if max_length is not None:
-            self._tokenizer.with_truncation(max_length, stride, truncation_strategy)
+            self._tokenizer.with_truncation(
+                max_length, stride=stride, strategy=truncation_strategy
+            )
         self._tokenizer.with_padding(
-            max_length if pad_to_max_length else None,
-            self.padding_side,
-            self.pad_token_id,
-            self.pad_token_type_id,
-            self.pad_token,
+            max_length=max_length if pad_to_max_length else None,
+            direction=self.padding_side,
+            pad_id=self.pad_token_id,
+            pad_type_id=self.pad_token_type_id,
+            pad_token=self.pad_token,
         )
         self._decoder = tk.decoders.WordPiece.new()
```
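The move from positional to keyword arguments is defensive: if the tokenizers library reorders or inserts a parameter, positional calls rebind values silently, while keyword calls keep working or fail loudly. A self-contained illustration (pure Python; `with_truncation_v1`/`with_truncation_v2` are made-up stand-ins, not the real tokenizers API):

```python
# Made-up stand-ins for two revisions of the same API.
def with_truncation_v1(max_length, stride, strategy):
    return {"max_length": max_length, "stride": stride, "strategy": strategy}

def with_truncation_v2(max_length, strategy, stride=0):  # parameters reordered
    return {"max_length": max_length, "stride": stride, "strategy": strategy}

# Positional style (pre-commit): v2 silently takes the old stride as strategy.
print(with_truncation_v2(512, 0, "longest_first"))
# -> {'max_length': 512, 'stride': 'longest_first', 'strategy': 0}

# Keyword style (post-commit): immune to the reordering.
print(with_truncation_v2(512, stride=0, strategy="longest_first"))
# -> {'max_length': 512, 'stride': 0, 'strategy': 'longest_first'}
```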
src/transformers/tokenization_gpt2.py

```diff
@@ -274,15 +274,17 @@ class GPT2TokenizerFast(FastPreTrainedTokenizer):
         self._tokenizer = tk.Tokenizer(tk.models.BPE.from_files(vocab_file, merges_file))
         self._update_special_tokens()
-        self._tokenizer.with_pre_tokenizer(tk.pre_tokenizers.ByteLevel.new(add_prefix_space))
+        self._tokenizer.with_pre_tokenizer(tk.pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space))
         self._tokenizer.with_decoder(tk.decoders.ByteLevel.new())
 
         if max_length:
-            self._tokenizer.with_truncation(max_length, stride, truncation_strategy)
+            self._tokenizer.with_truncation(
+                max_length, stride=stride, strategy=truncation_strategy
+            )
         self._tokenizer.with_padding(
-            max_length if pad_to_max_length else None,
-            self.padding_side,
-            self.pad_token_id if self.pad_token_id is not None else 0,
-            self.pad_token_type_id,
-            self.pad_token if self.pad_token is not None else "",
+            max_length=max_length if pad_to_max_length else None,
+            direction=self.padding_side,
+            pad_id=self.pad_token_id if self.pad_token_id is not None else 0,
+            pad_type_id=self.pad_token_type_id,
+            pad_token=self.pad_token if self.pad_token is not None else "",
         )
         self._decoder = tk.decoders.ByteLevel.new()
```
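Besides the keyword conversion, this hunk keeps the existing guard for GPT-2's missing pad token: the model defines no pad token, so neutral defaults (`0` and `""`) are passed instead of `None`. The guard in isolation (a sketch; the two variables stand for the attributes read in the diff):

```python
# GPT-2 defines no pad token, so GPT2TokenizerFast reports None for both
# attributes; substitute neutral defaults rather than handing None to the
# tokenizers layer.
pad_token_id = None
pad_token = None

pad_id = pad_token_id if pad_token_id is not None else 0
padding_token = pad_token if pad_token is not None else ""
print(pad_id, repr(padding_token))  # -> 0 ''
```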
src/transformers/tokenization_utils.py

```diff
@@ -1430,10 +1430,10 @@ class FastPreTrainedTokenizer(PreTrainedTokenizer):
     @property
     def vocab_size(self):
-        return self.tokenizer.get_vocab_size(False)
+        return self.tokenizer.get_vocab_size(with_added_tokens=False)
 
     def __len__(self):
-        return self.tokenizer.get_vocab_size(True)
+        return self.tokenizer.get_vocab_size(with_added_tokens=True)
 
     def _update_special_tokens(self):
         self.tokenizer.add_special_tokens(self.all_special_tokens)
```
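Spelling out `with_added_tokens=` also documents the one flag that distinguishes the two methods: `vocab_size` counts only the base vocabulary, while `len(tokenizer)` includes tokens added at runtime. A toy model of that contract (pure Python; `ToyFastTokenizer` is hypothetical, not part of the commit):

```python
class ToyFastTokenizer:
    """Hypothetical stand-in mimicking the vocab_size / __len__ contract."""

    def __init__(self, base_vocab, added_tokens=()):
        self._base = dict(base_vocab)
        self._added = list(added_tokens)

    def get_vocab_size(self, with_added_tokens=True):
        return len(self._base) + (len(self._added) if with_added_tokens else 0)

    @property
    def vocab_size(self):
        # Base vocabulary only, as in the post-commit code.
        return self.get_vocab_size(with_added_tokens=False)

    def __len__(self):
        # Base vocabulary plus runtime additions.
        return self.get_vocab_size(with_added_tokens=True)

t = ToyFastTokenizer({"[PAD]": 0, "hello": 1}, added_tokens=["<new_token>"])
print(t.vocab_size, len(t))  # -> 2 3
```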