chenpangpang/transformers

Unverified commit c36416e5, authored Feb 22, 2020 by Joe Davison, committed by GitHub on Feb 22, 2020.

Add standardized get_vocab method to tokenizers

Parents: cafc4dfc, 197d74f9
Showing 12 changed files with 62 additions and 0 deletions.
src/transformers/tokenization_albert.py        +5 -0
src/transformers/tokenization_bert.py          +3 -0
src/transformers/tokenization_ctrl.py          +3 -0
src/transformers/tokenization_gpt2.py          +3 -0
src/transformers/tokenization_openai.py        +3 -0
src/transformers/tokenization_t5.py            +5 -0
src/transformers/tokenization_transfo_xl.py    +3 -0
src/transformers/tokenization_utils.py         +4 -0
src/transformers/tokenization_xlm.py           +3 -0
src/transformers/tokenization_xlm_roberta.py   +5 -0
src/transformers/tokenization_xlnet.py         +5 -0
tests/test_tokenization_common.py              +20 -0
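Taken together, these changes give every tokenizer in the library the same entry point for inspecting its full vocabulary, including tokens added after loading. A minimal usage sketch, assuming the public "bert-base-uncased" and "gpt2" checkpoints are available (neither appears in the diff itself):

    from transformers import BertTokenizer, GPT2Tokenizer

    # Every tokenizer touched by this commit now exposes the same method.
    for cls, name in [(BertTokenizer, "bert-base-uncased"), (GPT2Tokenizer, "gpt2")]:
        tokenizer = cls.from_pretrained(name)
        vocab = tokenizer.get_vocab()        # {token: id} for the full vocabulary
        assert isinstance(vocab, dict)
        assert len(vocab) == len(tokenizer)  # added tokens are counted on both sides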
src/transformers/tokenization_albert.py

@@ -114,6 +114,11 @@ class AlbertTokenizer(PreTrainedTokenizer):
    def vocab_size(self):
        return len(self.sp_model)

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
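The SentencePiece-backed tokenizers (ALBERT here, and T5, XLM-RoBERTa, and XLNet below) keep no Python dict of their vocabulary, so get_vocab materializes one by round-tripping every id through convert_ids_to_tokens and then overlaying added_tokens_encoder; added tokens receive ids at or above vocab_size, so the comprehension alone would miss them. A standalone sketch of the same pattern, with a hypothetical tokenizer argument:

    def build_token_to_id(tokenizer):
        # Ask the tokenizer for the token string of every id in the base vocabulary...
        vocab = {tokenizer.convert_ids_to_tokens(i): i for i in range(tokenizer.vocab_size)}
        # ...then overlay user-added tokens, whose ids start at vocab_size.
        vocab.update(tokenizer.added_tokens_encoder)
        return vocab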
src/transformers/tokenization_bert.py

@@ -195,6 +195,9 @@ class BertTokenizer(PreTrainedTokenizer):
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab, **self.added_tokens_encoder)

    def _tokenize(self, text):
        split_tokens = []
        if self.do_basic_tokenize:
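BertTokenizer (and the BPE-based tokenizers below) already holds the complete token-to-id mapping in memory, so get_vocab simply merges it with added_tokens_encoder. The dict(a, **b) idiom builds a fresh dict, so callers can modify the result without mutating the tokenizer's own tables. A tiny sketch with made-up values:

    base = {"[PAD]": 0, "hello": 7}          # stand-in for self.vocab
    added = {"<new_token>": 30522}           # stand-in for self.added_tokens_encoder
    merged = dict(base, **added)             # copy base, then overlay the added tokens
    assert merged == {"[PAD]": 0, "hello": 7, "<new_token>": 30522}
    assert base == {"[PAD]": 0, "hello": 7}  # original mapping untouched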
src/transformers/tokenization_ctrl.py

@@ -147,6 +147,9 @@ class CTRLTokenizer(PreTrainedTokenizer):
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
src/transformers/tokenization_gpt2.py

@@ -149,6 +149,9 @@ class GPT2Tokenizer(PreTrainedTokenizer):
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
src/transformers/tokenization_openai.py

@@ -125,6 +125,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        if token in self.cache:
src/transformers/tokenization_t5.py

@@ -119,6 +119,11 @@ class T5Tokenizer(PreTrainedTokenizer):
    def vocab_size(self):
        return self.sp_model.get_piece_size() + self._extra_ids

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
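T5's vocab_size also counts its _extra_ids sentinel tokens (100 by default, named <extra_id_0> through <extra_id_99>), which do not live in the SentencePiece model; the id round-trip in get_vocab is what pulls them into the returned dict. A hedged check, assuming the public "t5-small" checkpoint and the sentencepiece package are available:

    from transformers import T5Tokenizer

    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    vocab = tokenizer.get_vocab()
    assert "<extra_id_0>" in vocab                             # sentinel tokens are included
    assert len(vocab) >= tokenizer.sp_model.get_piece_size()   # more entries than the sp model alone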
src/transformers/tokenization_transfo_xl.py

@@ -273,6 +273,9 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
    def vocab_size(self):
        return len(self.idx2sym)

    def get_vocab(self):
        return dict(self.sym2idx, **self.added_tokens_encoder)

    def _tokenize(self, line, add_eos=False, add_double_eos=False):
        line = line.strip()
        # convert to lower case
src/transformers/tokenization_utils.py

@@ -286,6 +286,10 @@ class PreTrainedTokenizer(object):
        """ Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """
        return self.convert_tokens_to_ids(self.additional_special_tokens)

    def get_vocab(self):
        """ Returns the vocabulary as a dict of {token: index} pairs. `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the vocab. """
        raise NotImplementedError()

    def __init__(self, max_len=None, **kwargs):
        self._bos_token = None
        self._eos_token = None
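The base class only documents the contract and raises NotImplementedError, leaving each subclass to supply the mapping; the docstring pins down the invariant that get_vocab()[token] agrees with convert_tokens_to_ids(token) for every token in the returned dict. A small hypothetical helper that checks that invariant against any concrete tokenizer:

    def check_get_vocab_contract(tokenizer):
        # Every (token, index) pair returned by get_vocab() should round-trip
        # through convert_tokens_to_ids().
        vocab = tokenizer.get_vocab()
        for token, index in vocab.items():
            assert tokenizer.convert_tokens_to_ids(token) == index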
src/transformers/tokenization_xlm.py

@@ -662,6 +662,9 @@ class XLMTokenizer(PreTrainedTokenizer):
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        if token in self.cache:
src/transformers/tokenization_xlm_roberta.py

@@ -190,6 +190,11 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
    def vocab_size(self):
        return len(self.sp_model) + len(self.fairseq_tokens_to_ids)

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        return self.sp_model.EncodeAsPieces(text)
src/transformers/tokenization_xlnet.py

@@ -114,6 +114,11 @@ class XLNetTokenizer(PreTrainedTokenizer):
    def vocab_size(self):
        return len(self.sp_model)

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
tests/test_tokenization_common.py

@@ -542,3 +542,23 @@ class TokenizerTesterMixin:
        print(new_tokenizer.init_kwargs)
        assert tokenizer.init_kwargs["random_argument"] is True
        assert new_tokenizer.init_kwargs["random_argument"] is False

    def test_get_vocab(self):
        tokenizer = self.get_tokenizer()
        vocab = tokenizer.get_vocab()

        self.assertIsInstance(vocab, dict)
        self.assertEqual(len(vocab), len(tokenizer))

        for word, ind in vocab.items():
            self.assertEqual(tokenizer.convert_tokens_to_ids(word), ind)
            self.assertEqual(tokenizer.convert_ids_to_tokens(ind), word)

        tokenizer.add_tokens(["asdfasdfasdfasdf"])
        vocab = tokenizer.get_vocab()
        self.assertIsInstance(vocab, dict)
        self.assertEqual(len(vocab), len(tokenizer))

        for word, ind in vocab.items():
            self.assertEqual(tokenizer.convert_tokens_to_ids(word), ind)
            self.assertEqual(tokenizer.convert_ids_to_tokens(ind), word)