chenpangpang / transformers / Commits

Commit 50e615f4 (unverified)
Authored Aug 30, 2019 by Thomas Wolf; committed by GitHub on Aug 30, 2019

Merge branch 'master' into improved_testing

Parents: f8aace6b, f7978490
Changes: 43

Showing 3 changed files with 20 additions and 7 deletions (+20 / -7):

    pytorch_transformers/tokenization_utils.py   +12  -7
    pytorch_transformers/tokenization_xlm.py      +4  -0
    pytorch_transformers/tokenization_xlnet.py    +4  -0
pytorch_transformers/tokenization_utils.py (view file @ 50e615f4)

@@ -222,6 +222,9 @@ class PreTrainedTokenizer(object):
         self._additional_special_tokens = []
 
         self.max_len = max_len if max_len is not None else int(1e12)
+        self.max_len_single_sentence = self.max_len
+        self.max_len_sentences_pair = self.max_len
+
         self.added_tokens_encoder = {}
         self.added_tokens_decoder = {}
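For context (not part of the diff): with these two new assignments the base class defaults max_len_single_sentence and max_len_sentences_pair to the full max_len, and tokenizers whose encoding adds special tokens override them, as the XLM and XLNet changes below do. A minimal sketch, assuming a GPT-2 tokenizer, which adds no special tokens and so keeps the base-class defaults:

    from pytorch_transformers import GPT2Tokenizer

    # GPT-2 adds no special tokens, so the defaults introduced above apply unchanged:
    # the usable length for a single sentence or a sentence pair is the model's max_len.
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    assert tokenizer.max_len_single_sentence == tokenizer.max_len
    assert tokenizer.max_len_sentences_pair == tokenizer.max_len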
@@ -349,7 +352,7 @@ class PreTrainedTokenizer(object):
                     resolved_vocab_files[file_id] = None
                 else:
                     resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
-        except EnvironmentError:
+        except EnvironmentError as e:
             if pretrained_model_name_or_path in s3_models:
                 logger.error("Couldn't reach server to download vocabulary.")
             else:
@@ -359,7 +362,7 @@ class PreTrainedTokenizer(object):
                     "at this path or url.".format(
                         pretrained_model_name_or_path, ', '.join(s3_models),
                         pretrained_model_name_or_path, str(vocab_files.keys())))
-            return None
+            raise e
 
         for file_id, file_path in vocab_files.items():
             if file_path == resolved_vocab_files[file_id]:
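For context (not part of the diff): with return None replaced by raise e, from_pretrained no longer silently returns None when the vocabulary files cannot be downloaded or found; it logs the error and re-raises the original EnvironmentError, so calling code should catch the exception rather than check for None. A minimal sketch, using an obviously made-up model name:

    from pytorch_transformers import BertTokenizer

    # Before this change a bad name or path yielded None; now the EnvironmentError propagates.
    try:
        tokenizer = BertTokenizer.from_pretrained('this-model-does-not-exist')
    except EnvironmentError as error:
        print("Could not load tokenizer files: {}".format(error))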
@@ -653,10 +656,12 @@ class PreTrainedTokenizer(object):
         return first_sentence_tokens, second_sentence_tokens
 
     def add_special_tokens_single_sentence(self, token_ids):
-        raise NotImplementedError
+        logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
+        return token_ids
 
     def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
-        raise NotImplementedError
+        logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
+        return token_ids_0 + token_ids_1
 
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
         """ Converts a single index or a sequence of indices (integers) in a token "
@@ -699,9 +704,9 @@ class PreTrainedTokenizer(object):
         filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
         text = self.convert_tokens_to_string(filtered_tokens)
 
-        if self.sep_token is not None and self.sep_token in text:
-            text = text.replace(self.cls_token, self.sep_token)
-            split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self.sep_token)))
+        if self._sep_token is not None and self._sep_token in text:
+            text = text.replace(self._cls_token, self._sep_token)
+            split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self._sep_token)))
             if clean_up_tokenization_spaces:
                 clean_text = [self.clean_up_tokenization(text) for text in split_text]
                 return clean_text
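For context (not part of the diff): decode() now checks the private _sep_token / _cls_token attributes instead of the sep_token / cls_token properties. The properties log an error along the lines of "Using sep_token, but it is not set yet." and return None when the token was never defined, so reading the underscore attributes lets decode() skip the sentence-splitting branch quietly for tokenizers without a separator. A minimal sketch of the difference, assuming a GPT-2 tokenizer, which defines neither token:

    from pytorch_transformers import GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # The private attribute is simply None; the public property additionally logs an error.
    assert tokenizer._sep_token is None
    print(tokenizer.sep_token)  # logs the "not set yet" error and prints None

    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hello world"))
    print(tokenizer.decode(ids))  # after this change, no spurious sep_token error log here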
pytorch_transformers/tokenization_xlm.py (view file @ 50e615f4)

@@ -122,6 +122,10 @@ class XLMTokenizer(PreTrainedTokenizer):
                                            cls_token=cls_token, mask_token=mask_token,
                                            additional_special_tokens=additional_special_tokens,
                                            **kwargs)
+
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+
         try:
             import ftfy
             from spacy.lang.en import English
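For context (not part of the diff): the -2 / -3 follow from XLM's encoding, which wraps a single sequence in a classification token and a separator (2 extra tokens) and a pair in a classification token plus two separators (3 extra tokens). A minimal sketch, assuming the xlm-mlm-en-2048 checkpoint and that XLMTokenizer overrides add_special_tokens_* accordingly:

    from pytorch_transformers import XLMTokenizer

    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hello world"))

    # Single sequence: [CLS] A [SEP] adds 2 tokens; pair: [CLS] A [SEP] B [SEP] adds 3.
    assert tokenizer.max_len_single_sentence == tokenizer.max_len - 2
    assert tokenizer.max_len_sentences_pair == tokenizer.max_len - 3
    assert len(tokenizer.add_special_tokens_single_sentence(ids)) == len(ids) + 2
    assert len(tokenizer.add_special_tokens_sentences_pair(ids, ids)) == 2 * len(ids) + 3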
pytorch_transformers/tokenization_xlnet.py (view file @ 50e615f4)

@@ -71,6 +71,10 @@ class XLNetTokenizer(PreTrainedTokenizer):
                                             pad_token=pad_token, cls_token=cls_token,
                                             mask_token=mask_token, additional_special_tokens=additional_special_tokens,
                                             **kwargs)
+
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+
         try:
             import sentencepiece as spm
         except ImportError:
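For context (not part of the diff): XLNet uses the same budget but places its classification token at the end of the sequence (A <sep> <cls> for one sentence, A <sep> B <sep> <cls> for a pair), which is still 2 and 3 extra tokens respectively. A minimal sketch, assuming the xlnet-base-cased checkpoint:

    from pytorch_transformers import XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

    # Same accounting as XLM, even though the layout differs (<cls> goes at the end for XLNet).
    assert tokenizer.max_len_single_sentence == tokenizer.max_len - 2
    assert tokenizer.max_len_sentences_pair == tokenizer.max_len - 3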