chenpangpang / transformers · Commit 50e615f4
"...composable_kernel_rocm.git" did not exist on "205e0365e36f9d41479777d4e3b432e39fccf10a"
Unverified commit, authored Aug 30, 2019 by Thomas Wolf; committed by GitHub on Aug 30, 2019.

Merge branch 'master' into improved_testing

Parents: f8aace6b, f7978490
Changes: 43 in this merge commit (this page shows 3 changed files, with 20 additions and 7 deletions: +20 −7).
pytorch_transformers/tokenization_utils.py   +12 −7
pytorch_transformers/tokenization_xlm.py      +4 −0
pytorch_transformers/tokenization_xlnet.py    +4 −0
pytorch_transformers/tokenization_utils.py (view file @ 50e615f4)
@@ -222,6 +222,9 @@ class PreTrainedTokenizer(object):
         self._additional_special_tokens = []
 
         self.max_len = max_len if max_len is not None else int(1e12)
+        self.max_len_single_sentence = self.max_len
+        self.max_len_sentences_pair = self.max_len
 
         self.added_tokens_encoder = {}
         self.added_tokens_decoder = {}

@@ -349,7 +352,7 @@ class PreTrainedTokenizer(object):
                     resolved_vocab_files[file_id] = None
                 else:
                     resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
-        except EnvironmentError:
+        except EnvironmentError as e:
             if pretrained_model_name_or_path in s3_models:
                 logger.error("Couldn't reach server to download vocabulary.")
             else:

@@ -359,7 +362,7 @@ class PreTrainedTokenizer(object):
                    "at this path or url.".format(
                        pretrained_model_name_or_path, ', '.join(s3_models),
                        pretrained_model_name_or_path, str(vocab_files.keys())))
-            return None
+            raise e
 
         for file_id, file_path in vocab_files.items():
             if file_path == resolved_vocab_files[file_id]:

@@ -653,10 +656,12 @@ class PreTrainedTokenizer(object):
         return first_sentence_tokens, second_sentence_tokens
 
     def add_special_tokens_single_sentence(self, token_ids):
-        raise NotImplementedError
+        logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
+        return token_ids
 
     def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
-        raise NotImplementedError
+        logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
+        return token_ids_0 + token_ids_1
 
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
         """ Converts a single index or a sequence of indices (integers) in a token "

@@ -699,9 +704,9 @@ class PreTrainedTokenizer(object):
         filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
         text = self.convert_tokens_to_string(filtered_tokens)
 
-        if self.sep_token is not None and self.sep_token in text:
-            text = text.replace(self.cls_token, self.sep_token)
-            split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self.sep_token)))
+        if self._sep_token is not None and self._sep_token in text:
+            text = text.replace(self._cls_token, self._sep_token)
+            split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self._sep_token)))
             if clean_up_tokenization_spaces:
                 clean_text = [self.clean_up_tokenization(text) for text in split_text]
                 return clean_text
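Taken together, the hunks above change three things in the base class: PreTrainedTokenizer.__init__ now records max_len_single_sentence and max_len_sentences_pair, from_pretrained re-raises the download EnvironmentError instead of returning None, and the add_special_tokens_* hooks fall back to a logged warning plus a pass-through instead of raising NotImplementedError. Below is a minimal standalone sketch of that new fallback behavior only; the class name and token ids are made up for illustration and this is not the library code itself.

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

class NoSpecialTokensTokenizer(object):
    # Illustrative stand-in for the two changed base-class hooks.

    def add_special_tokens_single_sentence(self, token_ids):
        # New behavior: warn and return the ids unchanged instead of raising NotImplementedError.
        logger.warning("This tokenizer does not make use of special tokens. "
                       "The sequence has been returned with no modification.")
        return token_ids

    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
        # New behavior: warn and concatenate the two id lists instead of raising.
        logger.warning("This tokenizer does not make use of special tokens. "
                       "The two sequences have been concatenated.")
        return token_ids_0 + token_ids_1

tok = NoSpecialTokensTokenizer()
print(tok.add_special_tokens_single_sentence([1, 2, 3]))      # [1, 2, 3]
print(tok.add_special_tokens_sentences_pair([1, 2], [3, 4]))  # [1, 2, 3, 4]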
pytorch_transformers/tokenization_xlm.py (view file @ 50e615f4)
@@ -122,6 +122,10 @@ class XLMTokenizer(PreTrainedTokenizer):
                                            cls_token=cls_token, mask_token=mask_token,
                                            additional_special_tokens=additional_special_tokens,
                                            **kwargs)
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
 
         try:
             import ftfy
             from spacy.lang.en import English
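The two added lines reserve room in the length budget for the special tokens XLM wraps around the input: 2 extra ids for a single sequence and 3 for a sequence pair, hence max_len - 2 and max_len - 3. A hedged usage sketch follows; it assumes pytorch_transformers is installed, network access is available to fetch the vocabulary, and uses the published shortcut name 'xlm-mlm-en-2048'.

# Sketch only: requires pytorch_transformers and a network connection.
from pytorch_transformers import XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')

# After this commit the per-sequence budgets already account for the special tokens.
assert tokenizer.max_len_single_sentence == tokenizer.max_len - 2
assert tokenizer.max_len_sentences_pair == tokenizer.max_len - 3
print(tokenizer.max_len, tokenizer.max_len_single_sentence, tokenizer.max_len_sentences_pair)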
pytorch_transformers/tokenization_xlnet.py (view file @ 50e615f4)
@@ -71,6 +71,10 @@ class XLNetTokenizer(PreTrainedTokenizer):
                                             pad_token=pad_token, cls_token=cls_token,
                                             mask_token=mask_token, additional_special_tokens=
                                             additional_special_tokens, **kwargs)
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
 
         try:
             import sentencepiece as spm
         except ImportError:
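XLNetTokenizer gets the same two budget attributes. As a hypothetical illustration of how a caller might use them (this helper is not part of the repository), a tokenized pair can be trimmed to max_len_sentences_pair before the special tokens are added:

# Illustrative helper only: trim two already-tokenized id lists so that their
# combined length fits the pair budget exposed by the tokenizer.
def truncate_pair_to_budget(ids_a, ids_b, max_len_sentences_pair):
    # Repeatedly drop the last id of the longer list until the pair fits.
    while len(ids_a) + len(ids_b) > max_len_sentences_pair:
        if len(ids_a) >= len(ids_b):
            ids_a = ids_a[:-1]
        else:
            ids_b = ids_b[:-1]
    return ids_a, ids_b

# Example with made-up numbers: a budget of 8 ids for the pair itself.
a, b = truncate_pair_to_budget(list(range(6)), list(range(5)), 8)
print(a, b, len(a) + len(b))  # combined length is now at most 8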