chenpangpang / transformers · Commits

Commit c665e0fc
Authored Aug 30, 2019 by thomwolf

Merge branch 'automodels' of https://github.com/huggingface/pytorch-transformers into automodels

Parents: 447afe9c, 9b6e3b34
Changes: 44
Showing 4 changed files with 22 additions and 5 deletions (+22 / -5)
pytorch_transformers/tokenization_transfo_xl.py   +4  -0
pytorch_transformers/tokenization_utils.py        +10 -5
pytorch_transformers/tokenization_xlm.py          +4  -0
pytorch_transformers/tokenization_xlnet.py        +4  -0
pytorch_transformers/tokenization_transfo_xl.py

```diff
@@ -73,6 +73,10 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         super(TransfoXLTokenizer, self).__init__(unk_token=unk_token, eos_token=eos_token,
                                                  additional_special_tokens=additional_special_tokens,
                                                  **kwargs)
+
+        self.max_len_single_sentence = self.max_len  # no default special tokens - you can update this value if you add special tokens
+        self.max_len_sentences_pair = self.max_len  # no default special tokens - you can update this value if you add special tokens
+
         if never_split is None:
             never_split = self.all_special_tokens
         if special is None:
```
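The practical effect of this change, as a minimal sketch (not part of the commit; assumes the pytorch-transformers 1.x API and the `transfo-xl-wt103` checkpoint name): since the Transfo-XL tokenizer adds no special tokens around sequences, the full `max_len` budget stays available for both single sentences and sentence pairs.

```python
# Minimal sketch only: illustrates the new attributes on TransfoXLTokenizer.
from pytorch_transformers import TransfoXLTokenizer

tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103")

# No special tokens are added around sequences, so nothing is reserved.
assert tokenizer.max_len_single_sentence == tokenizer.max_len
assert tokenizer.max_len_sentences_pair == tokenizer.max_len
```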
pytorch_transformers/tokenization_utils.py

```diff
@@ -166,6 +166,9 @@ class PreTrainedTokenizer(object):
         self._additional_special_tokens = []
 
         self.max_len = max_len if max_len is not None else int(1e12)
+        self.max_len_single_sentence = self.max_len
+        self.max_len_sentences_pair = self.max_len
+
         self.added_tokens_encoder = {}
         self.added_tokens_decoder = {}
```
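For reference, a small sketch of the base-class defaults introduced here (attribute names from the diff; instantiating `PreTrainedTokenizer` directly is for illustration only, it is normally subclassed). Subclasses that reserve room for special tokens are expected to overwrite the two new attributes after calling the parent constructor, as the XLM and XLNet tokenizers below do.

```python
# Illustration only: with no override, both limits default to max_len.
from pytorch_transformers.tokenization_utils import PreTrainedTokenizer

tok = PreTrainedTokenizer(max_len=512)
assert tok.max_len == 512
assert tok.max_len_single_sentence == 512   # nothing reserved by default
assert tok.max_len_sentences_pair == 512

# When max_len is not given it falls back to a huge sentinel, int(1e12).
unbounded = PreTrainedTokenizer()
assert unbounded.max_len == int(1e12)
```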
@@ -590,10 +593,12 @@ class PreTrainedTokenizer(object):
...
@@ -590,10 +593,12 @@ class PreTrainedTokenizer(object):
return
first_sentence_tokens
,
second_sentence_tokens
return
first_sentence_tokens
,
second_sentence_tokens
def
add_special_tokens_single_sentence
(
self
,
token_ids
):
def
add_special_tokens_single_sentence
(
self
,
token_ids
):
raise
NotImplementedError
logger
.
warning
(
"This tokenizer does not make use of special tokens. The sequence has been returned with no modification."
)
return
token_ids
def
add_special_tokens_sentences_pair
(
self
,
token_ids_0
,
token_ids_1
):
def
add_special_tokens_sentences_pair
(
self
,
token_ids_0
,
token_ids_1
):
raise
NotImplementedError
logger
.
warning
(
"This tokenizer does not make use of special tokens. The two sequences have been concatenated."
)
return
token_ids_0
+
token_ids_1
def
convert_ids_to_tokens
(
self
,
ids
,
skip_special_tokens
=
False
):
def
convert_ids_to_tokens
(
self
,
ids
,
skip_special_tokens
=
False
):
""" Converts a single index or a sequence of indices (integers) in a token "
""" Converts a single index or a sequence of indices (integers) in a token "
...
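The behavioural change in this hunk: tokenizers that do not override the two `add_special_tokens_*` hooks no longer raise `NotImplementedError`; they log a warning and return the ids unchanged (or simply concatenated). A hedged sketch, assuming a tokenizer class that leaves these hooks unimplemented (the Transfo-XL tokenizer above appears to be one such case):

```python
# Sketch of the new fallback behaviour (assumes the tokenizer does not
# override the add_special_tokens_* hooks).
from pytorch_transformers import TransfoXLTokenizer

tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103")
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("hello world"))

single = tokenizer.add_special_tokens_single_sentence(ids)    # warns, returns ids unchanged
pair = tokenizer.add_special_tokens_sentences_pair(ids, ids)  # warns, concatenates

assert single == ids
assert pair == ids + ids
```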
@@ -636,9 +641,9 @@ class PreTrainedTokenizer(object):
...
@@ -636,9 +641,9 @@ class PreTrainedTokenizer(object):
filtered_tokens
=
self
.
convert_ids_to_tokens
(
token_ids
,
skip_special_tokens
=
skip_special_tokens
)
filtered_tokens
=
self
.
convert_ids_to_tokens
(
token_ids
,
skip_special_tokens
=
skip_special_tokens
)
text
=
self
.
convert_tokens_to_string
(
filtered_tokens
)
text
=
self
.
convert_tokens_to_string
(
filtered_tokens
)
if
self
.
sep_token
is
not
None
and
self
.
sep_token
in
text
:
if
self
.
_
sep_token
is
not
None
and
self
.
_
sep_token
in
text
:
text
=
text
.
replace
(
self
.
cls_token
,
self
.
sep_token
)
text
=
text
.
replace
(
self
.
_
cls_token
,
self
.
_
sep_token
)
split_text
=
list
(
filter
(
lambda
sentence
:
len
(
sentence
)
>
0
,
text
.
split
(
self
.
sep_token
)))
split_text
=
list
(
filter
(
lambda
sentence
:
len
(
sentence
)
>
0
,
text
.
split
(
self
.
_
sep_token
)))
if
clean_up_tokenization_spaces
:
if
clean_up_tokenization_spaces
:
clean_text
=
[
self
.
clean_up_tokenization
(
text
)
for
text
in
split_text
]
clean_text
=
[
self
.
clean_up_tokenization
(
text
)
for
text
in
split_text
]
return
clean_text
return
clean_text
...
...
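The switch from the public properties to the private `_sep_token` / `_cls_token` attributes matters for tokenizers that define no separator token: in this version of the library the public properties log an error when the token has not been set, whereas the private attributes are simply `None`, so decoding skips the sentence-splitting branch quietly. A rough sketch of the intended call (checkpoint name assumed; GPT-2 defines no sep token):

```python
# Decoding with a tokenizer that has no sep/cls token should no longer emit
# "Using sep_token, but it is not set yet." style error logs.
from pytorch_transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("hello world"))
text = tokenizer.decode(ids, clean_up_tokenization_spaces=True)
```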
pytorch_transformers/tokenization_xlm.py

```diff
@@ -122,6 +122,10 @@ class XLMTokenizer(PreTrainedTokenizer):
                                            cls_token=cls_token, mask_token=mask_token,
                                            additional_special_tokens=additional_special_tokens,
                                            **kwargs)
+
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+
         try:
             import ftfy
             from spacy.lang.en import English
```
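A hedged sketch of what the XLM numbers mean (checkpoint name and API assumed): XLM wraps a single sentence in two special tokens and a sentence pair in three, so those positions are subtracted from the usable budget.

```python
from pytorch_transformers import XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")

# Two special tokens wrap a single sentence, three are needed for a pair.
assert tokenizer.max_len_single_sentence == tokenizer.max_len - 2
assert tokenizer.max_len_sentences_pair == tokenizer.max_len - 3
```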
pytorch_transformers/tokenization_xlnet.py

```diff
@@ -71,6 +71,10 @@ class XLNetTokenizer(PreTrainedTokenizer):
                                             pad_token=pad_token, cls_token=cls_token,
                                             mask_token=mask_token, additional_special_tokens=
                                             additional_special_tokens, **kwargs)
+
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+
         try:
             import sentencepiece as spm
         except ImportError:
```
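The same reservation applies to XLNet, whose formatting appends its special tokens at the end of the sequence. A minimal sketch (checkpoint name assumed) of using the new attribute to truncate raw ids before special tokens are added:

```python
from pytorch_transformers import XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

# Truncate raw ids to the reserved-aware limit so the final sequence,
# including the two trailing special tokens, still fits in max_len.
ids = tokenizer.encode("a very long document " * 1000)
ids = ids[: tokenizer.max_len_single_sentence]
assert len(ids) + 2 <= tokenizer.max_len
```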