Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
c665e0fc
Commit
c665e0fc
authored
Aug 30, 2019
by
thomwolf
Browse files
Merge branch 'automodels' of
https://github.com/huggingface/pytorch-transformers
into automodels
parents
447afe9c
9b6e3b34
Changes
44
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
22 additions
and
5 deletions
+22
-5
pytorch_transformers/tokenization_transfo_xl.py
pytorch_transformers/tokenization_transfo_xl.py
+4
-0
pytorch_transformers/tokenization_utils.py
pytorch_transformers/tokenization_utils.py
+10
-5
pytorch_transformers/tokenization_xlm.py
pytorch_transformers/tokenization_xlm.py
+4
-0
pytorch_transformers/tokenization_xlnet.py
pytorch_transformers/tokenization_xlnet.py
+4
-0
No files found.
pytorch_transformers/tokenization_transfo_xl.py
View file @
c665e0fc
...
@@ -73,6 +73,10 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
...
@@ -73,6 +73,10 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
super
(
TransfoXLTokenizer
,
self
).
__init__
(
unk_token
=
unk_token
,
eos_token
=
eos_token
,
super
(
TransfoXLTokenizer
,
self
).
__init__
(
unk_token
=
unk_token
,
eos_token
=
eos_token
,
additional_special_tokens
=
additional_special_tokens
,
additional_special_tokens
=
additional_special_tokens
,
**
kwargs
)
**
kwargs
)
self
.
max_len_single_sentence
=
self
.
max_len
# no default special tokens - you can update this value if you add special tokens
self
.
max_len_sentences_pair
=
self
.
max_len
# no default special tokens - you can update this value if you add special tokens
if
never_split
is
None
:
if
never_split
is
None
:
never_split
=
self
.
all_special_tokens
never_split
=
self
.
all_special_tokens
if
special
is
None
:
if
special
is
None
:
...
...
pytorch_transformers/tokenization_utils.py
View file @
c665e0fc
...
@@ -166,6 +166,9 @@ class PreTrainedTokenizer(object):
...
@@ -166,6 +166,9 @@ class PreTrainedTokenizer(object):
self
.
_additional_special_tokens
=
[]
self
.
_additional_special_tokens
=
[]
self
.
max_len
=
max_len
if
max_len
is
not
None
else
int
(
1e12
)
self
.
max_len
=
max_len
if
max_len
is
not
None
else
int
(
1e12
)
self
.
max_len_single_sentence
=
self
.
max_len
self
.
max_len_sentences_pair
=
self
.
max_len
self
.
added_tokens_encoder
=
{}
self
.
added_tokens_encoder
=
{}
self
.
added_tokens_decoder
=
{}
self
.
added_tokens_decoder
=
{}
...
@@ -590,10 +593,12 @@ class PreTrainedTokenizer(object):
...
@@ -590,10 +593,12 @@ class PreTrainedTokenizer(object):
return
first_sentence_tokens
,
second_sentence_tokens
return
first_sentence_tokens
,
second_sentence_tokens
def
add_special_tokens_single_sentence
(
self
,
token_ids
):
def
add_special_tokens_single_sentence
(
self
,
token_ids
):
raise
NotImplementedError
logger
.
warning
(
"This tokenizer does not make use of special tokens. The sequence has been returned with no modification."
)
return
token_ids
def
add_special_tokens_sentences_pair
(
self
,
token_ids_0
,
token_ids_1
):
def
add_special_tokens_sentences_pair
(
self
,
token_ids_0
,
token_ids_1
):
raise
NotImplementedError
logger
.
warning
(
"This tokenizer does not make use of special tokens. The two sequences have been concatenated."
)
return
token_ids_0
+
token_ids_1
def
convert_ids_to_tokens
(
self
,
ids
,
skip_special_tokens
=
False
):
def
convert_ids_to_tokens
(
self
,
ids
,
skip_special_tokens
=
False
):
""" Converts a single index or a sequence of indices (integers) in a token "
""" Converts a single index or a sequence of indices (integers) in a token "
...
@@ -636,9 +641,9 @@ class PreTrainedTokenizer(object):
...
@@ -636,9 +641,9 @@ class PreTrainedTokenizer(object):
filtered_tokens
=
self
.
convert_ids_to_tokens
(
token_ids
,
skip_special_tokens
=
skip_special_tokens
)
filtered_tokens
=
self
.
convert_ids_to_tokens
(
token_ids
,
skip_special_tokens
=
skip_special_tokens
)
text
=
self
.
convert_tokens_to_string
(
filtered_tokens
)
text
=
self
.
convert_tokens_to_string
(
filtered_tokens
)
if
self
.
sep_token
is
not
None
and
self
.
sep_token
in
text
:
if
self
.
_
sep_token
is
not
None
and
self
.
_
sep_token
in
text
:
text
=
text
.
replace
(
self
.
cls_token
,
self
.
sep_token
)
text
=
text
.
replace
(
self
.
_
cls_token
,
self
.
_
sep_token
)
split_text
=
list
(
filter
(
lambda
sentence
:
len
(
sentence
)
>
0
,
text
.
split
(
self
.
sep_token
)))
split_text
=
list
(
filter
(
lambda
sentence
:
len
(
sentence
)
>
0
,
text
.
split
(
self
.
_
sep_token
)))
if
clean_up_tokenization_spaces
:
if
clean_up_tokenization_spaces
:
clean_text
=
[
self
.
clean_up_tokenization
(
text
)
for
text
in
split_text
]
clean_text
=
[
self
.
clean_up_tokenization
(
text
)
for
text
in
split_text
]
return
clean_text
return
clean_text
...
...
pytorch_transformers/tokenization_xlm.py
View file @
c665e0fc
...
@@ -122,6 +122,10 @@ class XLMTokenizer(PreTrainedTokenizer):
...
@@ -122,6 +122,10 @@ class XLMTokenizer(PreTrainedTokenizer):
cls_token
=
cls_token
,
mask_token
=
mask_token
,
cls_token
=
cls_token
,
mask_token
=
mask_token
,
additional_special_tokens
=
additional_special_tokens
,
additional_special_tokens
=
additional_special_tokens
,
**
kwargs
)
**
kwargs
)
self
.
max_len_single_sentence
=
self
.
max_len
-
2
# take into account special tokens
self
.
max_len_sentences_pair
=
self
.
max_len
-
3
# take into account special tokens
try
:
try
:
import
ftfy
import
ftfy
from
spacy.lang.en
import
English
from
spacy.lang.en
import
English
...
...
pytorch_transformers/tokenization_xlnet.py
View file @
c665e0fc
...
@@ -71,6 +71,10 @@ class XLNetTokenizer(PreTrainedTokenizer):
...
@@ -71,6 +71,10 @@ class XLNetTokenizer(PreTrainedTokenizer):
pad_token
=
pad_token
,
cls_token
=
cls_token
,
pad_token
=
pad_token
,
cls_token
=
cls_token
,
mask_token
=
mask_token
,
additional_special_tokens
=
mask_token
=
mask_token
,
additional_special_tokens
=
additional_special_tokens
,
**
kwargs
)
additional_special_tokens
,
**
kwargs
)
self
.
max_len_single_sentence
=
self
.
max_len
-
2
# take into account special tokens
self
.
max_len_sentences_pair
=
self
.
max_len
-
3
# take into account special tokens
try
:
try
:
import
sentencepiece
as
spm
import
sentencepiece
as
spm
except
ImportError
:
except
ImportError
:
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment