chenpangpang / transformers · Commits

Commit e468192e, authored Jul 09, 2019 by thomwolf

    Merge branch 'pytorch-transformers' into xlnet

Parents: 9dd2c860, 4ce237c8
Changes: 84 (showing 4 changed files with 879 additions and 7 deletions)

    pytorch_transformers/tokenization_utils.py   +472  -0
    pytorch_transformers/tokenization_xlm.py     +212  -0
    pytorch_transformers/tokenization_xlnet.py   +188  -0
    setup.py                                       +7  -7
pytorch_transformers/tokenization_utils.py (new file, 0 → 100644)
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import logging
import os
import json
import six
from io import open

from .file_utils import cached_path

logger = logging.getLogger(__name__)

SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json'
ADDED_TOKENS_FILE = 'added_tokens.json'


class PreTrainedTokenizer(object):
    """ An abstract class to handle downloading and loading pretrained tokenizers and adding tokens to the vocabulary.

        Derived classes can set up a few special tokens to be used in common scripts and internals:
            bos_token, eos_token, EOP_TOKEN, EOD_TOKEN, unk_token, sep_token, pad_token, cls_token, mask_token
            additional_special_tokens = []

        We defined an added_tokens_encoder to add new tokens to the vocabulary without having to handle the
        specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    """
    vocab_files_names = {}
    pretrained_vocab_files_map = {}
    max_model_input_sizes = {}

    SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token", "sep_token",
                                 "pad_token", "cls_token", "mask_token",
                                 "additional_special_tokens"]

    @property
    def bos_token(self):
        if self._bos_token is None:
            logger.error("Using bos_token, but it is not set yet.")
        return self._bos_token

    @property
    def eos_token(self):
        if self._eos_token is None:
            logger.error("Using eos_token, but it is not set yet.")
        return self._eos_token

    @property
    def unk_token(self):
        if self._unk_token is None:
            logger.error("Using unk_token, but it is not set yet.")
        return self._unk_token

    @property
    def sep_token(self):
        if self._sep_token is None:
            logger.error("Using sep_token, but it is not set yet.")
        return self._sep_token

    @property
    def pad_token(self):
        if self._pad_token is None:
            logger.error("Using pad_token, but it is not set yet.")
        return self._pad_token

    @property
    def cls_token(self):
        if self._cls_token is None:
            logger.error("Using cls_token, but it is not set yet.")
        return self._cls_token

    @property
    def mask_token(self):
        if self._mask_token is None:
            logger.error("Using mask_token, but it is not set yet.")
        return self._mask_token

    @property
    def additional_special_tokens(self):
        if self._additional_special_tokens is None:
            logger.error("Using additional_special_tokens, but it is not set yet.")
        return self._additional_special_tokens

    @bos_token.setter
    def bos_token(self, value):
        self._bos_token = value

    @eos_token.setter
    def eos_token(self, value):
        self._eos_token = value

    @unk_token.setter
    def unk_token(self, value):
        self._unk_token = value

    @sep_token.setter
    def sep_token(self, value):
        self._sep_token = value

    @pad_token.setter
    def pad_token(self, value):
        self._pad_token = value

    @cls_token.setter
    def cls_token(self, value):
        self._cls_token = value

    @mask_token.setter
    def mask_token(self, value):
        self._mask_token = value

    @additional_special_tokens.setter
    def additional_special_tokens(self, value):
        self._additional_special_tokens = value

    def __init__(self, max_len=None, **kwargs):
        self._bos_token = None
        self._eos_token = None
        self._unk_token = None
        self._sep_token = None
        self._pad_token = None
        self._cls_token = None
        self._mask_token = None
        self._additional_special_tokens = []

        self.max_len = max_len if max_len is not None else int(1e12)
        self.added_tokens_encoder = {}
        self.added_tokens_decoder = {}

        for key, value in kwargs.items():
            if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                setattr(self, key, value)

    @classmethod
    def from_pretrained(cls, *inputs, **kwargs):
        return cls._from_pretrained(*inputs, **kwargs)

    @classmethod
    def _from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
        """
        Instantiate a PreTrainedTokenizer from pre-trained vocabulary files.
        Download and cache the vocabulary files if needed.
        """
        s3_models = list(cls.max_model_input_sizes.keys())
        vocab_files = {}
        if pretrained_model_name_or_path in s3_models:
            for file_id, map_list in cls.pretrained_vocab_files_map.items():
                vocab_files[file_id] = map_list[pretrained_model_name_or_path]
        else:
            all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
                                     'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
            all_vocab_files_names.update(cls.vocab_files_names)
            for file_id, file_name in all_vocab_files_names.items():
                if os.path.isdir(pretrained_model_name_or_path):
                    full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
                else:
                    full_file_name = pretrained_model_name_or_path
                if not os.path.exists(full_file_name):
                    logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
                    full_file_name = None
                vocab_files[file_id] = full_file_name

        # Get files from url, cache, or disk depending on the case
        try:
            resolved_vocab_files = {}
            for file_id, file_path in vocab_files.items():
                if file_path is None:
                    resolved_vocab_files[file_id] = None
                else:
                    resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir)
        except EnvironmentError:
            if pretrained_model_name_or_path in s3_models:
                logger.error("Couldn't reach server to download vocabulary.")
            else:
                logger.error(
                    "Model name '{}' was not found in model name list ({}). "
                    "We assumed '{}' was a path or url but couldn't find files {} "
                    "at this path or url.".format(
                        pretrained_model_name_or_path, ', '.join(s3_models),
                        pretrained_model_name_or_path, str(vocab_files.keys())))
            return None

        for file_id, file_path in vocab_files.items():
            if file_path == resolved_vocab_files[file_id]:
                logger.info("loading file {}".format(file_path))
            else:
                logger.info("loading file {} from cache at {}".format(
                    file_path, resolved_vocab_files[file_id]))

        # Set max length if needed
        if pretrained_model_name_or_path in cls.max_model_input_sizes:
            # if we're using a pretrained model, ensure the tokenizer
            # wont index sequences longer than the number of positional embeddings
            max_len = cls.max_model_input_sizes[pretrained_model_name_or_path]
            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)

        # Merge resolved_vocab_files arguments in kwargs.
        added_tokens_file = resolved_vocab_files.pop('added_tokens_file', None)
        special_tokens_map_file = resolved_vocab_files.pop('special_tokens_map_file', None)
        for args_name, file_path in resolved_vocab_files.items():
            if args_name not in kwargs:
                kwargs[args_name] = file_path
        if special_tokens_map_file is not None:
            special_tokens_map = json.load(open(special_tokens_map_file, encoding="utf-8"))
            for key, value in special_tokens_map.items():
                if key not in kwargs:
                    kwargs[key] = value

        # Instantiate tokenizer.
        tokenizer = cls(*inputs, **kwargs)

        # Add supplementary tokens.
        if added_tokens_file is not None:
            added_tok_encoder = json.load(open(added_tokens_file, encoding="utf-8"))
            added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
            tokenizer.added_tokens_encoder.update(added_tok_encoder)
            tokenizer.added_tokens_decoder.update(added_tok_decoder)

        return tokenizer

    def save_pretrained(self, save_directory):
        """ Save the tokenizer vocabulary files (with added tokens) and the
            special-tokens-to-class-attributes-mapping to a directory, so that it
            can be re-loaded using the `from_pretrained(save_directory)` class method.
        """
        if not os.path.isdir(save_directory):
            logger.error("Saving directory ({}) should be a directory".format(save_directory))
            return

        special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
        added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)

        with open(special_tokens_map_file, 'w', encoding='utf-8') as f:
            f.write(json.dumps(self.special_tokens_map, ensure_ascii=False))

        with open(added_tokens_file, 'w', encoding='utf-8') as f:
            if self.added_tokens_encoder:
                out_str = json.dumps(self.added_tokens_decoder, ensure_ascii=False)
            else:
                out_str = u"{}"
            f.write(out_str)

        vocab_files = self.save_vocabulary(save_directory)

        return vocab_files + (special_tokens_map_file, added_tokens_file)

    def save_vocabulary(self, save_directory):
        """ Save the tokenizer vocabulary to a directory. This method doesn't save added tokens
            and special token mappings.

            Please use `save_pretrained()` to save the full Tokenizer state so that it can be
            reloaded using the `from_pretrained(save_directory)` class method.
        """
        raise NotImplementedError

    def vocab_size(self):
        raise NotImplementedError

    def __len__(self):
        return self.vocab_size + len(self.added_tokens_encoder)

    def add_tokens(self, new_tokens):
        """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the
            vocabulary, they are added to the added_tokens_encoder with indices starting from
            the last index of the current vocabulary.

            Returns:
                Number of tokens added to the vocabulary which can be used to correspondingly
                increase the size of the associated model embedding matrices.
        """
        if not new_tokens:
            return 0

        to_add_tokens = []
        for token in new_tokens:
            if self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token):
                to_add_tokens.append(token)
                logger.info("Adding %s to the vocabulary", token)

        added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(to_add_tokens))
        added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
        self.added_tokens_encoder.update(added_tok_encoder)
        self.added_tokens_decoder.update(added_tok_decoder)

        return len(to_add_tokens)

    def add_special_tokens(self, special_tokens_dict):
        """ Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
            to class attributes. If the special tokens are not in the vocabulary, they are added
            to it and indexed starting from the last index of the current vocabulary.

            Returns:
                Number of tokens added to the vocabulary which can be used to correspondingly
                increase the size of the associated model embedding matrices.
        """
        if not special_tokens_dict:
            return 0

        added_special_tokens = self.add_tokens(special_tokens_dict.values())
        for key, value in special_tokens_dict.items():
            logger.info("Assigning %s to the %s key of the tokenizer", value, key)
            setattr(self, key, value)

        return added_special_tokens

    def tokenize(self, text, **kwargs):
        """ Converts a string into a sequence of tokens (string), using the tokenizer.
            Split in words for word-based vocabularies or sub-words for sub-word-based
            vocabularies (BPE/SentencePiece/WordPiece).

            Takes care of added tokens.
        """
        def split_on_tokens(tok_list, text):
            if not text:
                return []
            if not tok_list:
                return self._tokenize(text, **kwargs)
            tok = tok_list[0]
            split_text = text.split(tok)
            return sum((split_on_tokens(tok_list[1:], sub_text.strip()) + [tok] \
                        for sub_text in split_text), [])[:-1]

        added_tokens = list(self.added_tokens_encoder.keys())
        tokenized_text = split_on_tokens(added_tokens, text)
        return tokenized_text

    def _tokenize(self, text, **kwargs):
        """ Converts a string into a sequence of tokens (string), using the tokenizer.
            Split in words for word-based vocabularies or sub-words for sub-word-based
            vocabularies (BPE/SentencePiece/WordPiece).

            Doesn't take care of added tokens.
        """
        raise NotImplementedError

    def convert_tokens_to_ids(self, tokens):
        """ Converts a single token or a sequence of tokens (str/unicode) into an integer id
            (resp. a sequence of ids), using the vocabulary.
        """
        if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):
            return self.convert_token_to_id_with_added_voc(tokens)

        ids = []
        for token in tokens:
            ids.append(self.convert_token_to_id_with_added_voc(token))
        if len(ids) > self.max_len:
            logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
                           "for this model ({} > {}). Running this sequence through the model will result in "
                           "indexing errors".format(len(ids), self.max_len))
        return ids

    def convert_token_to_id_with_added_voc(self, token):
        if token in self.added_tokens_encoder:
            return self.added_tokens_encoder[token]
        return self._convert_token_to_id(token)

    def _convert_token_to_id(self, token):
        raise NotImplementedError

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """ Converts a single index or a sequence of indices (integers) into a token
            (resp. a sequence of tokens (str/unicode)), using the vocabulary and added tokens.

            Args:
                skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
        """
        if isinstance(ids, int):
            return self._convert_id_to_token(ids)
        tokens = []
        for index in ids:
            if index in self.all_special_ids and skip_special_tokens:
                continue
            if index in self.added_tokens_decoder:
                tokens.append(self.added_tokens_decoder[index])
            else:
                tokens.append(self._convert_id_to_token(index))
        return tokens

    def _convert_id_to_token(self, index):
        raise NotImplementedError

    def encode(self, text):
        """ Converts a string into a sequence of ids (integer), using the tokenizer and vocabulary.
            Same as self.convert_tokens_to_ids(self.tokenize(text)).
        """
        return self.convert_tokens_to_ids(self.tokenize(text))

    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
        """ Converts a sequence of ids (integer) into a string, using the tokenizer and vocabulary
            with options to remove special tokens and clean up tokenization spaces.
        """
        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
        text = self._convert_ids_to_string(filtered_tokens)
        if clean_up_tokenization_spaces:
            text = clean_up_tokenization(text)
        return text

    def _convert_ids_to_string(self, tokens_ids):
        """ Converts a sequence of ids (integer) into a string, using the tokenizer and vocabulary.
            Roughly the same as ' '.join(self.convert_ids_to_tokens(token_ids)).
        """
        return ' '.join(self.convert_ids_to_tokens(tokens_ids))

    @property
    def special_tokens_map(self):
        """ A dictionary mapping special token class attributes (cls_token, unk_token...) to their
            values ('<unk>', '<cls>'...)
        """
        set_attr = {}
        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
            attr_value = getattr(self, "_" + attr)
            if attr_value:
                set_attr[attr] = attr_value
        return set_attr

    @property
    def all_special_tokens(self):
        """ List all the special tokens ('<unk>', '<cls>'...) mapped to class attributes
            (cls_token, unk_token...).
        """
        all_toks = []
        set_attr = self.special_tokens_map
        for attr_value in set_attr.values():
            all_toks = all_toks + (attr_value if isinstance(attr_value, (list, tuple)) else [attr_value])
        all_toks = list(set(all_toks))
        return all_toks

    @property
    def all_special_ids(self):
        """ List the vocabulary indices of the special tokens ('<unk>', '<cls>'...) mapped to
            class attributes (cls_token, unk_token...).
        """
        all_toks = self.all_special_tokens
        all_ids = list(self.convert_tokens_to_ids(t) for t in all_toks)
        return all_ids


def clean_up_tokenization(out_string):
    out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
                     ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
                     ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
    return out_string
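For context, here is a minimal sketch of how a concrete subclass is expected to plug into this new base class. The ToyTokenizer name, its tiny fixed vocabulary, and the [NEW] token are hypothetical illustrations; only PreTrainedTokenizer and its _tokenize / _convert_token_to_id / _convert_id_to_token / vocab_size hooks come from the file above.

from pytorch_transformers.tokenization_utils import PreTrainedTokenizer

class ToyTokenizer(PreTrainedTokenizer):
    """Hypothetical whitespace tokenizer showing the hooks a subclass overrides."""
    vocab_files_names = {'vocab_file': 'vocab.json'}  # illustrative only

    def __init__(self, vocab_file=None, **kwargs):
        super(ToyTokenizer, self).__init__(unk_token="<unk>", **kwargs)
        # A tiny in-memory vocabulary instead of loading vocab_file, to keep the sketch self-contained.
        self.encoder = {"<unk>": 0, "hello": 1, "world": 2}
        self.decoder = {v: k for k, v in self.encoder.items()}

    @property
    def vocab_size(self):
        return len(self.encoder)

    def _tokenize(self, text, **kwargs):
        return text.lower().split()

    def _convert_token_to_id(self, token):
        return self.encoder.get(token, self.encoder[self.unk_token])

    def _convert_id_to_token(self, index):
        return self.decoder.get(index, self.unk_token)

tok = ToyTokenizer()
tok.add_tokens(["[NEW]"])               # appended after the base vocabulary, index 3 here
print(tok.encode("hello [NEW] world"))  # [1, 3, 2]
print(tok.decode([1, 3, 2]))            # "hello [NEW] world"

The added token is handled entirely by the base class: tokenize() splits the input around "[NEW]" before calling _tokenize(), and the id mapping lives in added_tokens_encoder rather than the subclass vocabulary.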
pytorch_pretrained_bert/tokenization_xlm.py → pytorch_transformers/tokenization_xlm.py

@@ -20,36 +20,31 @@ import json
 import logging
 import os
 import re
-import sys
 from io import open
 
-from tqdm import tqdm
-
-from .file_utils import cached_path
-from .model_utils import clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer
 from .tokenization_bert import BasicTokenizer
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_VOCAB_ARCHIVE_MAP = {
-    'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json",
-}
-PRETRAINED_MERGES_ARCHIVE_MAP = {
-    'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt",
-}
-PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
-    'xlm-mlm-en-2048': 512,
-}
-VOCAB_NAME = 'vocab.json'
-MERGES_NAME = 'merges.txt'
-SPECIAL_TOKENS_NAME = 'special_tokens.txt'
-
-INDEX = {
-    "bos_index": 0,
-    "eos_index": 1,
-    "pad_index": 2,
-    "unk_index": 3,
-    "mask_index": 5
+VOCAB_FILES_NAMES = {
+    'vocab_file': 'vocab.json',
+    'merges_file': 'merges.txt',
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json",
+    },
+    'merges_file':
+    {
+        'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt",
+    },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'xlm-mlm-en-2048': 512,
 }
 
 def get_pairs(word):

@@ -79,7 +74,7 @@ def text_standardize(text):
     text = re.sub(r'[^\S\n]+', ' ', text)
     return text.strip()
 
-class XLMTokenizer(object):
+class XLMTokenizer(PreTrainedTokenizer):
     """
     BPE tokenizer for XLM, adapted from OpenAI BPE tokenizer. Peculiarities:
         - lower case all inputs

@@ -87,65 +82,20 @@ class XLMTokenizer(object):
         - argument special_tokens and function set_special_tokens:
             can be used to add additional symbols (ex: "__classify__") to a vocabulary.
     """
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
-        """
-        Instantiate a PreTrainedBertModel from a pre-trained model file.
-        Download and cache the pre-trained model file if needed.
-        """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
-            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
-            special_tokens_file = None
-        else:
-            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
-            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
-            special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
-            if not os.path.exists(special_tokens_file):
-                special_tokens_file = None
-            else:
-                logger.info("loading special tokens file {}".format(special_tokens_file))
-        # redirect to the cache, if necessary
-        try:
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-                logger.error("Couldn't reach server at '{}' to download vocabulary.".format(vocab_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find files {} and {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path, ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                        pretrained_model_name_or_path, vocab_file, merges_file))
-            return None
-        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
-            logger.info("loading vocabulary file {}".format(vocab_file))
-            logger.info("loading merges file {}".format(merges_file))
-        else:
-            logger.info("loading vocabulary file {} from cache at {}".format(
-                vocab_file, resolved_vocab_file))
-            logger.info("loading merges file {} from cache at {}".format(
-                merges_file, resolved_merges_file))
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
-            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
-            # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
-            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
-        # Instantiate tokenizer.
-        if special_tokens_file and 'special_tokens' not in kwargs:
-            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-        else:
-            special_tokens = kwargs.pop('special_tokens', [])
-        tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs)
-        return tokenizer
-
-    def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file, merges_file, unk_token="<unk>", bos_token="<s>",
+                 sep_token="</s>", pad_token="<pad>", cls_token="</s>",
+                 mask_token="<special1>", additional_special_tokens=["<special0>",
+                 "<special1>", "<special2>", "<special3>", "<special4>", "<special5>",
+                 "<special6>", "<special7>", "<special8>", "<special9>"],
+                 **kwargs):
+        super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token,
+                                           sep_token=sep_token, pad_token=pad_token,
+                                           cls_token=cls_token, mask_token=mask_token,
+                                           additional_special_tokens=additional_special_tokens,
+                                           **kwargs)
         try:
             import ftfy
             import spacy

@@ -153,39 +103,19 @@ class XLMTokenizer(object):
             self.fix_text = ftfy.fix_text
         except ImportError:
             logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
-            self.nlp = BasicTokenizer(do_lower_case=True,
-                                      never_split=special_tokens if special_tokens is not None else [])
+            self.nlp = BasicTokenizer(do_lower_case=True)
             self.fix_text = None
 
-        self.max_len = max_len if max_len is not None else int(1e12)
         self.encoder = json.load(open(vocab_file, encoding="utf-8"))
         self.decoder = {v: k for k, v in self.encoder.items()}
        merges = open(merges_file, encoding='utf-8').read().split('\n')[:-1]
        merges = [tuple(merge.split()[:2]) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}
-        self.special_tokens = {}
-        self.special_tokens_decoder = {}
-        self.set_special_tokens(special_tokens)
 
-    def __len__(self):
-        return len(self.encoder) + len(self.special_tokens)
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
 
-    def set_special_tokens(self, special_tokens):
-        """ Add a list of additional tokens to the encoder.
-            The additional tokens are indexed starting from the last index of the
-            current vocabulary in the order of the `special_tokens` list.
-        """
-        if not special_tokens:
-            self.special_tokens = {}
-            self.special_tokens_decoder = {}
-            return
-        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
-        self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
-        if self.fix_text is None:
-            # Using BERT's BasicTokenizer: we can update the tokenizer
-            self.nlp.never_split = special_tokens
-        logger.info("Special tokens {}".format(self.special_tokens))
-
     def bpe(self, token):
         word = tuple(token[:-1]) + (token[-1] + '</w>',)

@@ -230,7 +160,7 @@ class XLMTokenizer(object):
         self.cache[token] = word
         return word
 
-    def tokenize(self, text):
+    def _tokenize(self, text):
         """ Tokenize a string. """
         split_tokens = []
         if self.fix_text is None:

@@ -245,58 +175,26 @@ class XLMTokenizer(object):
                 split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
         return split_tokens
 
-    def convert_tokens_to_ids(self, tokens):
-        """ Converts a sequence of tokens into ids using the vocab. """
-        ids = []
-        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.encoder.get(tokens, 0)
-        for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.encoder.get(token, 0))
-        if len(ids) > self.max_len:
-            logger.warning("Token indices sequence length is longer than the specified maximum "
-                           " sequence length for this OpenAI GPT model ({} > {}). Running this"
-                           " sequence through the model will result in indexing errors".format(len(ids), self.max_len))
-        return ids
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
 
-    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-        """Converts a sequence of ids in BPE tokens using the vocab."""
-        tokens = []
-        for i in ids:
-            if i in self.special_tokens_decoder:
-                if not skip_special_tokens:
-                    tokens.append(self.special_tokens_decoder[i])
-            else:
-                tokens.append(self.decoder[i])
-        return tokens
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        return self.decoder.get(index, self.unk_token)
 
-    def encode(self, text):
-        return self.convert_tokens_to_ids(self.tokenize(text))
-
-    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+    def _convert_ids_to_string(self, tokens_ids):
         """Converts a sequence of ids in a string."""
-        tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
-        out_string = ''.join(tokens).replace('</w>', ' ').strip()
-        if clean_up_tokenization_spaces:
-            out_string = out_string.replace('<unk>', '')
-            out_string = clean_up_tokenization(out_string)
+        out_string = ''.join(tokens_ids).replace('</w>', ' ').strip()
         return out_string
 
-    def save_vocabulary(self, vocab_path):
+    def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
-        if not os.path.isdir(vocab_path):
-            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
-        merge_file = os.path.join(vocab_path, MERGES_NAME)
-        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
+        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
 
         with open(vocab_file, 'w', encoding='utf-8') as f:
             f.write(json.dumps(self.encoder, ensure_ascii=False))

@@ -311,14 +209,4 @@ class XLMTokenizer(object):
                 writer.write(' '.join(bpe_tokens) + u'\n')
                 index += 1
 
-        index = len(self.encoder)
-        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
-            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
-                    index = token_index
-                writer.write(token + u'\n')
-                index += 1
-
-        return vocab_file, merge_file, special_tokens_file
+        return vocab_file, merge_file
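As a usage note (not part of the diff): with the class attributes above, loading and saving now go through the shared PreTrainedTokenizer machinery. A minimal sketch, assuming either ftfy/spacy or the BERT BasicTokenizer fallback is available, and that the ./xlm_tok output directory (a hypothetical path) already exists:

from pytorch_transformers.tokenization_xlm import XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')  # resolves vocab.json and merges.txt via the maps above
ids = tokenizer.encode("Hello world!")        # base-class encode: tokenize() then convert_tokens_to_ids()
print(tokenizer.decode(ids))                  # joins BPE pieces and strips the '</w>' markers
tokenizer.save_pretrained('./xlm_tok')        # writes vocab, merges, special tokens map and added tokens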
pytorch_pretrained_bert/tokenization_xlnet.py → pytorch_transformers/tokenization_xlnet.py

@@ -16,26 +16,29 @@
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
 
-import json
 import logging
 import os
-import sys
 from shutil import copyfile
+from io import open
 
 import unicodedata
 import six
 
-from .file_utils import cached_path
-from .model_utils import clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_VOCAB_ARCHIVE_MAP = {
-    'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model",
+VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model",
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'xlnet-large-cased': 512,
 }
-VOCAB_NAME = 'spiece.model'
-SPECIAL_TOKENS_NAME = 'special_tokens.txt'
 
 SPIECE_UNDERLINE = u'▁'

@@ -46,89 +49,31 @@ SEG_ID_CLS = 2
 SEG_ID_SEP = 3
 SEG_ID_PAD = 4
 
-class XLNetTokenizer(object):
+class XLNetTokenizer(PreTrainedTokenizer):
     """
         SentencePiece based tokenizer. Peculiarities:
             - requires SentencePiece: https://github.com/google/sentencepiece
     """
-    # Tokens
-    special_symbols = {
-        "<unk>": 0,
-        "<s>": 1,
-        "</s>": 2,
-        "<cls>": 3,
-        "<sep>": 4,
-        "<pad>": 5,
-        "<mask>": 6,
-        "<eod>": 7,
-        "<eop>": 8,
-    }
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
-        """
-        Instantiate a PreTrainedBertModel from a pre-trained model file.
-        Download and cache the pre-trained model file if needed.
-        """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
-            special_tokens_file = None
-            if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True):
-                logger.warning("The pre-trained model you are loading is a cased model but you have not set "
-                               "`do_lower_case` to False. We are setting `do_lower_case=False` for you but "
-                               "you may want to check this behavior.")
-                kwargs['do_lower_case'] = False
-            elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True):
-                logger.warning("The pre-trained model you are loading is an uncased model but you have set "
-                               "`do_lower_case` to False. We are setting `do_lower_case=True` for you "
-                               "but you may want to check this behavior.")
-                kwargs['do_lower_case'] = True
-        else:
-            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
-            special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
-            if not os.path.exists(special_tokens_file):
-                special_tokens_file = None
-            else:
-                logger.info("loading special tokens file {}".format(special_tokens_file))
-        # redirect to the cache, if necessary
-        try:
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-                logger.error("Couldn't reach server at '{}' to download vocabulary.".format(vocab_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find files {}"
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path, ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                        pretrained_model_name_or_path, vocab_file))
-            return None
-        if resolved_vocab_file == vocab_file:
-            logger.info("loading vocabulary file {}".format(vocab_file))
-        else:
-            logger.info("loading vocabulary file {} from cache at {}".format(
-                vocab_file, resolved_vocab_file))
-        # Instantiate tokenizer.
-        if special_tokens_file and 'special_tokens' not in kwargs:
-            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-        else:
-            special_tokens = kwargs.pop('special_tokens', [])
-        tokenizer = cls(resolved_vocab_file, special_tokens=special_tokens, *inputs, **kwargs)
-        return tokenizer
-
-    def __init__(self, vocab_file, special_tokens=None, max_len=None,
-                 do_lower_case=False, remove_space=True, keep_accents=False):
+    def __init__(self, vocab_file, max_len=None,
+                 do_lower_case=False, remove_space=True, keep_accents=False,
+                 bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
+                 pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
+                 additional_special_tokens=["<eop>", "<eod>"], **kwargs):
+        super(XLNetTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
+                                             unk_token=unk_token, sep_token=sep_token,
+                                             pad_token=pad_token, cls_token=cls_token,
+                                             mask_token=mask_token,
+                                             additional_special_tokens=additional_special_tokens,
+                                             **kwargs)
         try:
             import sentencepiece as spm
         except ImportError:
             logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
                            "pip install sentencepiece")
 
-        self.max_len = max_len if max_len is not None else int(1e12)
         self.do_lower_case = do_lower_case
         self.remove_space = remove_space
         self.keep_accents = keep_accents

@@ -136,52 +81,10 @@ class XLNetTokenizer(object):
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(vocab_file)
 
-        self.special_tokens = {}
-        self.special_tokens_decoder = {}
-        self.set_special_tokens(special_tokens)
-
-    @property
-    def UNK_TOKEN(self):
-        return "<unk>"
-
-    @property
-    def SEP_TOKEN(self):
-        return "<sep>"
-
-    @property
-    def PAD_TOKEN(self):
-        return "<pad>"
-
-    @property
-    def CLS_TOKEN(self):
-        return "<cls>"
-
-    @property
-    def MASK_TOKEN(self):
-        return "<mask>"
-
     @property
-    def UNK_ID(self):
-        return self.special_symbols["<unk>"]
-
-    @property
-    def SEP_ID(self):
-        return self.special_symbols["<sep>"]
-
-    @property
-    def PAD_ID(self):
-        return self.special_symbols["<pad>"]
-
-    @property
-    def CLS_ID(self):
-        return self.special_symbols["<cls>"]
-
-    @property
-    def MASK_ID(self):
-        return self.special_symbols["<mask>"]
-
-    def __len__(self):
-        return len(self.encoder) + len(self.special_tokens)
+    def vocab_size(self):
+        return len(self.sp_model)
 
     def __getstate__(self):
         state = self.__dict__.copy()

@@ -198,19 +101,6 @@ class XLNetTokenizer(object):
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(self.vocab_file)
 
-    def set_special_tokens(self, special_tokens):
-        """ Add a list of additional tokens to the encoder.
-            The additional tokens are indexed starting from the last index of the
-            current vocabulary in the order of the `special_tokens` list.
-        """
-        if not special_tokens:
-            self.special_tokens = {}
-            self.special_tokens_decoder = {}
-            return
-        self.special_tokens = dict((tok, len(self.sp_model) + i) for i, tok in enumerate(special_tokens))
-        self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
-        logger.info("Special tokens: %s", str(self.special_tokens))
-
     def preprocess_text(self, inputs):
         if self.remove_space:
             outputs = ' '.join(inputs.strip().split())

@@ -229,7 +119,7 @@ class XLNetTokenizer(object):
 
         return outputs
 
-    def tokenize(self, text, return_unicode=True, sample=False):
+    def _tokenize(self, text, return_unicode=True, sample=False):
         """ Tokenize a string.
            return_unicode is used only for py2
        """

@@ -268,78 +158,31 @@ class XLNetTokenizer(object):
 
         return new_pieces
 
-    def convert_tokens_to_ids(self, tokens, sample=False):
-        """ Converts a sequence of tokens into ids using the vocab. """
-        ids = []
-        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.sp_model.PieceToId(tokens)
-        for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.sp_model.PieceToId(token))
-        if len(ids) > self.max_len:
-            logger.warning("Token indices sequence length is longer than the specified maximum "
-                           " sequence length for this XLNet model ({} > {}). Running this"
-                           " sequence through the model will result in indexing errors".format(len(ids), self.max_len))
-        return ids
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.sp_model.PieceToId(token)
 
-    def convert_ids_to_tokens(self, ids, return_unicode=True, skip_special_tokens=False):
-        """Converts a sequence of ids in tokens."""
-        tokens = []
-        for i in ids:
-            if i in self.special_tokens_decoder:
-                if not skip_special_tokens:
-                    tokens.append(self.special_tokens_decoder[i])
-            else:
-                tokens.append(self.sp_model.IdToPiece(i))
-        if six.PY2 and return_unicode:
-            ret_pieces = []
-            for piece in tokens:
-                if isinstance(piece, str):
-                    piece = piece.decode('utf-8')
-                ret_pieces.append(piece)
-            tokens = ret_pieces
-        return tokens
+    def _convert_id_to_token(self, index, return_unicode=True):
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        token = self.sp_model.IdToPiece(index)
+        if six.PY2 and return_unicode and isinstance(token, str):
+            token = token.decode('utf-8')
+        return token
 
-    def encode(self, text, sample=False):
-        return self.convert_tokens_to_ids(self.tokenize(text, sample=sample))
-
-    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+    def _convert_ids_to_string(self, tokens_ids):
         """Converts a sequence of ids in a string."""
-        tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
-        out_string = ''.join(tokens)
-        if clean_up_tokenization_spaces:
-            out_string = out_string.strip().replace('<unk>', '')
-            out_string = clean_up_tokenization(out_string)
+        out_string = ''.join(tokens_ids)
         return out_string
 
-    def save_vocabulary(self, vocab_path):
+    def save_vocabulary(self, save_directory):
         """ Save the sentencepiece vocabulary (copy original file) and special tokens file
             to a directory.
        """
-        if not os.path.isdir(vocab_path):
-            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        out_vocab_file = os.path.join(vocab_path, VOCAB_NAME)
-        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
+        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
 
         copyfile(self.vocab_file, out_vocab_file)
 
-        index = len(self.sp_model)
-        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
-            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
-                    index = token_index
-                writer.write(token + u'\n')
-                index += 1
-
-        return out_vocab_file, special_tokens_file
+        return (out_vocab_file,)
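Similarly for XLNet, a short sketch (not part of the commit) of the refactored class in use; it assumes the sentencepiece package is installed and that an ./xlnet_tok directory (a hypothetical path) already exists:

from pytorch_transformers.tokenization_xlnet import XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')  # downloads and caches spiece.model
pieces = tokenizer.tokenize(u"Hello world!")      # SentencePiece pieces; added tokens are split out by the base class
ids = tokenizer.convert_tokens_to_ids(pieces)
print(tokenizer.decode(ids))                      # ''.join of the pieces, passed through clean_up_tokenization
tokenizer.save_pretrained('./xlnet_tok')          # copies spiece.model and writes the special tokens map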
setup.py

@@ -37,16 +37,16 @@ from io import open
 from setuptools import find_packages, setup
 
 setup(
-    name="pytorch_pretrained_bert",
-    version="0.6.2",
-    author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
+    name="pytorch_transformers",
+    version="0.7.0",
+    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
     author_email="thomas@huggingface.co",
-    description="PyTorch version of Google AI BERT model with script to load Google pre-trained models",
+    description="Repository of pre-trained NLP Transformer models: BERT, GPT & GPT-2, Transformer-XL, XLNet and XLM",
     long_description=open("README.md", "r", encoding='utf-8').read(),
     long_description_content_type="text/markdown",
-    keywords='BERT NLP deep learning google',
+    keywords='NLP deep learning transformer pytorch BERT GPT GPT-2 google openai CMU',
     license='Apache',
-    url="https://github.com/huggingface/pytorch-pretrained-BERT",
+    url="https://github.com/huggingface/pytorch-transformers",
     packages=find_packages(exclude=["*.tests", "*.tests.*",
                                     "tests.*", "tests"]),
     install_requires=['torch>=0.4.1',

@@ -58,7 +58,7 @@ setup(
                       'sentencepiece'],
     entry_points={
         'console_scripts': [
-            "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main",
+            "pytorch_transformers=pytorch_transformers.__main__:main",
         ]
     },
     # python_requires='>=3.5.0',
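A quick way to confirm that the renamed distribution is the one installed locally (a sketch; it assumes the package was installed from this branch, for example via an editable pip install of the repository root, and uses setuptools' pkg_resources):

import pkg_resources
print(pkg_resources.get_distribution("pytorch_transformers").version)  # expected to print 0.7.0 per the diff above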