Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
0517e7a1
Commit
0517e7a1
authored
Aug 30, 2019
by
thomwolf
Browse files
Fix GPT2 and RoBERTa tokenizers to begin with a space - update RoBERTa tokenizer
parent
55f69a11
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
9 additions
and
116 deletions
+9
-116
pytorch_transformers/modeling_gpt2.py
pytorch_transformers/modeling_gpt2.py
+1
-1
pytorch_transformers/tokenization_gpt2.py
pytorch_transformers/tokenization_gpt2.py
+6
-5
pytorch_transformers/tokenization_roberta.py
pytorch_transformers/tokenization_roberta.py
+2
-110
No files found.
pytorch_transformers/modeling_gpt2.py
View file @
0517e7a1
...
...
@@ -682,7 +682,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!)
model.resize_token_embeddings(tokenizer
.vocab_size + 1
)
model.resize_token_embeddings(
len(
tokenizer
)) # Update the model embeddings to the new vocabulary size (add a vector at the end
)
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
mc_token_ids = torch.tensor([input_ids.size(-1), input_ids.size(-1)]).unsqueeze(0) # Batch size 1
...
...
pytorch_transformers/tokenization_gpt2.py
View file @
0517e7a1
...
...
@@ -109,11 +109,11 @@ class GPT2Tokenizer(PreTrainedTokenizer):
bos_token
=
"<|endoftext|>"
,
eos_token
=
"<|endoftext|>"
,
**
kwargs
):
super
(
GPT2Tokenizer
,
self
).
__init__
(
bos_token
=
bos_token
,
eos_token
=
eos_token
,
unk_token
=
unk_token
,
**
kwargs
)
self
.
encoder
=
json
.
load
(
open
(
vocab_file
))
self
.
decoder
=
{
v
:
k
for
k
,
v
in
self
.
encoder
.
items
()}
self
.
errors
=
errors
# how to handle errors in decoding
self
.
encoder
=
json
.
load
(
open
(
vocab_file
,
encoding
=
"utf-8"
))
self
.
decoder
=
{
v
:
k
for
k
,
v
in
self
.
encoder
.
items
()}
self
.
errors
=
errors
# how to handle errors in decoding
self
.
byte_encoder
=
bytes_to_unicode
()
self
.
byte_decoder
=
{
v
:
k
for
k
,
v
in
self
.
byte_encoder
.
items
()}
self
.
byte_decoder
=
{
v
:
k
for
k
,
v
in
self
.
byte_encoder
.
items
()}
bpe_data
=
open
(
merges_file
,
encoding
=
'utf-8'
).
read
().
split
(
'
\n
'
)[
1
:
-
1
]
bpe_merges
=
[
tuple
(
merge
.
split
())
for
merge
in
bpe_data
]
self
.
bpe_ranks
=
dict
(
zip
(
bpe_merges
,
range
(
len
(
bpe_merges
))))
...
...
@@ -169,6 +169,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
def
_tokenize
(
self
,
text
):
""" Tokenize a string. """
text
=
' '
+
text
# GPT-2 (and RoBERTa) tokenizers need at least one space to begin the sentence with.
bpe_tokens
=
[]
for
token
in
re
.
findall
(
self
.
pat
,
text
):
if
sys
.
version_info
[
0
]
==
2
:
...
...
@@ -214,4 +215,4 @@ class GPT2Tokenizer(PreTrainedTokenizer):
writer
.
write
(
' '
.
join
(
bpe_tokens
)
+
u
'
\n
'
)
index
+=
1
return
vocab_file
,
merge_file
return
vocab_file
,
merge_file
\ No newline at end of file
pytorch_transformers/tokenization_roberta.py
View file @
0517e7a1
...
...
@@ -23,8 +23,7 @@ import os
import
regex
as
re
from
io
import
open
from
.tokenization_gpt2
import
bytes_to_unicode
,
get_pairs
from
.tokenization_utils
import
PreTrainedTokenizer
from
.tokenization_gpt2
import
GPT2Tokenizer
try
:
from
functools
import
lru_cache
...
...
@@ -63,7 +62,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
}
class
RobertaTokenizer
(
PreTrained
Tokenizer
):
class
RobertaTokenizer
(
GPT2
Tokenizer
):
"""
RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: Byte-level BPE
"""
...
...
@@ -77,89 +76,6 @@ class RobertaTokenizer(PreTrainedTokenizer):
sep_token
=
sep_token
,
cls_token
=
cls_token
,
pad_token
=
pad_token
,
mask_token
=
mask_token
,
**
kwargs
)
self
.
encoder
=
json
.
load
(
open
(
vocab_file
,
encoding
=
"utf-8"
))
self
.
decoder
=
{
v
:
k
for
k
,
v
in
self
.
encoder
.
items
()}
self
.
errors
=
errors
# how to handle errors in decoding
self
.
byte_encoder
=
bytes_to_unicode
()
self
.
byte_decoder
=
{
v
:
k
for
k
,
v
in
self
.
byte_encoder
.
items
()}
bpe_data
=
open
(
merges_file
,
encoding
=
'utf-8'
).
read
().
split
(
'
\n
'
)[
1
:
-
1
]
bpe_merges
=
[
tuple
(
merge
.
split
())
for
merge
in
bpe_data
]
self
.
bpe_ranks
=
dict
(
zip
(
bpe_merges
,
range
(
len
(
bpe_merges
))))
self
.
cache
=
{}
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self
.
pat
=
re
.
compile
(
r
"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)
@
property
def
vocab_size
(
self
):
return
len
(
self
.
encoder
)
def
bpe
(
self
,
token
):
if
token
in
self
.
cache
:
return
self
.
cache
[
token
]
word
=
tuple
(
token
)
pairs
=
get_pairs
(
word
)
if
not
pairs
:
return
token
while
True
:
bigram
=
min
(
pairs
,
key
=
lambda
pair
:
self
.
bpe_ranks
.
get
(
pair
,
float
(
'inf'
)))
if
bigram
not
in
self
.
bpe_ranks
:
break
first
,
second
=
bigram
new_word
=
[]
i
=
0
while
i
<
len
(
word
):
try
:
j
=
word
.
index
(
first
,
i
)
new_word
.
extend
(
word
[
i
:
j
])
i
=
j
except
:
new_word
.
extend
(
word
[
i
:])
break
if
word
[
i
]
==
first
and
i
<
len
(
word
)
-
1
and
word
[
i
+
1
]
==
second
:
new_word
.
append
(
first
+
second
)
i
+=
2
else
:
new_word
.
append
(
word
[
i
])
i
+=
1
new_word
=
tuple
(
new_word
)
word
=
new_word
if
len
(
word
)
==
1
:
break
else
:
pairs
=
get_pairs
(
word
)
word
=
' '
.
join
(
word
)
self
.
cache
[
token
]
=
word
return
word
def
_tokenize
(
self
,
text
):
""" Tokenize a string. """
bpe_tokens
=
[]
for
token
in
re
.
findall
(
self
.
pat
,
text
):
if
sys
.
version_info
[
0
]
==
2
:
token
=
''
.
join
(
self
.
byte_encoder
[
ord
(
b
)]
for
b
in
token
)
else
:
token
=
''
.
join
(
self
.
byte_encoder
[
b
]
for
b
in
token
.
encode
(
'utf-8'
))
bpe_tokens
.
extend
(
bpe_token
for
bpe_token
in
self
.
bpe
(
token
).
split
(
' '
))
return
bpe_tokens
def
_convert_token_to_id
(
self
,
token
):
""" Converts a token (str/unicode) in an id using the vocab. """
return
self
.
encoder
.
get
(
token
,
self
.
encoder
.
get
(
self
.
unk_token
))
def
_convert_id_to_token
(
self
,
index
):
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
return
self
.
decoder
.
get
(
index
)
def
convert_tokens_to_string
(
self
,
tokens
):
""" Converts a sequence of tokens (string) in a single string. """
text
=
''
.
join
(
tokens
)
text
=
bytearray
([
self
.
byte_decoder
[
c
]
for
c
in
text
]).
decode
(
'utf-8'
,
errors
=
self
.
errors
)
return
text
def
add_special_tokens_single_sentence
(
self
,
token_ids
):
"""
Adds special tokens to a sequence for sequence classification tasks.
...
...
@@ -175,27 +91,3 @@ class RobertaTokenizer(PreTrainedTokenizer):
sep
=
[
self
.
_convert_token_to_id
(
self
.
sep_token
)]
cls
=
[
self
.
_convert_token_to_id
(
self
.
cls_token
)]
return
cls
+
token_ids_0
+
sep
+
sep
+
token_ids_1
+
sep
def
save_vocabulary
(
self
,
save_directory
):
"""Save the tokenizer vocabulary and merge files to a directory."""
if
not
os
.
path
.
isdir
(
save_directory
):
logger
.
error
(
"Vocabulary path ({}) should be a directory"
.
format
(
save_directory
))
return
vocab_file
=
os
.
path
.
join
(
save_directory
,
VOCAB_FILES_NAMES
[
'vocab_file'
])
merge_file
=
os
.
path
.
join
(
save_directory
,
VOCAB_FILES_NAMES
[
'merges_file'
])
with
open
(
vocab_file
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
f
.
write
(
json
.
dumps
(
self
.
encoder
,
ensure_ascii
=
False
))
index
=
0
with
open
(
merge_file
,
"w"
,
encoding
=
"utf-8"
)
as
writer
:
writer
.
write
(
u
'#version: 0.2
\n
'
)
for
bpe_tokens
,
token_index
in
sorted
(
self
.
bpe_ranks
.
items
(),
key
=
lambda
kv
:
kv
[
1
]):
if
index
!=
token_index
:
logger
.
warning
(
"Saving vocabulary to {}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
.
format
(
merge_file
))
index
=
token_index
writer
.
write
(
' '
.
join
(
bpe_tokens
)
+
u
'
\n
'
)
index
+=
1
return
vocab_file
,
merge_file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment