chenpangpang / transformers · Commit 0517e7a1

Fix GPT2 and RoBERTa tokenizer to begin with a space - update Roberta tokenizer

Authored Aug 30, 2019 by thomwolf
Parent 55f69a11
Changes: 3 changed files with 9 additions and 116 deletions (+9 -116)

  pytorch_transformers/modeling_gpt2.py          +1    -1
  pytorch_transformers/tokenization_gpt2.py      +6    -5
  pytorch_transformers/tokenization_roberta.py   +2  -110
pytorch_transformers/modeling_gpt2.py

@@ -682,7 +682,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
         model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
         tokenizer.add_special_tokens({'cls_token': '[CLS]'})  # Add a [CLS] to the vocabulary (we should train it also!)
-        model.resize_token_embeddings(tokenizer.vocab_size + 1)
+        model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings to the new vocabulary size (add a vector at the end)
         choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
         input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
         mc_token_ids = torch.tensor([input_ids.size(-1), input_ids.size(-1)]).unsqueeze(0)  # Batch size 1
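The docstring example above now resizes the embedding matrix with len(tokenizer) instead of tokenizer.vocab_size + 1. A minimal sketch of the difference, assuming the pytorch_transformers API at this commit (len(tokenizer) counts the base vocabulary plus tokens registered via add_special_tokens, so the call stays correct however many tokens are added):

import torch
from pytorch_transformers import GPT2Tokenizer, GPT2DoubleHeadsModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')

num_added = tokenizer.add_special_tokens({'cls_token': '[CLS]'})  # number of tokens actually added
model.resize_token_embeddings(len(tokenizer))  # base vocab + added tokens, not a hard-coded "+ 1"

choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # batch size 1, 2 choices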
pytorch_transformers/tokenization_gpt2.py
@@ -109,11 +109,11 @@ class GPT2Tokenizer(PreTrainedTokenizer):
                  bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
         super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
-        self.encoder = json.load(open(vocab_file))
+        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
         self.decoder = {v: k for k, v in self.encoder.items()}
         self.errors = errors  # how to handle errors in decoding
         self.byte_encoder = bytes_to_unicode()
         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
         bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
         bpe_merges = [tuple(merge.split()) for merge in bpe_data]
         self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
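The only change in this hunk is the explicit encoding when reading the BPE vocabulary. A small sketch of the failure mode it avoids (hypothetical path, for illustration only): without the argument, open() falls back to the platform's locale encoding, which can differ from UTF-8 (e.g. cp1252 on Windows) and corrupt or reject the non-ASCII byte-level BPE entries in vocab.json.

import json

vocab_file = "vocab.json"  # hypothetical path, for illustration only

# Before: decoding depended on locale.getpreferredencoding()
# encoder = json.load(open(vocab_file))

# After: the vocabulary is always read as UTF-8
encoder = json.load(open(vocab_file, encoding="utf-8"))
decoder = {v: k for k, v in encoder.items()}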
@@ -169,6 +169,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
 
     def _tokenize(self, text):
         """ Tokenize a string. """
+        text = ' ' + text  # GPT-2 (and RoBERTa) tokenizers need at least one space to begin the sentence with.
         bpe_tokens = []
         for token in re.findall(self.pat, text):
             if sys.version_info[0] == 2:
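The added line is the fix named in the commit title: GPT-2's byte-level BPE folds a leading space into the token itself (the 'Ġ' prefix), so a sentence-initial word previously mapped to different BPE tokens than the same word appearing after a space. Prepending one space inside _tokenize makes both cases consistent. A hedged illustration, assuming a standard pretrained 'gpt2' vocabulary:

from pytorch_transformers import GPT2Tokenizer  # assumes the library at this commit

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# With this change the first word is tokenized as if it followed a space,
# e.g. something like ['ĠHello', 'Ġworld'] rather than ['Hello', 'Ġworld']
# (the exact pieces depend on the pretrained merges).
print(tokenizer.tokenize("Hello world"))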
@@ -214,4 +215,4 @@ class GPT2Tokenizer(PreTrainedTokenizer):
                 writer.write(' '.join(bpe_tokens) + u'\n')
                 index += 1
 
-        return vocab_file, merge_file
+        return vocab_file, merge_file
\ No newline at end of file
pytorch_transformers/tokenization_roberta.py
@@ -23,8 +23,7 @@ import os
 import regex as re
 from io import open
 
-from .tokenization_gpt2 import bytes_to_unicode, get_pairs
-from .tokenization_utils import PreTrainedTokenizer
+from .tokenization_gpt2 import GPT2Tokenizer
 
 try:
     from functools import lru_cache
@@ -63,7 +62,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 }
 
 
-class RobertaTokenizer(PreTrainedTokenizer):
+class RobertaTokenizer(GPT2Tokenizer):
     """
     RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: Byte-level BPE
     """
@@ -77,89 +76,6 @@ class RobertaTokenizer(PreTrainedTokenizer):
                                                sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
                                                mask_token=mask_token, **kwargs)
 
-        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        self.cache = {}
-
-        # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
-        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
-
-    @property
-    def vocab_size(self):
-        return len(self.encoder)
-
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                    new_word.extend(word[i:j])
-                    i = j
-                except:
-                    new_word.extend(word[i:])
-                    break
-
-                if word[i] == first and i < len(word)-1 and word[i+1] == second:
-                    new_word.append(first+second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = ' '.join(word)
-        self.cache[token] = word
-        return word
-
-    def _tokenize(self, text):
-        """ Tokenize a string. """
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            if sys.version_info[0] == 2:
-                token = ''.join(self.byte_encoder[ord(b)] for b in token)
-            else:
-                token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
-            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
-        return bpe_tokens
-
-    def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
-        return self.decoder.get(index)
-
-    def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
-        text = ''.join(tokens)
-        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
-        return text
-
     def add_special_tokens_single_sentence(self, token_ids):
         """
         Adds special tokens to a sequence for sequence classification tasks.
@@ -175,27 +91,3 @@ class RobertaTokenizer(PreTrainedTokenizer):
         sep = [self._convert_token_to_id(self.sep_token)]
         cls = [self._convert_token_to_id(self.cls_token)]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-
-    def save_vocabulary(self, save_directory):
-        """Save the tokenizer vocabulary and merge files to a directory."""
-        if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
-            return
-        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
-        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
-
-        with open(vocab_file, 'w', encoding='utf-8') as f:
-            f.write(json.dumps(self.encoder, ensure_ascii=False))
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write(u'#version: 0.2\n')
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(merge_file))
-                    index = token_index
-                writer.write(' '.join(bpe_tokens) + u'\n')
-                index += 1
-
-        return vocab_file, merge_file
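After this commit, RoBERTa tokenization runs through the inherited GPT-2 code path (including the new leading-space behaviour); the class itself only formats the special tokens, as in the hunk above. A hedged usage sketch, assuming a pretrained 'roberta-base' checkpoint is available:

from pytorch_transformers import RobertaTokenizer  # assumes this commit's code

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

ids = tokenizer.encode("Hello world")  # byte-level BPE inherited from GPT2Tokenizer
ids_with_specials = tokenizer.add_special_tokens_single_sentence(ids)  # <s> ... </s>
print(tokenizer.convert_ids_to_tokens(ids_with_specials))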