Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
43a237f1
Commit
43a237f1
authored
Oct 10, 2019
by
thomwolf
Browse files
switching to moses tokenizer
parent
036483fa
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
19 additions
and
31 deletions
+19
-31
transformers/tokenization_ctrl.py
transformers/tokenization_ctrl.py
+19
-31
No files found.
transformers/tokenization_ctrl.py
View file @
43a237f1
...
@@ -22,8 +22,9 @@ import os
...
@@ -22,8 +22,9 @@ import os
import
regex
as
re
import
regex
as
re
from
io
import
open
from
io
import
open
from
.tokenization_bert
import
BasicTokenizer
import
sacremoses
as
sm
from
.tokenization_xlm
import
replace_unicode_punct
,
remove_non_printing_char
from
.tokenization_utils
import
PreTrainedTokenizer
from
.tokenization_utils
import
PreTrainedTokenizer
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
...
@@ -48,39 +49,11 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
...
@@ -48,39 +49,11 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'ctrl'
:
256
,
'ctrl'
:
256
,
}
}
def
text_standardize
(
text
):
"""
fixes some issues the spacy tokenizer had on books corpus
also does some whitespace standardization
"""
text
=
text
.
replace
(
'—'
,
'-'
)
text
=
text
.
replace
(
'–'
,
'-'
)
text
=
text
.
replace
(
'―'
,
'-'
)
text
=
text
.
replace
(
'…'
,
'...'
)
text
=
text
.
replace
(
'´'
,
"'"
)
text
=
re
.
sub
(
r
'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)'''
,
r
' \1 '
,
text
)
text
=
re
.
sub
(
r
'\s*\n\s*'
,
'
\n
'
,
text
)
text
=
re
.
sub
(
r
'[^\S\n]+'
,
' '
,
text
)
return
text
.
strip
()
def
get_pairs
(
word
):
def
get_pairs
(
word
):
"""Return set of symbol pairs in a word.
"""Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
"""
# pairs = []
# prev_char = word[0]
# for i, char in enumerate(word[1:]):
# #_i = i + 1
# #if word[_i+1:] == tuple('</w>'):
# # pairs.append((prev_char, char+'</w>'))
# # break
# #else:
# if True:
# pairs.append((prev_char, char))
# prev_char = char
pairs
=
set
()
pairs
=
set
()
prev_char
=
word
[
0
]
prev_char
=
word
[
0
]
for
char
in
word
[
1
:]:
for
char
in
word
[
1
:]:
...
@@ -108,6 +81,9 @@ class CTRLTokenizer(PreTrainedTokenizer):
...
@@ -108,6 +81,9 @@ class CTRLTokenizer(PreTrainedTokenizer):
self
.
max_len_single_sentence
=
self
.
max_len
# no default special tokens - you can update this value if you add special tokens
self
.
max_len_single_sentence
=
self
.
max_len
# no default special tokens - you can update this value if you add special tokens
self
.
max_len_sentences_pair
=
self
.
max_len
# no default special tokens - you can update this value if you add special tokens
self
.
max_len_sentences_pair
=
self
.
max_len
# no default special tokens - you can update this value if you add special tokens
self
.
punct_normalizer
=
sm
.
MosesPunctNormalizer
(
lang
=
'en'
)
self
.
moses_tokenizer
=
sm
.
MosesTokenizer
(
lang
=
'en'
)
self
.
encoder
=
json
.
load
(
open
(
vocab_file
,
encoding
=
"utf-8"
))
self
.
encoder
=
json
.
load
(
open
(
vocab_file
,
encoding
=
"utf-8"
))
self
.
decoder
=
{
v
:
k
for
k
,
v
in
self
.
encoder
.
items
()}
self
.
decoder
=
{
v
:
k
for
k
,
v
in
self
.
encoder
.
items
()}
merges
=
open
(
merges_file
,
encoding
=
'utf-8'
).
read
().
split
(
'
\n
'
)[
1
:
-
1
]
merges
=
open
(
merges_file
,
encoding
=
'utf-8'
).
read
().
split
(
'
\n
'
)[
1
:
-
1
]
...
@@ -162,11 +138,23 @@ class CTRLTokenizer(PreTrainedTokenizer):
...
@@ -162,11 +138,23 @@ class CTRLTokenizer(PreTrainedTokenizer):
self
.
cache
[
token
]
=
word
self
.
cache
[
token
]
=
word
return
word
return
word
def
_tokenize
(
self
,
text
):
def
moses_pipeline
(
self
,
text
):
text
=
replace_unicode_punct
(
text
)
text
=
self
.
punct_normalizer
.
normalize
(
text
)
text
=
remove_non_printing_char
(
text
)
return
text
def
_tokenize
(
self
,
text
,
bypass_tokenizer
=
False
):
""" Tokenize a string.
""" Tokenize a string.
"""
"""
split_tokens
=
[]
split_tokens
=
[]
text
=
text
.
split
(
' '
)
if
bypass_tokenizer
:
text
=
text
.
split
()
else
:
text
=
self
.
moses_pipeline
(
text
)
text
=
self
.
moses_tokenizer
.
tokenize
(
text
,
return_str
=
False
,
escape
=
False
)
for
token
in
text
:
for
token
in
text
:
split_tokens
.
extend
([
t
for
t
in
self
.
bpe
(
token
).
split
(
' '
)])
split_tokens
.
extend
([
t
for
t
in
self
.
bpe
(
token
).
split
(
' '
)])
return
split_tokens
return
split_tokens
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment