chenpangpang / transformers · Commits

Commit 43a237f1
authored Oct 10, 2019 by thomwolf

switching to moses tokenizer

parent 036483fa
Showing 1 changed file with 19 additions and 31 deletions

transformers/tokenization_ctrl.py (+19, -31)
@@ -22,8 +22,9 @@ import os
 import regex as re
 from io import open

-from .tokenization_bert import BasicTokenizer
+import sacremoses as sm
+
+from .tokenization_xlm import replace_unicode_punct, remove_non_printing_char
 from .tokenization_utils import PreTrainedTokenizer

 logger = logging.getLogger(__name__)
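For context (not part of this commit): the two helpers now imported from .tokenization_xlm roughly map Unicode punctuation to ASCII and strip non-printing characters. Below is a simplified, hypothetical sketch of that behaviour; the names ending in _sketch and the reduced replacement table are illustrative only, not the library code.

# Illustrative approximations, not the library implementations.
import unicodedata

PUNCT_TABLE = {"，": ",", "。": ".", "、": ",", "“": '"', "”": '"', "？": "?", "！": "!"}  # subset only

def replace_unicode_punct_sketch(text):
    # Map common Unicode punctuation to ASCII equivalents (Moses-style cleanup).
    return "".join(PUNCT_TABLE.get(ch, ch) for ch in text)

def remove_non_printing_char_sketch(text):
    # Drop characters whose Unicode category starts with "C" (control, format, ...).
    return "".join(ch for ch in text if not unicodedata.category(ch).startswith("C"))

print(replace_unicode_punct_sketch("Hello，world！"))  # Hello,world!
print(remove_non_printing_char_sketch("a\u200bb"))     # ab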
@@ -48,39 +49,11 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'ctrl': 256,
 }

-def text_standardize(text):
-    """
-    fixes some issues the spacy tokenizer had on books corpus
-    also does some whitespace standardization
-    """
-    text = text.replace('—', '-')
-    text = text.replace('–', '-')
-    text = text.replace('―', '-')
-    text = text.replace('…', '...')
-    text = text.replace('´', "'")
-    text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
-    text = re.sub(r'\s*\n\s*', ' \n ', text)
-    text = re.sub(r'[^\S\n]+', ' ', text)
-    return text.strip()
-
 def get_pairs(word):
     """Return set of symbol pairs in a word.

     Word is represented as tuple of symbols (symbols being variable-length strings).
     """
-    # pairs = []
-    # prev_char = word[0]
-    # for i, char in enumerate(word[1:]):
-    #     #_i = i + 1
-    #     #if word[_i+1:] == tuple('</w>'):
-    #     #    pairs.append((prev_char, char+'</w>'))
-    #     #    break
-    #     #else:
-    #     if True:
-    #         pairs.append((prev_char, char))
-    #         prev_char = char
     pairs = set()
     prev_char = word[0]
     for char in word[1:]:
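The hunk above cuts off after the first lines of get_pairs. For reference, a self-contained sketch of the full function, assuming it finishes with the standard BPE pair-collection loop:

def get_pairs(word):
    """Return the set of adjacent symbol pairs in a word, where the word is
    given as a tuple of symbols (variable-length strings)."""
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))  # collect each adjacent (left, right) symbol pair
        prev_char = char
    return pairs

print(get_pairs(("h", "e", "l", "l", "o")))
# e.g. {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')} -- set order may vary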
@@ -108,6 +81,9 @@ class CTRLTokenizer(PreTrainedTokenizer):
         self.max_len_single_sentence = self.max_len  # no default special tokens - you can update this value if you add special tokens
         self.max_len_sentences_pair = self.max_len  # no default special tokens - you can update this value if you add special tokens

+        self.punct_normalizer = sm.MosesPunctNormalizer(lang='en')
+        self.moses_tokenizer = sm.MosesTokenizer(lang='en')
+
         self.encoder = json.load(open(vocab_file, encoding="utf-8"))
         self.decoder = {v:k for k,v in self.encoder.items()}
         merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
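A minimal standalone sketch of what the two new attributes wrap, calling sacremoses directly; the input string and the printed tokens are illustrative only:

import sacremoses as sm

punct_normalizer = sm.MosesPunctNormalizer(lang='en')  # Moses punctuation normalization
moses_tokenizer = sm.MosesTokenizer(lang='en')         # Moses word tokenizer

text = punct_normalizer.normalize("Hello „world“ …")
tokens = moses_tokenizer.tokenize(text, return_str=False, escape=False)
print(tokens)  # e.g. ['Hello', '"', 'world', '"', '...']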
@@ -162,11 +138,23 @@ class CTRLTokenizer(PreTrainedTokenizer):
         self.cache[token] = word
         return word

-    def _tokenize(self, text):
+    def moses_pipeline(self, text):
+        text = replace_unicode_punct(text)
+        text = self.punct_normalizer.normalize(text)
+        text = remove_non_printing_char(text)
+        return text
+
+    def _tokenize(self, text, bypass_tokenizer=False):
         """ Tokenize a string.
         """
         split_tokens = []
-        text = text.split(' ')
+        if bypass_tokenizer:
+            text = text.split()
+        else:
+            text = self.moses_pipeline(text)
+            text = self.moses_tokenizer.tokenize(text, return_str=False, escape=False)
         for token in text:
             split_tokens.extend([t for t in self.bpe(token).split(' ')])
         return split_tokens
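A hedged usage sketch of the updated tokenizer (assumes the published 'ctrl' checkpoint files are available to from_pretrained); _tokenize is the internal hook changed above, called directly here only for illustration:

from transformers import CTRLTokenizer

tokenizer = CTRLTokenizer.from_pretrained('ctrl')

# Default path: Moses punctuation normalization + Moses tokenization, then BPE.
print(tokenizer.tokenize("Hello, world! It's a test…"))

# bypass_tokenizer=True skips the Moses pipeline and whitespace-splits before BPE,
# which is useful when the input is already pre-tokenized.
print(tokenizer._tokenize("Hello , world !", bypass_tokenizer=True))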