Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
177a7212
Commit
177a7212
authored
Oct 10, 2019
by
thomwolf
Browse files
move back to simple space spliting
parent
a5997dd8
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
3 additions
and
19 deletions
+3
-19
examples/run_generation.py
examples/run_generation.py
+1
-1
transformers/tokenization_ctrl.py
transformers/tokenization_ctrl.py
+2
-18
No files found.
examples/run_generation.py
View file @
177a7212
...
...
@@ -194,7 +194,7 @@ def main():
elif
args
.
length
<
0
:
args
.
length
=
MAX_LENGTH
# avoid infinite loop
print
(
args
)
logger
.
info
(
args
)
if
args
.
model_type
in
[
"ctrl"
]:
if
args
.
temperature
>
0.7
:
logger
.
info
(
'CTRL typically works better with lower temperatures (and lower top_k).'
)
...
...
transformers/tokenization_ctrl.py
View file @
177a7212
...
...
@@ -22,9 +22,6 @@ import os
import
regex
as
re
from
io
import
open
import
sacremoses
as
sm
from
.tokenization_xlm
import
replace_unicode_punct
,
remove_non_printing_char
from
.tokenization_utils
import
PreTrainedTokenizer
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -81,9 +78,6 @@ class CTRLTokenizer(PreTrainedTokenizer):
self
.
max_len_single_sentence
=
self
.
max_len
# no default special tokens - you can update this value if you add special tokens
self
.
max_len_sentences_pair
=
self
.
max_len
# no default special tokens - you can update this value if you add special tokens
self
.
punct_normalizer
=
sm
.
MosesPunctNormalizer
(
lang
=
'en'
)
self
.
moses_tokenizer
=
sm
.
MosesTokenizer
(
lang
=
'en'
)
self
.
encoder
=
json
.
load
(
open
(
vocab_file
,
encoding
=
"utf-8"
))
self
.
decoder
=
{
v
:
k
for
k
,
v
in
self
.
encoder
.
items
()}
merges
=
open
(
merges_file
,
encoding
=
'utf-8'
).
read
().
split
(
'
\n
'
)[
1
:
-
1
]
...
...
@@ -138,22 +132,12 @@ class CTRLTokenizer(PreTrainedTokenizer):
self
.
cache
[
token
]
=
word
return
word
def
moses_pipeline
(
self
,
text
):
text
=
replace_unicode_punct
(
text
)
text
=
self
.
punct_normalizer
.
normalize
(
text
)
text
=
remove_non_printing_char
(
text
)
return
text
def
_tokenize
(
self
,
text
,
bypass_tokenizer
=
False
):
def
_tokenize
(
self
,
text
):
""" Tokenize a string.
"""
split_tokens
=
[]
if
bypass_tokenizer
:
text
=
text
.
split
()
else
:
text
=
self
.
moses_pipeline
(
text
)
text
=
self
.
moses_tokenizer
.
tokenize
(
text
,
return_str
=
False
,
escape
=
False
)
text
=
text
.
split
(
' '
)
for
token
in
text
:
split_tokens
.
extend
([
t
for
t
in
self
.
bpe
(
token
).
split
(
' '
)])
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment