Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
177a7212
Commit
177a7212
authored
Oct 10, 2019
by
thomwolf
Browse files
move back to simple space splitting
parent
a5997dd8
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
3 additions
and
19 deletions
+3
-19
examples/run_generation.py
examples/run_generation.py
+1
-1
transformers/tokenization_ctrl.py
transformers/tokenization_ctrl.py
+2
-18
No files found.
examples/run_generation.py
View file @
177a7212
...
...
@@ -194,7 +194,7 @@ def main():
elif
args
.
length
<
0
:
args
.
length
=
MAX_LENGTH
# avoid infinite loop
print
(
args
)
logger
.
info
(
args
)
if
args
.
model_type
in
[
"ctrl"
]:
if
args
.
temperature
>
0.7
:
logger
.
info
(
'CTRL typically works better with lower temperatures (and lower top_k).'
)
...
...
transformers/tokenization_ctrl.py
View file @
177a7212
...
...
@@ -22,9 +22,6 @@ import os
import
regex
as
re
from
io
import
open
import
sacremoses
as
sm
from
.tokenization_xlm
import
replace_unicode_punct
,
remove_non_printing_char
from
.tokenization_utils
import
PreTrainedTokenizer
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -81,9 +78,6 @@ class CTRLTokenizer(PreTrainedTokenizer):
self
.
max_len_single_sentence
=
self
.
max_len
# no default special tokens - you can update this value if you add special tokens
self
.
max_len_sentences_pair
=
self
.
max_len
# no default special tokens - you can update this value if you add special tokens
self
.
punct_normalizer
=
sm
.
MosesPunctNormalizer
(
lang
=
'en'
)
self
.
moses_tokenizer
=
sm
.
MosesTokenizer
(
lang
=
'en'
)
self
.
encoder
=
json
.
load
(
open
(
vocab_file
,
encoding
=
"utf-8"
))
self
.
decoder
=
{
v
:
k
for
k
,
v
in
self
.
encoder
.
items
()}
merges
=
open
(
merges_file
,
encoding
=
'utf-8'
).
read
().
split
(
'
\n
'
)[
1
:
-
1
]
...
...
@@ -138,22 +132,12 @@ class CTRLTokenizer(PreTrainedTokenizer):
self
.
cache
[
token
]
=
word
return
word
def
moses_pipeline
(
self
,
text
):
text
=
replace_unicode_punct
(
text
)
text
=
self
.
punct_normalizer
.
normalize
(
text
)
text
=
remove_non_printing_char
(
text
)
return
text
def
_tokenize
(
self
,
text
,
bypass_tokenizer
=
False
):
def
_tokenize
(
self
,
text
):
""" Tokenize a string.
"""
split_tokens
=
[]
if
bypass_tokenizer
:
text
=
text
.
split
()
else
:
text
=
self
.
moses_pipeline
(
text
)
text
=
self
.
moses_tokenizer
.
tokenize
(
text
,
return_str
=
False
,
escape
=
False
)
text
=
text
.
split
(
' '
)
for
token
in
text
:
split_tokens
.
extend
([
t
for
t
in
self
.
bpe
(
token
).
split
(
' '
)])
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment