Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
24831477
Commit
24831477
authored
Oct 08, 2019
by
thomwolf
Browse files
fix tokenization
parent
03c2c762
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
2 additions
and
2 deletions
+2
-2
transformers/tests/tokenization_ctrl_test.py
transformers/tests/tokenization_ctrl_test.py
+1
-1
transformers/tokenization_ctrl.py
transformers/tokenization_ctrl.py
+1
-1
No files found.
transformers/tests/tokenization_ctrl_test.py
View file @
24831477
...
@@ -55,7 +55,7 @@ class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester):
...
@@ -55,7 +55,7 @@ class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester):
tokenizer
=
CTRLTokenizer
(
self
.
vocab_file
,
self
.
merges_file
,
**
self
.
special_tokens_map
)
tokenizer
=
CTRLTokenizer
(
self
.
vocab_file
,
self
.
merges_file
,
**
self
.
special_tokens_map
)
text
=
"adapt react readapt apt"
text
=
"adapt react readapt apt"
bpe_tokens
=
'adapt re@@ a@@ c@@ t re@@ adapt apt'
.
split
()
bpe_tokens
=
'adapt re@@ a@@ c@@ t re@@ adapt apt'
.
split
()
tokens
=
tokenizer
.
tokenize
(
text
,
add_prefix_space
=
True
)
tokens
=
tokenizer
.
tokenize
(
text
)
self
.
assertListEqual
(
tokens
,
bpe_tokens
)
self
.
assertListEqual
(
tokens
,
bpe_tokens
)
input_tokens
=
tokens
+
[
tokenizer
.
unk_token
]
input_tokens
=
tokens
+
[
tokenizer
.
unk_token
]
...
...
transformers/tokenization_ctrl.py
View file @
24831477
...
@@ -205,7 +205,7 @@ class CTRLTokenizer(PreTrainedTokenizer):
...
@@ -205,7 +205,7 @@ class CTRLTokenizer(PreTrainedTokenizer):
def convert_tokens_to_string(self, tokens):
    """Convert a sequence of BPE tokens (strings) into a single string.

    Tokens use the subword-nmt convention: a trailing ``@@`` marks a
    subword that continues into the next token (e.g. ``re@@ a@@ c@@ t``
    -> ``react``). Joining with spaces and then deleting every ``'@@ '``
    pair glues continuation pieces back together while keeping single
    spaces between whole words.

    Args:
        tokens: list of BPE token strings produced by ``tokenize``.

    Returns:
        The detokenized text, with leading/trailing whitespace stripped.
    """
    # NOTE: order matters — joining with '' and replacing '@@' with ' '
    # (the pre-fix variant) fuses adjacent whole words together.
    out_string = ' '.join(tokens).replace('@@ ', '').strip()
    return out_string
def
save_vocabulary
(
self
,
save_directory
):
def
save_vocabulary
(
self
,
save_directory
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment