Delete extra test file (#7681)

Commit 0af53b1e (unverified) in chenpangpang/transformers
Authored Oct 09, 2020 by Sam Shleifer; committed by GitHub on Oct 09, 2020
Parent: b0f05e0c
Showing 1 changed file with 0 additions and 50 deletions.
test_tokenization_blenderbot.py  deleted  100644 → 0  (+0 −50)
import json
import os
import unittest

from transformers.testing_utils import slow
from transformers.tokenization_blenderbot import VOCAB_FILES_NAMES, BlenderbotTokenizer, BlenderbotSmallTokenizer

from .test_tokenization_common import TokenizerTesterMixin


class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = BlenderbotSmallTokenizer

    def setUp(self):
        super().setUp()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
        vocab = ["adapt", "react", "read@@", "ap@@", "t", "__unk__", "__start__", "__end__", "__null__"]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))

        merges = ["#version: 0.2", "a p", "ap t</w>", "r e", "a d", "ad apt</w>", ""]
        self.special_tokens_map = {"bos_token": "__start", "eos_token": "__end__", "pad_token": "__null__", "unk_token": "__unk__"}

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(vocab_tokens) + "\n")
        with open(self.merges_file, "w", encoding="utf-8") as fp:
            fp.write("\n".join(merges))

    def get_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return BlenderbotSmallTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self, tokenizer):
        input_text = "adapt react readapt apt"
        output_text = "adapt react readapt apt"
        return input_text, output_text

    def test_full_blenderbot_small_tokenizer(self):
        tokenizer = BlenderbotSmallTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
        text = "adapt react readapt apt"
        bpe_tokens = ["adapt", "react", "read@@", "ap@@", "t", "ap@@", "t"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = [tokenizer.bos_token] + tokens + [tokenizer.eos_token]
        print(input_tokens)

        # input_bpe_tokens = [0, 1, 2, 4, 5, 1, 0, 3, 6]
        # self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
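Not part of the diff: below is a minimal standalone sketch of the fixture that the deleted setUp() builds, shown only to make the vocab/merges round trip explicit outside of unittest. It assumes the transformers version this commit targets (with the flat transformers.tokenization_blenderbot module); the tempfile handling here is illustrative and not taken from the deleted file.

# Standalone sketch (assumption: run against the same transformers version as this commit).
import json
import os
import tempfile

from transformers.tokenization_blenderbot import VOCAB_FILES_NAMES, BlenderbotSmallTokenizer

# Same tiny BPE vocab and merge rules as the deleted test fixture.
vocab = ["adapt", "react", "read@@", "ap@@", "t", "__unk__", "__start__", "__end__", "__null__"]
merges = ["#version: 0.2", "a p", "ap t</w>", "r e", "a d", "ad apt</w>", ""]
special_tokens_map = {"bos_token": "__start", "eos_token": "__end__", "pad_token": "__null__", "unk_token": "__unk__"}

with tempfile.TemporaryDirectory() as tmpdir:
    vocab_file = os.path.join(tmpdir, VOCAB_FILES_NAMES["vocab_file"])
    merges_file = os.path.join(tmpdir, VOCAB_FILES_NAMES["merges_file"])
    # Write the vocab as JSON and the merges one rule per line, as setUp() does.
    with open(vocab_file, "w", encoding="utf-8") as fp:
        fp.write(json.dumps(dict(zip(vocab, range(len(vocab))))) + "\n")
    with open(merges_file, "w", encoding="utf-8") as fp:
        fp.write("\n".join(merges))

    tokenizer = BlenderbotSmallTokenizer(vocab_file, merges_file, **special_tokens_map)
    print(tokenizer.tokenize("adapt react readapt apt"))
    # Per the deleted test's assertion, this should print:
    # ['adapt', 'react', 'read@@', 'ap@@', 't', 'ap@@', 't']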