Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
9d8fd2d4
Commit
9d8fd2d4
authored
Jan 15, 2020
by
Julien Chaumond
Browse files
tokenizer.save_pretrained: only save file if non-empty
parent
6e2c28a1
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
6 additions
and
8 deletions
+6
-8
src/transformers/configuration_auto.py
src/transformers/configuration_auto.py
+1
-1
src/transformers/tokenization_utils.py
src/transformers/tokenization_utils.py
+3
-5
tests/test_tokenization_auto.py
tests/test_tokenization_auto.py
+2
-2
No files found.
src/transformers/configuration_auto.py
View file @
9d8fd2d4
...
...
@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Auto
Model
class. """
""" Auto
Config
class. """
import
logging
...
...
src/transformers/tokenization_utils.py
View file @
9d8fd2d4
...
...
@@ -513,11 +513,9 @@ class PreTrainedTokenizer(object):
with
open
(
special_tokens_map_file
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
f
.
write
(
json
.
dumps
(
self
.
special_tokens_map
,
ensure_ascii
=
False
))
if
len
(
self
.
added_tokens_encoder
)
>
0
:
with
open
(
added_tokens_file
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
if
self
.
added_tokens_encoder
:
out_str
=
json
.
dumps
(
self
.
added_tokens_encoder
,
ensure_ascii
=
False
)
else
:
out_str
=
"{}"
f
.
write
(
out_str
)
vocab_files
=
self
.
save_vocabulary
(
save_directory
)
...
...
tests/test_tokenization_auto.py
View file @
9d8fd2d4
...
...
@@ -33,13 +33,13 @@ class AutoTokenizerTest(unittest.TestCase):
# @slow
def
test_tokenizer_from_pretrained
(
self
):
logging
.
basicConfig
(
level
=
logging
.
INFO
)
for
model_name
in
[
x
for
x
in
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
.
keys
()
if
"japanese"
not
in
x
]
:
for
model_name
in
(
x
for
x
in
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
.
keys
()
if
"japanese"
not
in
x
)
:
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
self
.
assertIsNotNone
(
tokenizer
)
self
.
assertIsInstance
(
tokenizer
,
BertTokenizer
)
self
.
assertGreater
(
len
(
tokenizer
),
0
)
for
model_name
in
list
(
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
.
keys
()
)[:
1
]
:
for
model_name
in
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
.
keys
():
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
self
.
assertIsNotNone
(
tokenizer
)
self
.
assertIsInstance
(
tokenizer
,
GPT2Tokenizer
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment