"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "1f843991716bba3b73071f1a2d819b0aebc05375"
Unverified Commit f394a2a5 authored by Patrick von Platen's avatar Patrick von Platen Committed by GitHub
Browse files

[Json configs] Make json prettier for all saved tokenizer files & ensure same...

[Json configs] Make json prettier for all saved tokenizer files & ensure same json format for all processors (tok + feat_extract) (#17457)

* [Json dump] Make json prettier

* correct more tokenizers

* more patterns

* add aggressive test

* the aggressive test was actually useful :-)

* more tests

* Apply suggestions from code review
parent 6ee1474b
...@@ -25,7 +25,7 @@ from pathlib import Path ...@@ -25,7 +25,7 @@ from pathlib import Path
from huggingface_hub import Repository, delete_repo, login from huggingface_hub import Repository, delete_repo, login
from requests.exceptions import HTTPError from requests.exceptions import HTTPError
from transformers import AutoFeatureExtractor, Wav2Vec2FeatureExtractor from transformers import AutoFeatureExtractor, Wav2Vec2FeatureExtractor
from transformers.testing_utils import PASS, USER, get_tests_dir, is_staging_test from transformers.testing_utils import PASS, USER, check_json_file_has_correct_format, get_tests_dir, is_staging_test
from transformers.utils import is_torch_available, is_vision_available from transformers.utils import is_torch_available, is_vision_available
...@@ -107,7 +107,8 @@ class FeatureExtractionSavingTestMixin: ...@@ -107,7 +107,8 @@ class FeatureExtractionSavingTestMixin:
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
with tempfile.TemporaryDirectory() as tmpdirname: with tempfile.TemporaryDirectory() as tmpdirname:
feat_extract_first.save_pretrained(tmpdirname) saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
check_json_file_has_correct_format(saved_file)
feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)
self.assertEqual(feat_extract_second.to_dict(), feat_extract_first.to_dict()) self.assertEqual(feat_extract_second.to_dict(), feat_extract_first.to_dict())
......
...@@ -51,6 +51,7 @@ from transformers import ( ...@@ -51,6 +51,7 @@ from transformers import (
from transformers.testing_utils import ( from transformers.testing_utils import (
PASS, PASS,
USER, USER,
check_json_file_has_correct_format,
get_tests_dir, get_tests_dir,
is_pt_tf_cross_test, is_pt_tf_cross_test,
is_staging_test, is_staging_test,
...@@ -3325,6 +3326,11 @@ class TokenizerTesterMixin: ...@@ -3325,6 +3326,11 @@ class TokenizerTesterMixin:
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2) tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
# make sure that all ".json" files are saved in the correct format
for file_path in tokenizer_r_files + tokenizer_p_files:
if os.path.exists(file_path) and file_path.endswith(".json"):
check_json_file_has_correct_format(file_path)
# Checks it save with the same files + the tokenizer.json file for the fast one # Checks it save with the same files + the tokenizer.json file for the fast one
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files)) self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f) tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment