Unverified Commit f394a2a5 authored by Patrick von Platen's avatar Patrick von Platen Committed by GitHub
Browse files

[Json configs] Make json prettier for all saved tokenizer files & ensure same...

[Json configs] Make json prettier for all saved tokenizer files & ensure same json format for all processors (tok + feat_extract) (#17457)

* [Json dump] Make json prettier

* correct more tokenizeirs

* more patterns

* add aggressive test

* the aggressive test was actually useful :-)

* more tests

* Apply suggestions from code review
parent 6ee1474b
......@@ -25,7 +25,7 @@ from pathlib import Path
from huggingface_hub import Repository, delete_repo, login
from requests.exceptions import HTTPError
from transformers import AutoFeatureExtractor, Wav2Vec2FeatureExtractor
from transformers.testing_utils import PASS, USER, get_tests_dir, is_staging_test
from transformers.testing_utils import PASS, USER, check_json_file_has_correct_format, get_tests_dir, is_staging_test
from transformers.utils import is_torch_available, is_vision_available
......@@ -107,7 +107,8 @@ class FeatureExtractionSavingTestMixin:
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
feat_extract_first.save_pretrained(tmpdirname)
saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
check_json_file_has_correct_format(saved_file)
feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)
self.assertEqual(feat_extract_second.to_dict(), feat_extract_first.to_dict())
......
......@@ -51,6 +51,7 @@ from transformers import (
from transformers.testing_utils import (
PASS,
USER,
check_json_file_has_correct_format,
get_tests_dir,
is_pt_tf_cross_test,
is_staging_test,
......@@ -3325,6 +3326,11 @@ class TokenizerTesterMixin:
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
# make sure that all ".json" files are saved in the correct format
for file_path in tokenizer_r_files + tokenizer_p_files:
if os.path.exists(file_path) and file_path.endswith(".json"):
check_json_file_has_correct_format(file_path)
# Checks it save with the same files + the tokenizer.json file for the fast one
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment