Unverified Commit a14b055b authored by Albert Villanova del Moral, committed by GitHub

Pass datasets trust_remote_code (#31406)

* Pass datasets trust_remote_code

* Pass trust_remote_code in more tests

* Add trust_remote_dataset_code arg to some tests

* Revert "Temporarily pin datasets upper version to fix CI"

This reverts commit b7672826.

* Pass trust_remote_code in librispeech_asr_dummy docstrings

* Revert "Pin datasets<2.20.0 for examples"

This reverts commit 833fc17a.

* Pass trust_remote_code to all examples

* Revert "Add trust_remote_dataset_code arg to some tests" in research_projects

* Pass trust_remote_code to tests

* Pass trust_remote_code to docstrings

* Fix flax examples tests requirements

* Pass trust_remote_dataset_code arg to tests

* Replace trust_remote_dataset_code with trust_remote_code in one example

* Fix duplicate trust_remote_code

* Replace args.trust_remote_dataset_code with args.trust_remote_code

* Replace trust_remote_dataset_code with trust_remote_code in parser

* Replace trust_remote_dataset_code with trust_remote_code in dataclasses

* Replace trust_remote_dataset_code with trust_remote_code arg
parent 485fd814
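
The hunks below repeat the same two-part pattern: pass `trust_remote_code=True` when loading a script-based dataset, and switch bare dataset IDs to their canonical namespaced form. A minimal sketch of the first part, assuming `datasets>=2.16.0` (where `load_dataset` accepts a `trust_remote_code` argument; from 2.20.0 it is no longer implicitly enabled for script-based datasets):

```python
from datasets import load_dataset

# Script-based datasets now require an explicit opt-in before their loading
# script is executed; recent `datasets` versions refuse to run it otherwise.
ds = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
    trust_remote_code=True,
)
```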
@@ -567,7 +567,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
         image_segmenter = pipeline("image-segmentation", model=model, image_processor=image_processor)
 
-        image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+        image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
         file = image[0]["file"]
         outputs = image_segmenter(file, threshold=threshold)
@@ -621,7 +621,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
     def test_oneformer(self):
         image_segmenter = pipeline(model="shi-labs/oneformer_ade20k_swin_tiny")
-        image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+        image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
         file = image[0]["file"]
         outputs = image_segmenter(file, threshold=0.99)
         # Shortening by hashing
@@ -178,7 +178,7 @@ class GgufIntegrationTests(unittest.TestCase):
         gguf_tokenizer = AutoTokenizer.from_pretrained(self.model_id, gguf_file=self.q8_0_gguf_model_id)
         original_tokenizer = AutoTokenizer.from_pretrained(self.original_model_id)
 
-        dataset = load_dataset("code_x_glue_ct_code_to_text", "go")
+        dataset = load_dataset("google/code_x_glue_ct_code_to_text", "go")
         for item in tqdm.tqdm(dataset["validation"]):
             string = item["code"]
             encoded1 = gguf_tokenizer.encode(string)
@@ -191,7 +191,7 @@ class GgufIntegrationTests(unittest.TestCase):
             self.assertEqual(decoded1, decoded2)
 
-        dataset = load_dataset("xnli", "all_languages")
+        dataset = load_dataset("facebook/xnli", "all_languages")
         for i, item in enumerate(tqdm.tqdm(dataset["train"].select(range(100)))):
             for string in item["premise"].values():
@@ -253,7 +253,7 @@ def main():
     # download the dataset.
     if data_args.task_name is not None:
         # Downloading and loading a dataset from the hub.
-        datasets = load_dataset("glue", data_args.task_name)
+        datasets = load_dataset("nyu-mll/glue", data_args.task_name)
     else:
         # Loading a dataset from your local files.
         # CSV/JSON training and evaluation files are needed.
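
Aside on renames like the one above: several canonical Hub datasets (`glue`, `imdb`, `xnli`, `cnn_dailymail`, ...) moved under organization namespaces, and these hunks switch to the namespaced IDs. A quick sanity check, assuming the bare ID still resolves to the same repository (true at the time of this commit):

```python
from datasets import load_dataset

# Both IDs should point at the same Hub dataset; the namespaced form is preferred.
old = load_dataset("glue", "sst2", split="validation")
new = load_dataset("nyu-mll/glue", "sst2", split="validation")
assert old.num_rows == new.num_rows
```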
@@ -56,7 +56,7 @@ if __name__ == "__main__":
     tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
 
     # Load dataset
-    train_dataset, test_dataset = load_dataset("imdb", split=["train", "test"])
+    train_dataset, test_dataset = load_dataset("stanfordnlp/imdb", split=["train", "test"])
     train_dataset = train_dataset.shuffle().select(range(5000))  # smaller the size for train dataset to 5k
     test_dataset = test_dataset.shuffle().select(range(500))  # smaller the size for test dataset to 500
@@ -50,7 +50,7 @@ def fit(model, loss, opt, train_dataset, epochs, train_batch_size, max_steps=None):
 def get_datasets(tokenizer, train_batch_size, eval_batch_size):
     # Load dataset
-    train_dataset, test_dataset = load_dataset("imdb", split=["train", "test"])
+    train_dataset, test_dataset = load_dataset("stanfordnlp/imdb", split=["train", "test"])
     # Preprocess train dataset
     train_dataset = train_dataset.map(
@@ -43,8 +43,8 @@ class Seq2seqTrainerTester(TestCasePlus):
         bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
         bert2bert.config.max_length = 128
 
-        train_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")
-        val_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="validation[:1%]")
+        train_dataset = datasets.load_dataset("abisee/cnn_dailymail", "3.0.0", split="train[:1%]")
+        val_dataset = datasets.load_dataset("abisee/cnn_dailymail", "3.0.0", split="validation[:1%]")
         train_dataset = train_dataset.select(range(32))
         val_dataset = val_dataset.select(range(16))
@@ -145,7 +145,7 @@ class Seq2seqTrainerTester(TestCasePlus):
         MAX_INPUT_LENGTH = 256
         MAX_TARGET_LENGTH = 256
 
-        dataset = datasets.load_dataset("gsm8k", "main", split="train[:38]")
+        dataset = datasets.load_dataset("openai/gsm8k", "main", split="train[:38]")
         model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")
         tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
         data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt", padding="longest")
@@ -259,7 +259,9 @@ class AudioUtilsFunctionTester(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
 
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]
@@ -1080,7 +1080,7 @@ def build(config_class, models_to_create, output_dir):
     it. Models in different frameworks with the same architecture will be saved in the same subdirectory.
     """
     if data["training_ds"] is None or data["testing_ds"] is None:
-        ds = load_dataset("wikitext", "wikitext-2-raw-v1")
+        ds = load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1")
         data["training_ds"] = ds["train"]
         data["testing_ds"] = ds["test"]