"tests/ctrl/test_tokenization_ctrl.py" did not exist on "03c2c762a6cdf0c2b4a424385ba298a95ff12177"
Unverified Commit a14b055b authored by Albert Villanova del Moral, committed by GitHub

Pass datasets trust_remote_code (#31406)

* Pass datasets trust_remote_code

* Pass trust_remote_code in more tests

* Add trust_remote_dataset_code arg to some tests

* Revert "Temporarily pin datasets upper version to fix CI"

This reverts commit b7672826.

* Pass trust_remote_code in librispeech_asr_dummy docstrings

* Revert "Pin datasets<2.20.0 for examples"

This reverts commit 833fc17a.

* Pass trust_remote_code to all examples

* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects

* Pass trust_remote_code to tests

* Pass trust_remote_code to docstrings

* Fix flax examples tests requirements

* Pass trust_remote_dataset_code arg to tests

* Replace trust_remote_dataset_code with trust_remote_code in one example

* Fix duplicate trust_remote_code

* Replace args.trust_remote_dataset_code with args.trust_remote_code

* Replace trust_remote_dataset_code with trust_remote_code in parser

* Replace trust_remote_dataset_code with trust_remote_code in dataclasses

* Replace trust_remote_dataset_code with trust_remote_code arg
parent 485fd814
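
Every hunk in this diff applies one pattern: pass an explicit `trust_remote_code` flag through to `datasets.load_dataset`, either wired from a `--trust_remote_code` argument (examples, tests) or hard-coded as `trust_remote_code=True` (conversion scripts, docstrings). A minimal sketch of the call pattern, using one of the script-based dataset repos touched below:

from datasets import load_dataset

# Opt in explicitly: only pass trust_remote_code=True for dataset repos whose
# loading script you have read and trust, as the script runs on your machine.
ds = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
    trust_remote_code=True,
)
print(ds[0]["audio"]["sampling_rate"])

This explicit flag is also what allows reverting the `datasets<2.20.0` pin below: starting with `datasets` 2.20, loading a script-based dataset without `trust_remote_code=True` is rejected rather than silently trusted. The hunks below apply the pattern file by file.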
@@ -112,9 +112,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
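
The hunk above only rewords the `help` metadata; for context, the full field in the examples' `ModelArguments` dataclass looks roughly like this (a sketch, line wrapping illustrative):

from dataclasses import dataclass, field

@dataclass
class ModelArguments:
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to trust the execution of code from datasets/models defined on the Hub."
                " This option should only be set to `True` for repositories you trust and in which you have read the"
                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )

`HfArgumentParser` exposes a boolean field like this as a `--trust_remote_code` CLI flag, which is how the test invocations below can pass it on the command line.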
@@ -366,6 +366,7 @@ def main():
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
             token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         data_files = {}
...
@@ -316,6 +316,7 @@ class ExamplesTests(TestCasePlus):
         testargs = f"""
             run_image_classification.py
             --dataset_name hf-internal-testing/cats_vs_dogs_sample
+            --trust_remote_code
             --model_name_or_path microsoft/resnet-18
             --do_train
             --do_eval
...
@@ -88,9 +88,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -239,6 +239,7 @@ def main():
             data_args.dataset_name,
             data_args.dataset_config_name,
             token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         data_files = {}
...
@@ -106,9 +106,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -333,6 +333,7 @@ def main():
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
             token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         data_files = {}
...
@@ -13,7 +13,7 @@ TOKENIZER_CLASSES = {
     name: (getattr(transformers, name), getattr(transformers, name + "Fast")) for name in SLOW_TO_FAST_CONVERTERS
 }
-dataset = datasets.load_dataset("xnli", split="test+validation")
+dataset = datasets.load_dataset("facebook/xnli", split="test+validation")  # no-script
 total = 0
 perfect = 0
...
@@ -102,7 +102,7 @@ _deps = [
     "codecarbon==1.2.0",
     "cookiecutter==1.7.3",
     "dataclasses",
-    "datasets!=2.5.0,<2.20.0",  # Temporary upper version
+    "datasets!=2.5.0",
     "decord==0.6.0",
     "deepspeed>=0.9.3",
     "diffusers",
...
@@ -51,7 +51,9 @@ class TextToSpeechTool(PipelineTool):
             if not is_datasets_available():
                 raise ImportError("Datasets needs to be installed if not passing speaker embeddings.")
-            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+            embeddings_dataset = load_dataset(
+                "Matthijs/cmu-arctic-xvectors", split="validation", trust_remote_code=True
+            )
             speaker_embeddings = torch.tensor(embeddings_dataset[7305]["xvector"]).unsqueeze(0)
         return {"input_ids": inputs["input_ids"], "speaker_embeddings": speaker_embeddings}
...
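
The `TextToSpeechTool` change can be exercised standalone; a minimal sketch assuming `datasets` and `torch` are installed (index 7305 comes from the hunk above; the 512-dim shape is the usual SpeechT5 x-vector size, an assumption here):

import torch
from datasets import load_dataset

# This dataset repo ships a loading script, hence trust_remote_code=True.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors", split="validation", trust_remote_code=True
)
speaker_embeddings = torch.tensor(embeddings_dataset[7305]["xvector"]).unsqueeze(0)
print(speaker_embeddings.shape)  # expected: torch.Size([1, 512])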
@@ -202,7 +202,9 @@ class PTtoTFCommand(BaseTransformersCLICommand):
         """
 
         def _get_audio_input():
-            ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+            ds = load_dataset(
+                "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+            )
             speech_samples = ds.sort("id").select(range(2))[:2]["audio"]
             raw_samples = [x["array"] for x in speech_samples]
             return raw_samples
@@ -234,7 +236,7 @@ class PTtoTFCommand(BaseTransformersCLICommand):
                 }
             )
             if "pixel_values" in model_forward_signature:
-                sample_images = load_dataset("cifar10", "plain_text", split="test")[:2]["img"]
+                sample_images = load_dataset("uoft-cs/cifar10", "plain_text", split="test")[:2]["img"]  # no-script
                 processor_inputs.update({"images": sample_images})
             if "input_features" in model_forward_signature:
                 feature_extractor_signature = inspect.signature(processor.feature_extractor).parameters
...
@@ -9,7 +9,7 @@ deps = {
     "codecarbon": "codecarbon==1.2.0",
     "cookiecutter": "cookiecutter==1.7.3",
     "dataclasses": "dataclasses",
-    "datasets": "datasets!=2.5.0,<2.20.0",
+    "datasets": "datasets!=2.5.0",
     "decord": "decord==0.6.0",
     "deepspeed": "deepspeed>=0.9.3",
     "diffusers": "diffusers",
...
@@ -1760,7 +1760,7 @@ class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor):
     >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
     >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
-    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
     >>> # Whisper has `begin_suppress_tokens` set by default (= `[220, 50256]`). 50256 is the EOS token, so this means
@@ -1812,7 +1812,7 @@ class SuppressTokensLogitsProcessor(LogitsProcessor):
     >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
     >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
-    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
     >>> # Whisper has a long list of suppressed tokens. For instance, in this case, the token 1 is suppressed by default.
@@ -1901,7 +1901,7 @@ class WhisperTimeStampLogitsProcessor(LogitsProcessor):
     >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
     >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
-    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> inputs = processor(ds[3]["audio"]["array"], return_tensors="pt")
     >>> input_features = inputs.input_features
...
@@ -205,7 +205,8 @@ def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_fo
     feature_extractor = ASTFeatureExtractor(mean=mean, std=std, max_length=max_length)
     if "speech-commands" in model_name:
-        dataset = load_dataset("speech_commands", "v0.02", split="validation")
+        # TODO: Convert dataset to Parquet
+        dataset = load_dataset("google/speech_commands", "v0.02", split="validation", trust_remote_code=True)
         waveform = dataset[0]["audio"]["array"]
     else:
         filepath = hf_hub_download(
...
@@ -266,7 +266,7 @@ def convert_beit_checkpoint(checkpoint_url, pytorch_dump_folder_path):
     # Check outputs on an image
     if is_semantic:
         image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False)
-        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
         image = Image.open(ds[0]["file"])
     else:
         image_processor = BeitImageProcessor(
...
@@ -2409,7 +2409,7 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel):
     >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")
     >>> model = BigBirdForMaskedLM.from_pretrained("google/bigbird-roberta-base")
-    >>> squad_ds = load_dataset("squad_v2", split="train")  # doctest: +IGNORE_RESULT
+    >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train")  # doctest: +IGNORE_RESULT
     >>> # select random long article
     >>> LONG_ARTICLE_TARGET = squad_ds[81514]["context"]
@@ -2711,7 +2711,7 @@ class BigBirdForSequenceClassification(BigBirdPreTrainedModel):
     >>> tokenizer = AutoTokenizer.from_pretrained("l-yohai/bigbird-roberta-base-mnli")
     >>> model = BigBirdForSequenceClassification.from_pretrained("l-yohai/bigbird-roberta-base-mnli")
-    >>> squad_ds = load_dataset("squad_v2", split="train")  # doctest: +IGNORE_RESULT
+    >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train")  # doctest: +IGNORE_RESULT
     >>> LONG_ARTICLE = squad_ds[81514]["context"]
     >>> inputs = tokenizer(LONG_ARTICLE, return_tensors="pt")
@@ -3040,7 +3040,7 @@ class BigBirdForQuestionAnswering(BigBirdPreTrainedModel):
     >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")
     >>> model = BigBirdForQuestionAnswering.from_pretrained("google/bigbird-roberta-base")
-    >>> squad_ds = load_dataset("squad_v2", split="train")  # doctest: +IGNORE_RESULT
+    >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train")  # doctest: +IGNORE_RESULT
     >>> # select random article and question
     >>> LONG_ARTICLE = squad_ds[81514]["context"]
...
@@ -1681,7 +1681,7 @@ class ClvpModelForConditionalGeneration(ClvpPreTrainedModel):
     >>> # Define the Text and Load the Audio (We are taking an audio example from HuggingFace Hub using `datasets` library)
     >>> text = "This is an example text."
-    >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
     >>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
@@ -1754,7 +1754,7 @@ class ClvpModelForConditionalGeneration(ClvpPreTrainedModel):
     >>> # Define the Text and Load the Audio (We are taking an audio example from HuggingFace Hub using `datasets` library)
     >>> text = "This is an example text."
-    >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
     >>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
...
@@ -226,7 +226,7 @@ def convert_wav2vec2_checkpoint(
     processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60")
-    ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
+    ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     input_audio = [x["array"] for x in ds[:4]["audio"]]
     inputs = processor(input_audio, return_tensors="pt", padding=True)
...
@@ -831,7 +831,7 @@ class Speech2Text2ForCausalLM(Speech2Text2PreTrainedModel):
     >>> model.config.decoder_start_token_id = tokenizer.bos_token_id
     >>> # pre-process inputs and labels
-    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> inputs = feature_extractor(
     ...     ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt"
     ... )
...
@@ -148,7 +148,7 @@ def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_
     model.load_state_dict(new_state_dict)
     # verify results on scanned document
-    dataset = load_dataset("hf-internal-testing/example-documents")
+    dataset = load_dataset("hf-internal-testing/example-documents")  # no-script
     image = dataset["test"][0]["image"].convert("RGB")
     tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True)
...
@@ -1431,7 +1431,7 @@ class HubertModel(HubertPreTrainedModel):
     ...     return batch
-    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> ds = ds.map(map_to_array)
     >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
...
@@ -1471,7 +1471,7 @@ class TFHubertModel(TFHubertPreTrainedModel):
     ...     return batch
-    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> ds = ds.map(map_to_array)
     >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values  # Batch size 1
@@ -1583,7 +1583,7 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
     ...     return batch
-    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> ds = ds.map(map_to_array)
     >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values  # Batch size 1
...
@@ -1294,7 +1294,7 @@ class LayoutLMForQuestionAnswering(LayoutLMPreTrainedModel):
     >>> tokenizer = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa", add_prefix_space=True)
     >>> model = LayoutLMForQuestionAnswering.from_pretrained("impira/layoutlm-document-qa", revision="1e3ebac")
-    >>> dataset = load_dataset("nielsr/funsd", split="train")
+    >>> dataset = load_dataset("nielsr/funsd", split="train", trust_remote_code=True)
     >>> example = dataset[0]
     >>> question = "what's his name?"
     >>> words = example["words"]
...