Unverified Commit a14b055b authored by Albert Villanova del Moral, committed by GitHub

Pass datasets trust_remote_code (#31406)

* Pass datasets trust_remote_code

* Pass trust_remote_code in more tests

* Add trust_remote_dataset_code arg to some tests

* Revert "Temporarily pin datasets upper version to fix CI"

This reverts commit b7672826.

* Pass trust_remote_code in librispeech_asr_dummy docstrings

* Revert "Pin datasets<2.20.0 for examples"

This reverts commit 833fc17a.

* Pass trust_remote_code to all examples

* Revert "Add trust_remote_dataset_code arg to some tests" in research_projects

* Pass trust_remote_code to tests

* Pass trust_remote_code to docstrings

* Fix flax examples tests requirements

* Pass trust_remote_dataset_code arg to tests

* Replace trust_remote_dataset_code with trust_remote_code in one example

* Fix duplicate trust_remote_code

* Replace args.trust_remote_dataset_code with args.trust_remote_code

* Replace trust_remote_dataset_code with trust_remote_code in parser

* Replace trust_remote_dataset_code with trust_remote_code in dataclasses

* Replace trust_remote_dataset_code with trust_remote_code arg
parent 485fd814
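
The hunks below repeat the same two-part pattern: pass `trust_remote_code=True` when loading a script-based dataset, and switch bare dataset IDs to their canonical namespaced form. A minimal sketch of the first part, assuming `datasets>=2.16.0` (where `load_dataset` accepts a `trust_remote_code` argument; from 2.20.0 it is no longer implicitly enabled for script-based datasets):

```python
from datasets import load_dataset

# Script-based datasets now require an explicit opt-in before their loading
# script is executed; recent `datasets` versions refuse to run it otherwise.
ds = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
    trust_remote_code=True,
)
```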
@@ -567,7 +567,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
         image_segmenter = pipeline("image-segmentation", model=model, image_processor=image_processor)
 
-        image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+        image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
         file = image[0]["file"]
         outputs = image_segmenter(file, threshold=threshold)
@@ -621,7 +621,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
     def test_oneformer(self):
         image_segmenter = pipeline(model="shi-labs/oneformer_ade20k_swin_tiny")
-        image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+        image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
         file = image[0]["file"]
         outputs = image_segmenter(file, threshold=0.99)
         # Shortening by hashing
@@ -178,7 +178,7 @@ class GgufIntegrationTests(unittest.TestCase):
         gguf_tokenizer = AutoTokenizer.from_pretrained(self.model_id, gguf_file=self.q8_0_gguf_model_id)
         original_tokenizer = AutoTokenizer.from_pretrained(self.original_model_id)
 
-        dataset = load_dataset("code_x_glue_ct_code_to_text", "go")
+        dataset = load_dataset("google/code_x_glue_ct_code_to_text", "go")
         for item in tqdm.tqdm(dataset["validation"]):
             string = item["code"]
             encoded1 = gguf_tokenizer.encode(string)
@@ -191,7 +191,7 @@ class GgufIntegrationTests(unittest.TestCase):
             self.assertEqual(decoded1, decoded2)
 
-        dataset = load_dataset("xnli", "all_languages")
+        dataset = load_dataset("facebook/xnli", "all_languages")
         for i, item in enumerate(tqdm.tqdm(dataset["train"].select(range(100)))):
             for string in item["premise"].values():
@@ -253,7 +253,7 @@ def main():
     # download the dataset.
     if data_args.task_name is not None:
         # Downloading and loading a dataset from the hub.
-        datasets = load_dataset("glue", data_args.task_name)
+        datasets = load_dataset("nyu-mll/glue", data_args.task_name)
     else:
         # Loading a dataset from your local files.
         # CSV/JSON training and evaluation files are needed.
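
Aside on renames like the one above: several canonical Hub datasets (`glue`, `imdb`, `xnli`, `cnn_dailymail`, ...) moved under organization namespaces, and these hunks switch to the namespaced IDs. A quick sanity check, assuming the bare ID still resolves to the same repository (true at the time of this commit):

```python
from datasets import load_dataset

# Both IDs should point at the same Hub dataset; the namespaced form is preferred.
old = load_dataset("glue", "sst2", split="validation")
new = load_dataset("nyu-mll/glue", "sst2", split="validation")
assert old.num_rows == new.num_rows
```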
@@ -56,7 +56,7 @@ if __name__ == "__main__":
     tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
 
     # Load dataset
-    train_dataset, test_dataset = load_dataset("imdb", split=["train", "test"])
+    train_dataset, test_dataset = load_dataset("stanfordnlp/imdb", split=["train", "test"])
     train_dataset = train_dataset.shuffle().select(range(5000))  # smaller the size for train dataset to 5k
     test_dataset = test_dataset.shuffle().select(range(500))  # smaller the size for test dataset to 500
@@ -50,7 +50,7 @@ def fit(model, loss, opt, train_dataset, epochs, train_batch_size, max_steps=None):
 def get_datasets(tokenizer, train_batch_size, eval_batch_size):
     # Load dataset
-    train_dataset, test_dataset = load_dataset("imdb", split=["train", "test"])
+    train_dataset, test_dataset = load_dataset("stanfordnlp/imdb", split=["train", "test"])
     # Preprocess train dataset
     train_dataset = train_dataset.map(
@@ -43,8 +43,8 @@ class Seq2seqTrainerTester(TestCasePlus):
         bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
         bert2bert.config.max_length = 128
 
-        train_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")
-        val_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="validation[:1%]")
+        train_dataset = datasets.load_dataset("abisee/cnn_dailymail", "3.0.0", split="train[:1%]")
+        val_dataset = datasets.load_dataset("abisee/cnn_dailymail", "3.0.0", split="validation[:1%]")
         train_dataset = train_dataset.select(range(32))
         val_dataset = val_dataset.select(range(16))
@@ -145,7 +145,7 @@ class Seq2seqTrainerTester(TestCasePlus):
         MAX_INPUT_LENGTH = 256
         MAX_TARGET_LENGTH = 256
 
-        dataset = datasets.load_dataset("gsm8k", "main", split="train[:38]")
+        dataset = datasets.load_dataset("openai/gsm8k", "main", split="train[:38]")
         model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")
         tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
         data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt", padding="longest")
@@ -259,7 +259,9 @@ class AudioUtilsFunctionTester(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
 
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]
@@ -1080,7 +1080,7 @@ def build(config_class, models_to_create, output_dir):
     it. Models in different frameworks with the same architecture will be saved in the same subdirectory.
     """
     if data["training_ds"] is None or data["testing_ds"] is None:
-        ds = load_dataset("wikitext", "wikitext-2-raw-v1")
+        ds = load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1")
         data["training_ds"] = ds["train"]
         data["testing_ds"] = ds["test"]