Unverified commit a14b055b, authored by Albert Villanova del Moral and committed by GitHub
Browse files

Pass datasets trust_remote_code (#31406)

* Pass datasets trust_remote_code

* Pass trust_remote_code in more tests

* Add trust_remote_dataset_code arg to some tests

* Revert "Temporarily pin datasets upper version to fix CI"

This reverts commit b7672826.

* Pass trust_remote_code in librispeech_asr_dummy docstrings

* Revert "Pin datasets<2.20.0 for examples"

This reverts commit 833fc17a.

* Pass trust_remote_code to all examples

* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects

* Pass trust_remote_code to tests

* Pass trust_remote_code to docstrings

* Fix flax examples tests requirements

* Pass trust_remote_dataset_code arg to tests

* Replace trust_remote_dataset_code with trust_remote_code in one example

* Fix duplicate trust_remote_code

* Replace args.trust_remote_dataset_code with args.trust_remote_code

* Replace trust_remote_dataset_code with trust_remote_code in parser

* Replace trust_remote_dataset_code with trust_remote_code in dataclasses

* Replace trust_remote_dataset_code with trust_remote_code arg
parent 485fd814
...@@ -567,7 +567,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase): ...@@ -567,7 +567,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
image_segmenter = pipeline("image-segmentation", model=model, image_processor=image_processor) image_segmenter = pipeline("image-segmentation", model=model, image_processor=image_processor)
image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test") image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
file = image[0]["file"] file = image[0]["file"]
outputs = image_segmenter(file, threshold=threshold) outputs = image_segmenter(file, threshold=threshold)
...@@ -621,7 +621,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase): ...@@ -621,7 +621,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
def test_oneformer(self): def test_oneformer(self):
image_segmenter = pipeline(model="shi-labs/oneformer_ade20k_swin_tiny") image_segmenter = pipeline(model="shi-labs/oneformer_ade20k_swin_tiny")
image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test") image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
file = image[0]["file"] file = image[0]["file"]
outputs = image_segmenter(file, threshold=0.99) outputs = image_segmenter(file, threshold=0.99)
# Shortening by hashing # Shortening by hashing
......
...@@ -178,7 +178,7 @@ class GgufIntegrationTests(unittest.TestCase): ...@@ -178,7 +178,7 @@ class GgufIntegrationTests(unittest.TestCase):
gguf_tokenizer = AutoTokenizer.from_pretrained(self.model_id, gguf_file=self.q8_0_gguf_model_id) gguf_tokenizer = AutoTokenizer.from_pretrained(self.model_id, gguf_file=self.q8_0_gguf_model_id)
original_tokenizer = AutoTokenizer.from_pretrained(self.original_model_id) original_tokenizer = AutoTokenizer.from_pretrained(self.original_model_id)
dataset = load_dataset("code_x_glue_ct_code_to_text", "go") dataset = load_dataset("google/code_x_glue_ct_code_to_text", "go")
for item in tqdm.tqdm(dataset["validation"]): for item in tqdm.tqdm(dataset["validation"]):
string = item["code"] string = item["code"]
encoded1 = gguf_tokenizer.encode(string) encoded1 = gguf_tokenizer.encode(string)
...@@ -191,7 +191,7 @@ class GgufIntegrationTests(unittest.TestCase): ...@@ -191,7 +191,7 @@ class GgufIntegrationTests(unittest.TestCase):
self.assertEqual(decoded1, decoded2) self.assertEqual(decoded1, decoded2)
dataset = load_dataset("xnli", "all_languages") dataset = load_dataset("facebook/xnli", "all_languages")
for i, item in enumerate(tqdm.tqdm(dataset["train"].select(range(100)))): for i, item in enumerate(tqdm.tqdm(dataset["train"].select(range(100)))):
for string in item["premise"].values(): for string in item["premise"].values():
......
...@@ -253,7 +253,7 @@ def main(): ...@@ -253,7 +253,7 @@ def main():
# download the dataset. # download the dataset.
if data_args.task_name is not None: if data_args.task_name is not None:
# Downloading and loading a dataset from the hub. # Downloading and loading a dataset from the hub.
datasets = load_dataset("glue", data_args.task_name) datasets = load_dataset("nyu-mll/glue", data_args.task_name)
else: else:
# Loading a dataset from your local files. # Loading a dataset from your local files.
# CSV/JSON training and evaluation files are needed. # CSV/JSON training and evaluation files are needed.
......
...@@ -56,7 +56,7 @@ if __name__ == "__main__": ...@@ -56,7 +56,7 @@ if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
# Load dataset # Load dataset
train_dataset, test_dataset = load_dataset("imdb", split=["train", "test"]) train_dataset, test_dataset = load_dataset("stanfordnlp/imdb", split=["train", "test"])
train_dataset = train_dataset.shuffle().select(range(5000)) # smaller the size for train dataset to 5k train_dataset = train_dataset.shuffle().select(range(5000)) # smaller the size for train dataset to 5k
test_dataset = test_dataset.shuffle().select(range(500)) # smaller the size for test dataset to 500 test_dataset = test_dataset.shuffle().select(range(500)) # smaller the size for test dataset to 500
......
...@@ -50,7 +50,7 @@ def fit(model, loss, opt, train_dataset, epochs, train_batch_size, max_steps=Non ...@@ -50,7 +50,7 @@ def fit(model, loss, opt, train_dataset, epochs, train_batch_size, max_steps=Non
def get_datasets(tokenizer, train_batch_size, eval_batch_size): def get_datasets(tokenizer, train_batch_size, eval_batch_size):
# Load dataset # Load dataset
train_dataset, test_dataset = load_dataset("imdb", split=["train", "test"]) train_dataset, test_dataset = load_dataset("stanfordnlp/imdb", split=["train", "test"])
# Preprocess train dataset # Preprocess train dataset
train_dataset = train_dataset.map( train_dataset = train_dataset.map(
......
...@@ -43,8 +43,8 @@ class Seq2seqTrainerTester(TestCasePlus): ...@@ -43,8 +43,8 @@ class Seq2seqTrainerTester(TestCasePlus):
bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
bert2bert.config.max_length = 128 bert2bert.config.max_length = 128
train_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]") train_dataset = datasets.load_dataset("abisee/cnn_dailymail", "3.0.0", split="train[:1%]")
val_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="validation[:1%]") val_dataset = datasets.load_dataset("abisee/cnn_dailymail", "3.0.0", split="validation[:1%]")
train_dataset = train_dataset.select(range(32)) train_dataset = train_dataset.select(range(32))
val_dataset = val_dataset.select(range(16)) val_dataset = val_dataset.select(range(16))
...@@ -145,7 +145,7 @@ class Seq2seqTrainerTester(TestCasePlus): ...@@ -145,7 +145,7 @@ class Seq2seqTrainerTester(TestCasePlus):
MAX_INPUT_LENGTH = 256 MAX_INPUT_LENGTH = 256
MAX_TARGET_LENGTH = 256 MAX_TARGET_LENGTH = 256
dataset = datasets.load_dataset("gsm8k", "main", split="train[:38]") dataset = datasets.load_dataset("openai/gsm8k", "main", split="train[:38]")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small") model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small") tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt", padding="longest") data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt", padding="longest")
......
...@@ -259,7 +259,9 @@ class AudioUtilsFunctionTester(unittest.TestCase): ...@@ -259,7 +259,9 @@ class AudioUtilsFunctionTester(unittest.TestCase):
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):
from datasets import load_dataset from datasets import load_dataset
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
return [x["array"] for x in speech_samples] return [x["array"] for x in speech_samples]
......
...@@ -1080,7 +1080,7 @@ def build(config_class, models_to_create, output_dir): ...@@ -1080,7 +1080,7 @@ def build(config_class, models_to_create, output_dir):
it. Models in different frameworks with the same architecture will be saved in the same subdirectory. it. Models in different frameworks with the same architecture will be saved in the same subdirectory.
""" """
if data["training_ds"] is None or data["testing_ds"] is None: if data["training_ds"] is None or data["testing_ds"] is None:
ds = load_dataset("wikitext", "wikitext-2-raw-v1") ds = load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1")
data["training_ds"] = ds["train"] data["training_ds"] = ds["train"]
data["testing_ds"] = ds["test"] data["testing_ds"] = ds["test"]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment