Unverified Commit 39fa4009 authored by Klaus Hipp, committed by GitHub

Fix input data file extension in examples (#28741)

parent 5649c0cb
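
Every hunk below applies the same fix: the file extension passed to load_dataset() was derived from the train file even in the validation (and test) branches, so it is now computed inside the branch that registers each file. A minimal before/after sketch of the pattern, with illustrative variable and file names (the scripts assume all supplied files share one format):

# Before: the extension is always taken from train_file. With
# train_file=None (a validation-only run) this raises
# AttributeError: 'NoneType' object has no attribute 'split';
# with train_file="train.csv" and validation_file="dev.json" it
# silently loads the validation JSON through the csv loader.
data_files = {}
if train_file is not None:
    data_files["train"] = train_file
if validation_file is not None:
    data_files["validation"] = validation_file
extension = train_file.split(".")[-1]

# After: each branch derives the extension from its own file.
data_files = {}
if train_file is not None:
    data_files["train"] = train_file
    extension = train_file.split(".")[-1]
if validation_file is not None:
    data_files["validation"] = validation_file
    extension = validation_file.split(".")[-1]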
@@ -558,9 +558,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-        extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         datasets = load_dataset(
...
@@ -449,9 +449,10 @@ def main():
         dataset_args = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-        extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
             dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
...
@@ -485,9 +485,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-        extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         datasets = load_dataset(
...
@@ -599,9 +599,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-        extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         datasets = load_dataset(
...
@@ -345,9 +345,10 @@ def main():
         dataset_args = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-        extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
             dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks
...
@@ -351,9 +351,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-        extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         raw_datasets = load_dataset(extension, data_files=data_files)
...
@@ -328,9 +328,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-        extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
...
@@ -311,9 +311,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-        extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(
             extension,
             data_files=data_files,
...
@@ -357,9 +357,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-        extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
         # Trim a number of training examples
         if args.debug:
...
@@ -362,11 +362,13 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
+            extension = args.validation_file.split(".")[-1]
         if args.test_file is not None:
             data_files["test"] = args.test_file
-        extension = args.train_file.split(".")[-1]
+            extension = args.test_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files, field="data")
         # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
         # https://huggingface.co/docs/datasets/loading_datasets.
...
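
Scripts that also accept a test file get the same treatment, one extension assignment per split, as in the hunk above and the one below. A quick illustration of the old failure for a hypothetical test-only run:

# Before the fix, args.train_file is None in a test-only run, so
# extension = args.train_file.split(".")[-1] would raise
# AttributeError: 'NoneType' object has no attribute 'split'.
# After the fix, the test branch reads its own file:
extension = args.test_file.split(".")[-1]  # "json" for a predict.json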
@@ -410,11 +410,13 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
+            extension = args.validation_file.split(".")[-1]
         if args.test_file is not None:
             data_files["test"] = args.test_file
-        extension = args.train_file.split(".")[-1]
+            extension = args.test_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files, field="data")
         # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
         # https://huggingface.co/docs/datasets/loading_datasets.
...
@@ -404,9 +404,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-        extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
         # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
         # https://huggingface.co/docs/datasets/loading_datasets.
...
@@ -311,11 +311,13 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
+            extension = data_args.validation_file.split(".")[-1]
         if data_args.test_file is not None:
             data_files["test"] = data_args.test_file
-        extension = data_args.train_file.split(".")[-1]
+            extension = data_args.test_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
         # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
         # https://huggingface.co/docs/datasets/loading_datasets.
...
@@ -339,9 +339,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-        extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
         # Trim a number of training examples
         if args.debug:
...
@@ -384,9 +384,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-        extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
         # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
         # https://huggingface.co/docs/datasets/loading_datasets.
...
@@ -297,9 +297,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-        extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
...
@@ -285,9 +285,10 @@ def main():
         data_files = {}
         if args.train_file is not None:
             data_files["train"] = args.train_file
+            extension = args.train_file.split(".")[-1]
         if args.validation_file is not None:
             data_files["validation"] = args.validation_file
-        extension = args.train_file.split(".")[-1]
+            extension = args.validation_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
         # Trim a number of training examples
         if args.debug:
...
@@ -271,9 +271,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-        extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         datasets = load_dataset(extension, data_files=data_files)
...
@@ -517,9 +517,10 @@ if __name__ == "__main__":
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-        extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         datasets = load_dataset(extension, data_files=data_files)
...
@@ -341,9 +341,10 @@ def main():
         data_files = {}
         if data_args.train_file is not None:
             data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-        extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
         raw_datasets = load_dataset(
...