Unverified commit ffbcfc01, authored by V.Prasanna kumar, committed by GitHub

Fix broken links related to Datasets docs (#27569)

Fixed the broken links belonging to the Datasets library docs in Transformers.
parent 638d4998
@@ -10,7 +10,7 @@ way which enables simple and efficient model parallelism.
`run_wav2vec2_pretrain_flax.py` is a lightweight example of how to download and preprocess a dataset from the 🤗 Datasets library or use your own files (jsonlines or csv), then pretrain the wav2vec2 architectures above on it.
-For custom datasets in `jsonlines` format please see: [the Datasets documentation](https://huggingface.co/docs/datasets/loading_datasets.html#json-files) and you also will find examples of these below.
+For custom datasets in `jsonlines` format please see: [the Datasets documentation](https://huggingface.co/docs/datasets/loading_datasets#json-files) and you also will find examples of these below.
Let's start by creating a model repository to save the trained model and logs.
Here we call the model `"wav2vec2-base-robust"`, but you can change the model name as you like.
...
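For reference, loading custom `jsonlines` files as the linked Datasets docs describe looks roughly like this; the file names and columns here are hypothetical:

```python
from datasets import load_dataset

# Hypothetical jsonlines files: one JSON object per line, e.g. {"file": "a.wav", "text": "..."}.
data_files = {"train": "train.jsonl", "validation": "validation.jsonl"}
dataset = load_dataset("json", data_files=data_files)
print(dataset["train"][0])
```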
@@ -294,7 +294,7 @@ def main():
        for split in raw_datasets.keys():
            raw_datasets[split] = raw_datasets[split].select(range(100))
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-   # https://huggingface.co/docs/datasets/loading_datasets.html.
+   # https://huggingface.co/docs/datasets/loading_datasets.
    if raw_datasets["train"] is not None:
        column_names = raw_datasets["train"].column_names
...
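The `select(range(100))` call in this hunk is the standard way to shrink every split for a quick debugging run; a minimal sketch, where the dataset name is only an example:

```python
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")  # example dataset
# Truncate every split to its first 100 examples for a fast smoke test.
for split in raw_datasets.keys():
    raw_datasets[split] = raw_datasets[split].select(range(100))
print({split: ds.num_rows for split, ds in raw_datasets.items()})
```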
@@ -278,7 +278,7 @@ def main():
        extension = "text"
    datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-   # https://huggingface.co/docs/datasets/loading_datasets.html.
+   # https://huggingface.co/docs/datasets/loading_datasets.
    # Load pretrained model and tokenizer
    #
...
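Here `extension` resolves to the `"text"` loader, which reads one example per line into a single `text` column; a self-contained sketch with placeholder file names:

```python
from datasets import load_dataset

data_files = {"train": "train.txt", "validation": "valid.txt"}  # placeholder paths
datasets = load_dataset("text", data_files=data_files)
print(datasets["train"].column_names)  # ['text']
```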
@@ -524,7 +524,7 @@ if __name__ == "__main__":
        extension = "text"
    datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-   # https://huggingface.co/docs/datasets/loading_datasets.html.
+   # https://huggingface.co/docs/datasets/loading_datasets.
    # Load pretrained model and tokenizer
...
@@ -272,7 +272,7 @@ if args.dataset_name is not None:
else:
    raise ValueError("Evaluation requires a dataset name")
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-# https://huggingface.co/docs/datasets/loading_datasets.html.
+# https://huggingface.co/docs/datasets/loading_datasets.
# Preprocessing the datasets.
# Preprocessing is slightly different for training and evaluation.
...
@@ -308,7 +308,7 @@ def main():
        extension = data_args.test_file.split(".")[-1]
    raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-   # https://huggingface.co/docs/datasets/loading_datasets.html.
+   # https://huggingface.co/docs/datasets/loading_datasets.
    # set default quantization parameters before building model
    quant_trainer.set_default_quantizers(quant_trainer_args)
...
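The `field="data"` argument in this hunk makes the JSON loader read examples from a nested top-level key, as in SQuAD-style files; a sketch with a made-up file name:

```python
from datasets import load_dataset

# Assumes squad_like.json has the shape {"version": ..., "data": [{...}, {...}, ...]}.
raw_datasets = load_dataset(
    "json",
    data_files={"validation": "squad_like.json"},
    field="data",  # read the example list under the top-level "data" key
)
```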
@@ -65,7 +65,7 @@ def main(
        "csv", data_files=[rag_example_args.csv_path], split="train", delimiter="\t", column_names=["title", "text"]
    )
-   # More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files
+   # More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets?highlight=csv#csv-files
    # Then split the documents into passages of 100 words
    dataset = dataset.map(split_documents, batched=True, num_proc=processing_args.num_proc)
...
@@ -73,7 +73,7 @@ def main(
        "csv", data_files=[rag_example_args.csv_path], split="train", delimiter="\t", column_names=["title", "text"]
    )
-   # More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files
+   # More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets?highlight=csv#csv-files
    # Then split the documents into passages of 100 words
    dataset = dataset.map(split_documents, batched=True, num_proc=processing_args.num_proc)
...
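Both of these hunks load a tab-separated two-column file and then chunk it with `Dataset.map`; a minimal sketch of the pattern, with a placeholder file name and a simplified stand-in for the script's `split_documents`:

```python
from datasets import load_dataset

dataset = load_dataset(
    "csv", data_files=["my_knowledge.csv"], split="train", delimiter="\t", column_names=["title", "text"]
)

def split_documents(batch):
    """Simplified stand-in: split each document into passages of 100 words."""
    titles, texts = [], []
    for title, text in zip(batch["title"], batch["text"]):
        words = text.split()
        for i in range(0, len(words), 100):
            titles.append(title)
            texts.append(" ".join(words[i : i + 100]))
    return {"title": titles, "text": texts}

# batched=True lets one input row expand into several output rows.
dataset = dataset.map(split_documents, batched=True, num_proc=4)
```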
@@ -112,7 +112,7 @@ Hugging Face Hub for additional audio data, for example by selecting the categor
["speech-processing"](https://huggingface.co/datasets?task_categories=task_categories:speech-processing&sort=downloads).
All datasets that are available on the Hub can be downloaded via the 🤗 Datasets library in the same way Common Voice is downloaded.
If one wants to combine multiple datasets for training, it might make sense to take a look at
-the [`interleave_datasets`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=interleave#datasets.interleave_datasets) function.
+the [`interleave_datasets`](https://huggingface.co/docs/datasets/package_reference/main_classes?highlight=interleave#datasets.interleave_datasets) function.
In addition, participants can also make use of their audio data. Here, please make sure that you **are allowed to use the audio data**. E.g., if audio data
is taken from media platforms, such as YouTube, it should be verified that the media platform and the owner of the data have given her/his approval to use the audio
...
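A tiny, runnable illustration of the `interleave_datasets` function mentioned above; without `probabilities` it simply alternates between the sources:

```python
from datasets import Dataset, interleave_datasets

ds_a = Dataset.from_dict({"text": ["a1", "a2", "a3"]})
ds_b = Dataset.from_dict({"text": ["b1", "b2", "b3"]})

# Alternates examples; pass probabilities=[...] and a seed for a sampled mix instead.
mixed = interleave_datasets([ds_a, ds_b])
print(mixed["text"])  # ['a1', 'b1', 'a2', 'b2', 'a3', 'b3']
```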
@@ -277,7 +277,7 @@ def main():
        # Loading a dataset from local json files
        raw_datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset at
-   # https://huggingface.co/docs/datasets/loading_datasets.html.
+   # https://huggingface.co/docs/datasets/loading_datasets.
    # Labels
    label_list = raw_datasets["train"].features["label"].names
...
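The `features["label"].names` access in this hunk works because the column is a `ClassLabel`; a small self-contained sketch:

```python
from datasets import ClassLabel, Dataset, Features, Value

features = Features({"text": Value("string"), "label": ClassLabel(names=["negative", "positive"])})
ds = Dataset.from_dict({"text": ["great", "awful"], "label": [1, 0]}, features=features)

label_list = ds.features["label"].names
print(label_list)  # ['negative', 'positive']
```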
@@ -317,7 +317,7 @@ def main():
    datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-   # https://huggingface.co/docs/datasets/loading_datasets.html.
+   # https://huggingface.co/docs/datasets/loading_datasets.
    # Load pretrained model and tokenizer
    #
...
@@ -315,7 +315,7 @@ def main():
    datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-   # https://huggingface.co/docs/datasets/loading_datasets.html.
+   # https://huggingface.co/docs/datasets/loading_datasets.
    # Load pretrained model and tokenizer
    #
...
@@ -361,7 +361,7 @@ def main():
        token=model_args.token,
    )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-   # https://huggingface.co/docs/datasets/loading_datasets.html.
+   # https://huggingface.co/docs/datasets/loading_datasets.
    # 5. Load pretrained model, tokenizer, and image processor
    if model_args.tokenizer_name:
...
@@ -316,7 +316,7 @@ def main():
        task="image-classification",
    )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-   # https://huggingface.co/docs/datasets/loading_datasets.html.
+   # https://huggingface.co/docs/datasets/loading_datasets.
    # Prepare label mappings.
    # We'll include these in the model's config to get human readable labels in the Inference API.
...
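The label mappings mentioned here are usually plain dicts derived from the dataset's class names and stored in the model config; a sketch, where the dataset name is just an example:

```python
from datasets import load_dataset

dataset = load_dataset("beans")  # example image-classification dataset
labels = dataset["train"].features["labels"].names

# Stored in the model config so Inference API predictions show readable labels.
label2id = {label: str(i) for i, label in enumerate(labels)}
id2label = {str(i): label for i, label in enumerate(labels)}
```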
@@ -371,7 +371,7 @@ def main():
        **dataset_args,
    )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-   # https://huggingface.co/docs/datasets/loading_datasets.html.
+   # https://huggingface.co/docs/datasets/loading_datasets.
    # endregion
    # region Load pretrained model and tokenizer
...
@@ -353,7 +353,7 @@ def main():
    )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-   # https://huggingface.co/docs/datasets/loading_datasets.html.
+   # https://huggingface.co/docs/datasets/loading_datasets.
    # endregion
    # region Load pretrained model and tokenizer
...
@@ -338,7 +338,7 @@ def main():
        token=model_args.token,
    )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-   # https://huggingface.co/docs/datasets/loading_datasets.html.
+   # https://huggingface.co/docs/datasets/loading_datasets.
    # When using your own dataset or a different dataset from swag, you will probably need to change this.
    ending_names = [f"ending{i}" for i in range(4)]
...
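For context on the hunk above: in the SWAG setup the four answer candidates live in columns `ending0` through `ending3`, alongside two sentence columns; these names are what you would change for your own data. A sketch of the column assumptions:

```python
# Column names assumed for SWAG; adapt these when using your own dataset.
ending_names = [f"ending{i}" for i in range(4)]  # "ending0" ... "ending3"
context_name = "sent1"
question_header_name = "sent2"
```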
@@ -352,7 +352,7 @@ def main():
        token=model_args.token,
    )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-   # https://huggingface.co/docs/datasets/loading_datasets.html.
+   # https://huggingface.co/docs/datasets/loading_datasets.
    # endregion
    # region Load pretrained model and tokenizer
...
@@ -401,7 +401,7 @@ def main():
        token=model_args.token,
    )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-   # https://huggingface.co/docs/datasets/loading_datasets.html.
+   # https://huggingface.co/docs/datasets/loading_datasets.
    # endregion
    # region Load model config and tokenizer
...
@@ -271,7 +271,7 @@ def main():
        token=model_args.token,
    )
    # See more about loading any type of standard or custom dataset at
-   # https://huggingface.co/docs/datasets/loading_datasets.html.
+   # https://huggingface.co/docs/datasets/loading_datasets.
    is_regression = data_args.task_name == "stsb"
    if not is_regression:
...