Unverified Commit 721ee783 authored by Klaus Hipp, committed by GitHub

[Docs] Fix spelling and grammar mistakes (#28825)

* Fix typos and grammar mistakes in docs and examples

* Fix typos in docstrings and comments

* Fix spelling of `tokenizer` in model tests

* Remove erroneous spaces in decorators

* Remove extra spaces in Markdown link texts
parent 2418c64a
@@ -154,7 +154,7 @@ def run_generate():
     parser.add_argument("--src_lang", type=str, default=None, required=False)
     parser.add_argument("--tgt_lang", type=str, default=None, required=False)
     parser.add_argument(
-        "--prefix", type=str, required=False, default=None, help="will be added to the begininng of src examples"
+        "--prefix", type=str, required=False, default=None, help="will be added to the beginning of src examples"
     )
     parser.add_argument("--fp16", action="store_true")
     parser.add_argument("--debug", action="store_true")
...
@@ -107,7 +107,7 @@ def run_generate(verbose=True):
     parser.add_argument("--score_path", type=str, required=False, default="metrics.json", help="where to save metrics")
     parser.add_argument("--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.")
     parser.add_argument(
-        "--prefix", type=str, required=False, default=None, help="will be added to the begininng of src examples"
+        "--prefix", type=str, required=False, default=None, help="will be added to the beginning of src examples"
    )
     parser.add_argument("--task", type=str, default="summarization", help="used for task_specific_params + metrics")
     parser.add_argument("--bs", type=int, default=8, required=False, help="batch size")
...
@@ -65,7 +65,7 @@ class Seq2SeqTrainer(Trainer):
         if self.args.label_smoothing != 0 or (self.data_args is not None and self.data_args.ignore_pad_token_for_loss):
             assert self.config.pad_token_id is not None, (
-                "Make sure that `config.pad_token_id` is correcly defined when ignoring `pad_token` for loss"
+                "Make sure that `config.pad_token_id` is correctly defined when ignoring `pad_token` for loss"
                 " calculation or doing label smoothing."
             )
...
@@ -31,7 +31,7 @@ class Seq2SeqTrainingArguments(TrainingArguments):
         label_smoothing (:obj:`float`, `optional`, defaults to 0):
             The label smoothing epsilon to apply (if not zero).
         sortish_sampler (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to SortishSamler or not. It sorts the inputs according to lengths in-order to minimizing the padding size.
+            Whether to SortishSampler or not. It sorts the inputs according to lengths in-order to minimizing the padding size.
         predict_with_generate (:obj:`bool`, `optional`, defaults to :obj:`False`):
             Whether to use generate to calculate generative metrics (ROUGE, BLEU).
     """

@@ -39,7 +39,7 @@ class Seq2SeqTrainingArguments(TrainingArguments):
     label_smoothing: Optional[float] = field(
         default=0.0, metadata={"help": "The label smoothing epsilon to apply (if not zero)."}
     )
-    sortish_sampler: bool = field(default=False, metadata={"help": "Whether to SortishSamler or not."})
+    sortish_sampler: bool = field(default=False, metadata={"help": "Whether to SortishSampler or not."})
     predict_with_generate: bool = field(
         default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."}
     )
...
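For context, a minimal usage sketch of the arguments dataclass touched above. The import path is an assumption (in the example scripts the class is defined alongside the `Seq2SeqTrainer`); the field names are the ones shown in the diff.

```python
# Hedged sketch only: the module name below is hypothetical, not taken from the diff.
from seq2seq_training_args import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./outputs",      # inherited from TrainingArguments
    label_smoothing=0.1,         # label smoothing epsilon; 0.0 disables it
    sortish_sampler=True,        # sort inputs by length to reduce padding
    predict_with_generate=True,  # compute ROUGE/BLEU via generate()
)
```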
@@ -289,7 +289,7 @@ def main():
     )
     logger.info(f"Training/evaluation parameters {training_args}")
-    # 3. Detecting last checkpoint and eventualy continue from last checkpoint
+    # 3. Detecting last checkpoint and eventually continue from last checkpoint
     last_checkpoint = None
     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
         last_checkpoint = get_last_checkpoint(training_args.output_dir)

@@ -528,7 +528,7 @@ def main():
     # Transform images on the fly as doing it on the whole dataset takes too much time.
     test_dataset.set_transform(transform_images)
-    # 8. Initalize our trainer
+    # 8. Initialize our trainer
     trainer = Trainer(
         model=model,
         args=training_args,
...
@@ -114,10 +114,10 @@ from datasets import load_dataset
 # example 1: local folder
 dataset = load_dataset("imagefolder", data_dir="path_to_your_folder")
-# example 2: local files (suppoted formats are tar, gzip, zip, xz, rar, zstd)
+# example 2: local files (supported formats are tar, gzip, zip, xz, rar, zstd)
 dataset = load_dataset("imagefolder", data_files="path_to_zip_file")
-# example 3: remote files (suppoted formats are tar, gzip, zip, xz, rar, zstd)
+# example 3: remote files (supported formats are tar, gzip, zip, xz, rar, zstd)
 dataset = load_dataset("imagefolder", data_files="https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip")
 # example 4: providing several splits
...
@@ -404,7 +404,7 @@ def main():
         # Set the validation transforms
         dataset["validation"].set_transform(val_transforms)
-    # Initalize our trainer
+    # Initialize our trainer
     trainer = Trainer(
         model=model,
         args=training_args,
...
@@ -25,7 +25,7 @@ NOTE: If you encounter problems/have suggestions for improvement, open an issue
 ## SimMIM
-The `run_mim.py` script can be used to pre-train any Transformer-based vision model in the library (concretly, any model supported by the `AutoModelForMaskedImageModeling` API) for masked image modeling as proposed in [SimMIM: A Simple Framework for Masked Image Modeling](https://arxiv.org/abs/2111.09886) using PyTorch.
+The `run_mim.py` script can be used to pre-train any Transformer-based vision model in the library (concretely, any model supported by the `AutoModelForMaskedImageModeling` API) for masked image modeling as proposed in [SimMIM: A Simple Framework for Masked Image Modeling](https://arxiv.org/abs/2111.09886) using PyTorch.
 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/simmim_architecture.jpg"
 alt="drawing" width="300"/>
...
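As a side note, the `AutoModelForMaskedImageModeling` API mentioned in this hunk can be exercised directly; a minimal sketch, assuming a Swin checkpoint (the checkpoint name is illustrative, not taken from the script):

```python
from transformers import AutoConfig, AutoModelForMaskedImageModeling

# Build a randomly initialised model from an existing config, as is done for pre-training.
config = AutoConfig.from_pretrained("microsoft/swin-base-patch4-window7-224")
model = AutoModelForMaskedImageModeling.from_config(config)
print(model.__class__.__name__)  # e.g. SwinForMaskedImageModeling
```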
@@ -90,7 +90,7 @@ def parse_args():
         default=128,
         help=(
             "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
-            " sequences shorter will be padded if `--pad_to_max_lengh` is passed."
+            " sequences shorter will be padded if `--pad_to_max_length` is passed."
         ),
     )
     parser.add_argument(
...
@@ -378,7 +378,7 @@ def main():
     )
     # Preprocessing the datasets.
-    # Preprocessing is slighlty different for training and evaluation.
+    # Preprocessing is slightly different for training and evaluation.
     if training_args.do_train:
         column_names = raw_datasets["train"].column_names
     elif training_args.do_eval:
...
@@ -354,7 +354,7 @@ def main():
     )
     # Preprocessing the datasets.
-    # Preprocessing is slighlty different for training and evaluation.
+    # Preprocessing is slightly different for training and evaluation.
     if training_args.do_train:
         column_names = raw_datasets["train"].column_names
     elif training_args.do_eval:
...
@@ -119,7 +119,7 @@ def parse_args():
         default=384,
         help=(
             "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
-            " sequences shorter will be padded if `--pad_to_max_lengh` is passed."
+            " sequences shorter will be padded if `--pad_to_max_length` is passed."
         ),
     )
     parser.add_argument(

@@ -385,7 +385,7 @@ def main():
     )
     # Preprocessing the datasets.
-    # Preprocessing is slighlty different for training and evaluation.
+    # Preprocessing is slightly different for training and evaluation.
     column_names = raw_datasets["train"].column_names
     question_column_name = "question" if "question" in column_names else column_names[0]

@@ -508,7 +508,7 @@ def main():
             raise ValueError("--do_train requires a train dataset")
         train_dataset = raw_datasets["train"]
         if args.max_train_samples is not None:
-            # We will select sample from whole data if agument is specified
+            # We will select sample from whole data if argument is specified
             train_dataset = train_dataset.select(range(args.max_train_samples))
         # Create train feature from dataset
         with accelerator.main_process_first():

@@ -877,7 +877,7 @@ def main():
                     commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
                 )
-    # intialize all lists to collect the batches
+    # initialize all lists to collect the batches
     all_start_top_log_probs = []
     all_start_top_index = []
     all_end_top_log_probs = []

@@ -936,7 +936,7 @@ def main():
     logger.info(f"Evaluation metrics: {eval_metric}")
     if args.do_predict:
-        # intialize all lists to collect the batches
+        # initialize all lists to collect the batches
         all_start_top_log_probs = []
         all_start_top_index = []
...
@@ -123,7 +123,7 @@ def parse_args():
         default=384,
         help=(
             "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
-            " sequences shorter will be padded if `--pad_to_max_lengh` is passed."
+            " sequences shorter will be padded if `--pad_to_max_length` is passed."
         ),
     )
     parser.add_argument(

@@ -460,7 +460,7 @@ def main():
         model = AutoModelForQuestionAnswering.from_config(config, trust_remote_code=args.trust_remote_code)
     # Preprocessing the datasets.
-    # Preprocessing is slighlty different for training and evaluation.
+    # Preprocessing is slightly different for training and evaluation.
     column_names = raw_datasets["train"].column_names

@@ -561,7 +561,7 @@ def main():
             raise ValueError("--do_train requires a train dataset")
         train_dataset = raw_datasets["train"]
         if args.max_train_samples is not None:
-            # We will select sample from whole data if agument is specified
+            # We will select sample from whole data if argument is specified
             train_dataset = train_dataset.select(range(args.max_train_samples))
         # Create train feature from dataset
...
@@ -559,7 +559,7 @@ def main():
             raise ValueError("--do_train requires a train dataset")
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
-            # We will select sample from whole data if agument is specified
+            # We will select sample from whole data if argument is specified
             max_train_samples = min(len(train_dataset), data_args.max_train_samples)
             train_dataset = train_dataset.select(range(max_train_samples))
         # Create train feature from dataset
...
@@ -503,7 +503,7 @@ def main():
         # Set the validation transforms
         dataset["validation"].set_transform(preprocess_val)
-    # Initalize our trainer
+    # Initialize our trainer
     trainer = Trainer(
         model=model,
         args=training_args,
...
@@ -446,7 +446,7 @@ A very common use case is to leverage a pretrained speech encoder model,
 By pairing a pretrained speech model with a pretrained text model, the warm-started model has prior knowledge of both the source audio and target text domains. However, the cross-attention weights between the encoder and decoder are randomly initialised. Thus, the model requires fine-tuning to learn the cross-attention weights and align the encoder mapping with that of the decoder. We can perform this very fine-tuning procedure using the example script.
-As an example, let's instantiate a *Wav2Vec2-2-Bart* model with the `SpeechEnocderDecoderModel` framework. First create an empty repo on `hf.co`:
+As an example, let's instantiate a *Wav2Vec2-2-Bart* model with the `SpeechEncoderDecoderModel` framework. First create an empty repo on `hf.co`:
 ```bash
 huggingface-cli repo create wav2vec2-2-bart-base

@@ -506,7 +506,7 @@ Having warm-started the speech-encoder-decoder model under `<your-user-name>/wav
 In the script [`run_speech_recognition_seq2seq`], we load the warm-started model,
 feature extractor, and tokenizer, process a speech recognition dataset,
 and subsequently make use of the [`Seq2SeqTrainer`](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainer) to train our system.
-Note that it is important to align the target transcriptions with the decoder's vocabulary. For example, the [`Librispeech`](https://huggingface.co/datasets/librispeech_asr) dataset only contains captilized letters in the transcriptions,
+Note that it is important to align the target transcriptions with the decoder's vocabulary. For example, the [`Librispeech`](https://huggingface.co/datasets/librispeech_asr) dataset only contains capitalized letters in the transcriptions,
 whereas BART was pretrained mostly on normalized text. Thus, it is recommended to add the argument
 `--do_lower_case` to the fine-tuning script when using a warm-started `SpeechEncoderDecoderModel`.
 The model is fine-tuned on the standard cross-entropy language modeling
...
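For orientation, the warm-starting step this hunk describes looks roughly like the following; a hedged sketch, with illustrative checkpoint names rather than the exact ones used by the example scripts:

```python
from transformers import SpeechEncoderDecoderModel

# Pair a pretrained speech encoder with a pretrained text decoder; the cross-attention
# weights connecting them are randomly initialised and must be learned during fine-tuning.
model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
    "facebook/wav2vec2-base", "facebook/bart-base"
)
model.save_pretrained("wav2vec2-2-bart-base")  # local directory name mirroring the repo created above
```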
@@ -146,7 +146,7 @@ class DataTrainingArguments:
                 " should be trained on in ISO 693-3 code, e.g. `tur` for Turkish"
                 " Wav2Vec2's MMS ISO codes can be looked up here: https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html"
                 " If you are not training the adapter layers on a language, simply choose"
-                " another accronym that fits your data."
+                " another acronym that fits your data."
             )
         },
     )
...
@@ -129,7 +129,7 @@ python run_classification.py \
     --num_train_epochs 15 \
     --output_dir /tmp/${dataset}_${subset}/
 ```
-It results in a Micro F1 score of around 0.82 without any text and label filtering. Note that you have to explictly remove the "unused" split from the dataset, since it is not used for classification.
+It results in a Micro F1 score of around 0.82 without any text and label filtering. Note that you have to explicitly remove the "unused" split from the dataset, since it is not used for classification.
 ### Mixed precision training
...
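If it helps, dropping a split at the `datasets` level can be done as sketched below; the dataset name is a placeholder, not the one referenced in the hunk above:

```python
from datasets import load_dataset

raw_datasets = load_dataset("dataset_name")  # placeholder; returns a DatasetDict of splits
if "unused" in raw_datasets:
    # DatasetDict behaves like a dict keyed by split name, so the extra split can simply be removed.
    raw_datasets.pop("unused")
```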
@@ -83,7 +83,7 @@ class DataTrainingArguments:
         metadata={
             "help": (
                 "The name of the text column in the input dataset or a CSV/JSON file. "
-                'If not specified, will use the "sentence" column for single/multi-label classifcation task.'
+                'If not specified, will use the "sentence" column for single/multi-label classification task.'
             )
         },
     )

@@ -121,7 +121,7 @@ class DataTrainingArguments:
         metadata={
             "help": (
                 "The name of the label column in the input dataset or a CSV/JSON file. "
-                'If not specified, will use the "label" column for single/multi-label classifcation task'
+                'If not specified, will use the "label" column for single/multi-label classification task'
             )
         },
     )

@@ -260,7 +260,7 @@ class ModelArguments:
 def get_label_list(raw_dataset, split="train") -> List[str]:
-    """Get the list of labels from a mutli-label dataset"""
+    """Get the list of labels from a multi-label dataset"""
     if isinstance(raw_dataset[split]["label"][0], list):
         label_list = [label for sample in raw_dataset[split]["label"] for label in sample]

@@ -343,7 +343,7 @@ def main():
     # Get the datasets: you can either provide your own CSV/JSON training and evaluation files, or specify a dataset name
     # to load from huggingface/datasets. In ether case, you can specify a the key of the column(s) containing the text and
-    # the key of the column containing the label. If multiple columns are specified for the text, they will be joined togather
+    # the key of the column containing the label. If multiple columns are specified for the text, they will be joined together
     # for the actual text value.
     # In distributed training, the load_dataset function guarantee that only one local process can concurrently
     # download the dataset.
...
@@ -18,7 +18,7 @@ limitations under the License.
 Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-generation/run_generation.py).
-Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, GPTJ, Transformer-XL, XLNet, CTRL, BLOOM, LLAMA, OPT.
+Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, GPT-J, Transformer-XL, XLNet, CTRL, BLOOM, LLAMA, OPT.
 A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you
 can try out the different models available in the library.
...
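For reference, the kind of conditional generation the README in this last hunk describes can be reproduced in a few lines; a hedged sketch, with checkpoint and generation settings chosen purely for illustration:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Condition generation on a prompt and sample a short continuation.
inputs = tokenizer("Once upon a time", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=50, do_sample=True, top_p=0.9)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```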