Unverified Commit 721ee783 authored by Klaus Hipp, committed by GitHub

[Docs] Fix spelling and grammar mistakes (#28825)

* Fix typos and grammar mistakes in docs and examples

* Fix typos in docstrings and comments

* Fix spelling of `tokenizer` in model tests

* Remove erroneous spaces in decorators

* Remove extra spaces in Markdown link texts
parent 2418c64a
......@@ -175,7 +175,7 @@ def parse_args():
default=128,
help=(
"The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
" sequences shorter will be padded if `--pad_to_max_lengh` is passed."
" sequences shorter will be padded if `--pad_to_max_length` is passed."
),
)
parser.add_argument(
......
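For context on the flags touched in the hunk above: a minimal sketch, assuming a standard Hugging Face tokenizer, of how `--max_seq_length` and `--pad_to_max_length` usually map onto the tokenization call; the model name and input text are illustrative, not taken from the script.

```python
# Illustrative only: the model name and input text are assumptions, not from the script.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# With --pad_to_max_length: pad/truncate every sequence to max_seq_length.
fixed = tokenizer("a short example", truncation=True, padding="max_length", max_length=128)

# Without it: only truncate; shorter sequences are padded dynamically per batch instead.
dynamic = tokenizer("a short example", truncation=True, max_length=128)

print(len(fixed["input_ids"]), len(dynamic["input_ids"]))  # 128 vs. the actual token count
```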
......@@ -148,7 +148,7 @@ def train(args, train_dataset, model, tokenizer):
steps_trained_in_current_epoch = 0
# Check if continuing training from a checkpoint
if os.path.exists(args.model_name_or_path):
# set global_step to gobal_step of last saved checkpoint from model path
# set global_step to global_step of last saved checkpoint from model path
global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
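The hunk above recovers the optimizer step from the checkpoint directory name. A small worked example of that arithmetic, with an assumed checkpoint path and an assumed number of update steps per epoch:

```python
# Assumed values for illustration; not taken from an actual training run.
model_name_or_path = "output/checkpoint-1500"
steps_per_epoch = 400  # len(train_dataloader) // gradient_accumulation_steps (assumed)

# Same parsing as in the script: the step count is the suffix after the last "-".
global_step = int(model_name_or_path.split("-")[-1].split("/")[0])  # 1500

epochs_trained = global_step // steps_per_epoch                  # 3 completed epochs
steps_trained_in_current_epoch = global_step % steps_per_epoch   # 300 steps into the next epoch

print(global_step, epochs_trained, steps_trained_in_current_epoch)  # 1500 3 300
```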
......@@ -169,7 +169,7 @@ def train(args, train_dataset, model, tokenizer):
desc="Epoch",
disable=args.local_rank not in [-1, 0],
)
set_seed(args) # Added here for reproductibility
set_seed(args) # Added here for reproducibility
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):
......@@ -614,7 +614,7 @@ def main():
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
torch.distributed.init_process_group(backend="nccl")
......
......@@ -60,7 +60,7 @@ def is_autogenerated(example, scan_width=5):
def is_config_or_test(example, scan_width=5, coeff=0.05):
"""Check if file is a configuration file or a unit test by :
1- looking for keywords in the first few lines of the file.
2- counting number of occurence of the words 'config' and 'test' with respect to number of lines.
2- counting number of occurrence of the words 'config' and 'test' with respect to number of lines.
"""
keywords = ["unit tests", "test file", "configuration file"]
......
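A simplified sketch of the heuristic the docstring above describes (a keyword scan over the first few lines, then a 'config'/'test' frequency check against `coeff`); it is not the exact implementation from the preprocessing script.

```python
def looks_like_config_or_test(content: str, scan_width: int = 5, coeff: float = 0.05) -> bool:
    """Simplified re-implementation of the documented heuristic; details may differ from the script."""
    keywords = ["unit tests", "test file", "configuration file"]
    lines = content.lower().splitlines()

    # 1) keyword match in the first few lines
    if any(keyword in line for line in lines[:scan_width] for keyword in keywords):
        return True

    # 2) ratio of 'config'/'test' occurrences to the number of lines
    n_lines = max(len(lines), 1)
    occurrences = sum(line.count("config") + line.count("test") for line in lines)
    return occurrences / n_lines >= coeff
```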
......@@ -162,7 +162,7 @@ def train(args, train_dataset, model, tokenizer, train_highway=False):
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
set_seed(args) # Added here for reproducibility (even between python 2 and 3)
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):
......@@ -491,7 +491,7 @@ def main():
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
......@@ -566,7 +566,7 @@ def main():
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
torch.distributed.init_process_group(backend="nccl")
......
......@@ -165,7 +165,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
# Check if continuing training from a checkpoint
if os.path.exists(args.model_name_or_path):
try:
# set global_step to gobal_step of last saved checkpoint from model path
# set global_step to global_step of last saved checkpoint from model path
checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
global_step = int(checkpoint_suffix)
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
......@@ -183,7 +183,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
train_iterator = trange(
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
)
# Added here for reproductibility
# Added here for reproducibility
set_seed(args)
for _ in train_iterator:
......@@ -731,7 +731,7 @@ def main():
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
torch.distributed.init_process_group(backend="nccl")
......
......@@ -134,7 +134,7 @@ def train(args, train_dataset, model, tokenizer, criterion):
best_f1, n_no_improve = 0, 0
model.zero_grad()
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
set_seed(args) # Added here for reproductibility
set_seed(args) # Added here for reproducibility
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):
......@@ -384,7 +384,7 @@ def main():
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
......@@ -460,7 +460,7 @@ def main():
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
torch.distributed.init_process_group(backend="nccl")
......
......@@ -275,7 +275,7 @@ else:
# https://huggingface.co/docs/datasets/loading_datasets.
# Preprocessing the datasets.
# Preprocessing is slighlty different for training and evaluation.
# Preprocessing is slightly different for training and evaluation.
column_names = raw_datasets["validation"].column_names
......
......@@ -349,7 +349,7 @@ def main():
)
# Preprocessing the datasets.
# Preprocessing is slighlty different for training and evaluation.
# Preprocessing is slightly different for training and evaluation.
if training_args.do_train or model_args.do_calib:
column_names = raw_datasets["train"].column_names
elif training_args.do_eval or model_args.save_onnx:
......@@ -448,7 +448,7 @@ def main():
raise ValueError("--do_train requires a train dataset")
train_dataset = raw_datasets["train"]
if data_args.max_train_samples is not None:
# We will select sample from whole data if agument is specified
# We will select sample from whole data if argument is specified
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
train_dataset = train_dataset.select(range(max_train_samples))
# Create train feature from dataset
......
......@@ -239,7 +239,7 @@ For example,
./save_len_file.py Helsinki-NLP/opus-mt-en-ro wmt_en_ro
./dynamic_bs_example.sh --max_tokens_per_batch=2000 --output_dir benchmark_dynamic_bs
```
splits `wmt_en_ro/train` into 11,197 uneven lengthed batches and can finish 1 epoch in 8 minutes on a v100.
splits `wmt_en_ro/train` into 11,197 uneven length batches and can finish 1 epoch in 8 minutes on a v100.
For comparison,
```bash
......
......@@ -94,7 +94,7 @@ def run_generate(verbose=True):
parser.add_argument("--score_path", type=str, required=False, default="metrics.json", help="where to save metrics")
parser.add_argument("--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.")
parser.add_argument(
"--prefix", type=str, required=False, default=None, help="will be added to the begininng of src examples"
"--prefix", type=str, required=False, default=None, help="will be added to the beginning of src examples"
)
parser.add_argument("--task", type=str, default="summarization", help="used for task_specific_params + metrics")
parser.add_argument("--bs", type=int, default=8, required=False, help="batch size")
......
......@@ -69,12 +69,12 @@ class ModelArguments:
hidden_dropout: Optional[float] = field(
default=0.1,
metadata={
"help": "The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler."
"help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
},
)
feat_proj_dropout: Optional[float] = field(
default=0.1,
metadata={"help": "The dropout probabilitiy for all 1D convolutional layers in feature extractor."},
metadata={"help": "The dropout probability for all 1D convolutional layers in feature extractor."},
)
mask_time_prob: Optional[float] = field(
default=0.05,
......
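The `metadata={"help": ...}` strings fixed above end up as command-line help text. A hedged sketch of that mechanism using `HfArgumentParser`; the dataclass below is a stand-in, not the real `ModelArguments` from the example script.

```python
from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser


@dataclass
class DemoArguments:
    # Stand-in field mirroring the style of the example script's ModelArguments.
    hidden_dropout: Optional[float] = field(
        default=0.1,
        metadata={"help": "The dropout probability for all fully connected layers."},
    )


parser = HfArgumentParser(DemoArguments)
(demo_args,) = parser.parse_args_into_dataclasses(args=["--hidden_dropout", "0.2"])
print(demo_args.hidden_dropout)  # 0.2
```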
......@@ -311,7 +311,7 @@ def main():
# Log on each process the small summary:
logger.info(f"Training/evaluation parameters {training_args}")
# 3. Detecting last checkpoint and eventualy continue from last checkpoint
# 3. Detecting last checkpoint and eventually continue from last checkpoint
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
......
......@@ -107,10 +107,10 @@ from datasets import load_dataset
# example 1: local folder
dataset = load_dataset("imagefolder", data_dir="path_to_your_folder")
# example 2: local files (suppoted formats are tar, gzip, zip, xz, rar, zstd)
# example 2: local files (supported formats are tar, gzip, zip, xz, rar, zstd)
dataset = load_dataset("imagefolder", data_files="path_to_zip_file")
# example 3: remote files (suppoted formats are tar, gzip, zip, xz, rar, zstd)
# example 3: remote files (supported formats are tar, gzip, zip, xz, rar, zstd)
dataset = load_dataset("imagefolder", data_files="https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip")
# example 4: providing several splits
......
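A short usage note on the `imagefolder` loader shown above: class labels are typically inferred from the subfolder names. A minimal sketch, with a placeholder path:

```python
from datasets import load_dataset

# Placeholder path; reuses the folder layout described in the README above.
dataset = load_dataset("imagefolder", data_dir="path_to_your_folder")

print(dataset["train"].features)     # typically an "image" column plus a "label" class column
print(dataset["train"][0]["label"])  # integer class id inferred from the subfolder name
```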
......@@ -109,7 +109,7 @@ def main(args):
tokenizer.decoder = decoders.Metaspace()
if args.export_to_hub:
logger.info("Exporting the trained tokenzier to Hub.")
logger.info("Exporting the trained tokenizer to Hub.")
new_tokenizer = AlbertTokenizerFast(tokenizer_object=tokenizer)
new_tokenizer.push_to_hub("unigram-tokenizer-dataset")
......
......@@ -512,7 +512,7 @@ def main():
raise ValueError("--do_train requires a train dataset")
train_dataset = datasets["train"]
if data_args.max_train_samples is not None:
# We will select sample from whole data if agument is specified
# We will select sample from whole data if argument is specified
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
train_dataset = train_dataset.select(range(max_train_samples))
# Create train feature from dataset
......
......@@ -211,7 +211,7 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
self.input_shape = input_shape
self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None
# To check if the model was intialized automatically.
# To check if the model was initialized automatically.
self._is_initialized = _do_init
if _do_init:
......
......@@ -3736,7 +3736,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
else:
hf_quantizer.create_quantized_param(model, value, key, "cpu", state_dict)
# retrieve unintialized modules and initialize before maybe overriding that with the pretrained weights.
# retrieve uninitialized modules and initialize before maybe overriding that with the pretrained weights.
if _fast_init:
if not ignore_mismatched_sizes:
if remove_prefix_from_model:
......
......@@ -58,7 +58,7 @@ class BigBirdConfig(PretrainedConfig):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (`int`, *optional*, defaults to 4096):
......
......@@ -53,7 +53,7 @@ class BioGptConfig(PretrainedConfig):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (`int`, *optional*, defaults to 1024):
......
......@@ -50,7 +50,7 @@ class CanineConfig(PretrainedConfig):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probabilitiy for all fully connected layers in the embeddings, encoders, and pooler.
The dropout probability for all fully connected layers in the embeddings, encoders, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (`int`, *optional*, defaults to 16384):
......
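The `hidden_dropout_prob` docstrings corrected above document a constructor argument; a minimal sketch of overriding its documented default, with illustrative values:

```python
from transformers import BigBirdConfig

# Illustrative values; 0.1 is the documented default for both dropout probabilities.
config = BigBirdConfig(hidden_dropout_prob=0.2, attention_probs_dropout_prob=0.2)
print(config.hidden_dropout_prob)  # 0.2
```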