Unverified Commit 721ee783 authored by Klaus Hipp, committed by GitHub

[Docs] Fix spelling and grammar mistakes (#28825)

* Fix typos and grammar mistakes in docs and examples

* Fix typos in docstrings and comments

* Fix spelling of `tokenizer` in model tests

* Remove erroneous spaces in decorators

* Remove extra spaces in Markdown link texts
parent 2418c64a
@@ -175,7 +175,7 @@ def parse_args():
         default=128,
         help=(
             "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
-            " sequences shorter will be padded if `--pad_to_max_lengh` is passed."
+            " sequences shorter will be padded if `--pad_to_max_length` is passed."
         ),
     )
     parser.add_argument(
...
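As a hedged aside (not part of this commit): the corrected help text above refers to a `--pad_to_max_length` flag, and the no_trainer examples typically translate such a flag into a tokenizer padding strategy. The sketch below is illustrative only; argument names and defaults are assumptions.

```python
# Hypothetical sketch: how --max_length and --pad_to_max_length usually interact.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--max_length", type=int, default=128)
parser.add_argument(
    "--pad_to_max_length",
    action="store_true",
    help="Pad all samples to max_length; otherwise pad dynamically per batch.",
)
args = parser.parse_args([])  # use defaults for this sketch

# The tokenization step then typically maps the flag onto a padding strategy:
padding = "max_length" if args.pad_to_max_length else False
print(f"truncate to {args.max_length}, padding strategy: {padding!r}")
```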
@@ -148,7 +148,7 @@ def train(args, train_dataset, model, tokenizer):
     steps_trained_in_current_epoch = 0
     # Check if continuing training from a checkpoint
     if os.path.exists(args.model_name_or_path):
-        # set global_step to gobal_step of last saved checkpoint from model path
+        # set global_step to global_step of last saved checkpoint from model path
         global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
         epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
         steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
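A worked example of the resume arithmetic in the hunk above, using a hypothetical checkpoint path and dataloader size (neither value appears in this diff):

```python
# Hypothetical numbers, for illustration only.
model_name_or_path = "output/checkpoint-1500/"
num_batches_per_epoch = 1000          # stands in for len(train_dataloader)
gradient_accumulation_steps = 2

global_step = int(model_name_or_path.split("-")[-1].split("/")[0])        # 1500
steps_per_epoch = num_batches_per_epoch // gradient_accumulation_steps    # 500
epochs_trained = global_step // steps_per_epoch                           # 3
steps_trained_in_current_epoch = global_step % steps_per_epoch            # 0

print(global_step, epochs_trained, steps_trained_in_current_epoch)  # 1500 3 0
```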
@@ -169,7 +169,7 @@ def train(args, train_dataset, model, tokenizer):
         desc="Epoch",
         disable=args.local_rank not in [-1, 0],
     )
-    set_seed(args)  # Added here for reproductibility
+    set_seed(args)  # Added here for reproducibility
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
@@ -614,7 +614,7 @@ def main():
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
         args.n_gpu = torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
         torch.distributed.init_process_group(backend="nccl")
...
@@ -60,7 +60,7 @@ def is_autogenerated(example, scan_width=5):
 def is_config_or_test(example, scan_width=5, coeff=0.05):
     """Check if file is a configuration file or a unit test by :
     1- looking for keywords in the first few lines of the file.
-    2- counting number of occurence of the words 'config' and 'test' with respect to number of lines.
+    2- counting number of occurrence of the words 'config' and 'test' with respect to number of lines.
     """
     keywords = ["unit tests", "test file", "configuration file"]
...
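The docstring above describes a two-step heuristic. The following is a minimal sketch of that idea, assuming `example` is a dict with a `"content"` string; it is not the repository's exact implementation, and the function name is made up.

```python
def looks_like_config_or_test(example, scan_width=5, coeff=0.05):
    """Sketch of the heuristic: keyword scan first, then occurrence counting."""
    keywords = ["unit tests", "test file", "configuration file"]
    lines = example["content"].splitlines()
    # 1- look for keywords in the first few lines of the file
    if any(kw in line.lower() for line in lines[:scan_width] for kw in keywords):
        return True
    # 2- compare occurrences of 'config' and 'test' to the number of lines
    n_lines = max(len(lines), 1)
    content = example["content"].lower()
    return content.count("config") / n_lines >= coeff or content.count("test") / n_lines >= coeff

print(looks_like_config_or_test({"content": "# unit tests for the parser\nassert True\n"}))  # True
```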
@@ -162,7 +162,7 @@ def train(args, train_dataset, model, tokenizer, train_highway=False):
     tr_loss, logging_loss = 0.0, 0.0
     model.zero_grad()
     train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
-    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
@@ -491,7 +491,7 @@ def main():
         help="Number of updates steps to accumulate before performing a backward/update pass.",
     )
     parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
     parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
     parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
     parser.add_argument(
@@ -566,7 +566,7 @@ def main():
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
         args.n_gpu = torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
         torch.distributed.init_process_group(backend="nccl")
...
@@ -165,7 +165,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
     # Check if continuing training from a checkpoint
     if os.path.exists(args.model_name_or_path):
         try:
-            # set global_step to gobal_step of last saved checkpoint from model path
+            # set global_step to global_step of last saved checkpoint from model path
             checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
             global_step = int(checkpoint_suffix)
             epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
@@ -183,7 +183,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
     train_iterator = trange(
         epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
     )
-    # Added here for reproductibility
+    # Added here for reproducibility
     set_seed(args)
     for _ in train_iterator:
@@ -731,7 +731,7 @@ def main():
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
         args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
         torch.distributed.init_process_group(backend="nccl")
...
@@ -134,7 +134,7 @@ def train(args, train_dataset, model, tokenizer, criterion):
     best_f1, n_no_improve = 0, 0
     model.zero_grad()
     train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
-    set_seed(args)  # Added here for reproductibility
+    set_seed(args)  # Added here for reproducibility
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
@@ -384,7 +384,7 @@ def main():
         help="Number of updates steps to accumulate before performing a backward/update pass.",
     )
     parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
     parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
     parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
     parser.add_argument(
@@ -460,7 +460,7 @@ def main():
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
         args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
         torch.distributed.init_process_group(backend="nccl")
...
@@ -275,7 +275,7 @@ else:
     # https://huggingface.co/docs/datasets/loading_datasets.
     # Preprocessing the datasets.
-    # Preprocessing is slighlty different for training and evaluation.
+    # Preprocessing is slightly different for training and evaluation.
     column_names = raw_datasets["validation"].column_names
...
@@ -349,7 +349,7 @@ def main():
     )
     # Preprocessing the datasets.
-    # Preprocessing is slighlty different for training and evaluation.
+    # Preprocessing is slightly different for training and evaluation.
     if training_args.do_train or model_args.do_calib:
         column_names = raw_datasets["train"].column_names
     elif training_args.do_eval or model_args.save_onnx:
@@ -448,7 +448,7 @@ def main():
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
-           # We will select sample from whole data if agument is specified
+           # We will select sample from whole data if argument is specified
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))
        # Create train feature from dataset
...
@@ -239,7 +239,7 @@ For example,
 ./save_len_file.py Helsinki-NLP/opus-mt-en-ro wmt_en_ro
 ./dynamic_bs_example.sh --max_tokens_per_batch=2000 --output_dir benchmark_dynamic_bs
 ```
-splits `wmt_en_ro/train` into 11,197 uneven lengthed batches and can finish 1 epoch in 8 minutes on a v100.
+splits `wmt_en_ro/train` into 11,197 uneven length batches and can finish 1 epoch in 8 minutes on a v100.
 For comparison,
 ```bash
...
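The README text above describes batching by a token budget (`--max_tokens_per_batch`) rather than a fixed batch size. Below is a minimal, hypothetical sketch of that idea; it is not the script used in the example, and the function name is made up.

```python
def token_budget_batches(lengths, max_tokens_per_batch=2000):
    """Group example indices so the padded size (longest * count) stays under the budget."""
    batch, longest = [], 0
    for idx, length in enumerate(lengths):
        candidate_longest = max(longest, length)
        if batch and candidate_longest * (len(batch) + 1) > max_tokens_per_batch:
            yield batch
            batch, longest = [idx], length
        else:
            batch.append(idx)
            longest = candidate_longest
    if batch:
        yield batch

print(list(token_budget_batches([12, 200, 180, 45, 900, 33], max_tokens_per_batch=1000)))
# [[0, 1, 2, 3], [4], [5]]
```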
@@ -94,7 +94,7 @@ def run_generate(verbose=True):
     parser.add_argument("--score_path", type=str, required=False, default="metrics.json", help="where to save metrics")
     parser.add_argument("--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.")
     parser.add_argument(
-        "--prefix", type=str, required=False, default=None, help="will be added to the begininng of src examples"
+        "--prefix", type=str, required=False, default=None, help="will be added to the beginning of src examples"
     )
     parser.add_argument("--task", type=str, default="summarization", help="used for task_specific_params + metrics")
     parser.add_argument("--bs", type=int, default=8, required=False, help="batch size")
...
@@ -69,12 +69,12 @@ class ModelArguments:
     hidden_dropout: Optional[float] = field(
         default=0.1,
         metadata={
-            "help": "The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler."
+            "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
         },
     )
     feat_proj_dropout: Optional[float] = field(
         default=0.1,
-        metadata={"help": "The dropout probabilitiy for all 1D convolutional layers in feature extractor."},
+        metadata={"help": "The dropout probability for all 1D convolutional layers in feature extractor."},
     )
     mask_time_prob: Optional[float] = field(
         default=0.05,
...
@@ -311,7 +311,7 @@ def main():
     # Log on each process the small summary:
     logger.info(f"Training/evaluation parameters {training_args}")
-    # 3. Detecting last checkpoint and eventualy continue from last checkpoint
+    # 3. Detecting last checkpoint and eventually continue from last checkpoint
     last_checkpoint = None
     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
         if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
...
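For context on the checkpoint detection above, here is a small runnable sketch using `get_last_checkpoint` from `transformers.trainer_utils`, the helper commonly used for this in the example scripts. The directory names are made up, and `transformers` is assumed to be installed.

```python
import os
import tempfile

from transformers.trainer_utils import get_last_checkpoint

with tempfile.TemporaryDirectory() as output_dir:
    # Empty directories are enough for the lookup; real runs contain model weights.
    os.makedirs(os.path.join(output_dir, "checkpoint-500"))
    os.makedirs(os.path.join(output_dir, "checkpoint-1000"))
    last_checkpoint = get_last_checkpoint(output_dir)
    print(last_checkpoint)  # .../checkpoint-1000 (the highest step wins)
```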
@@ -107,10 +107,10 @@ from datasets import load_dataset
 # example 1: local folder
 dataset = load_dataset("imagefolder", data_dir="path_to_your_folder")
-# example 2: local files (suppoted formats are tar, gzip, zip, xz, rar, zstd)
+# example 2: local files (supported formats are tar, gzip, zip, xz, rar, zstd)
 dataset = load_dataset("imagefolder", data_files="path_to_zip_file")
-# example 3: remote files (suppoted formats are tar, gzip, zip, xz, rar, zstd)
+# example 3: remote files (supported formats are tar, gzip, zip, xz, rar, zstd)
 dataset = load_dataset("imagefolder", data_files="https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip")
 # example 4: providing several splits
...
@@ -109,7 +109,7 @@ def main(args):
     tokenizer.decoder = decoders.Metaspace()
     if args.export_to_hub:
-        logger.info("Exporting the trained tokenzier to Hub.")
+        logger.info("Exporting the trained tokenizer to Hub.")
         new_tokenizer = AlbertTokenizerFast(tokenizer_object=tokenizer)
         new_tokenizer.push_to_hub("unigram-tokenizer-dataset")
...
@@ -512,7 +512,7 @@ def main():
            raise ValueError("--do_train requires a train dataset")
        train_dataset = datasets["train"]
        if data_args.max_train_samples is not None:
-           # We will select sample from whole data if agument is specified
+           # We will select sample from whole data if argument is specified
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))
        # Create train feature from dataset
...
@@ -211,7 +211,7 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
        self.input_shape = input_shape
        self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None
-       # To check if the model was intialized automatically.
+       # To check if the model was initialized automatically.
        self._is_initialized = _do_init
        if _do_init:
...
@@ -3736,7 +3736,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
                    else:
                        hf_quantizer.create_quantized_param(model, value, key, "cpu", state_dict)
-       # retrieve unintialized modules and initialize before maybe overriding that with the pretrained weights.
+       # retrieve uninitialized modules and initialize before maybe overriding that with the pretrained weights.
        if _fast_init:
            if not ignore_mismatched_sizes:
                if remove_prefix_from_model:
...
@@ -58,7 +58,7 @@ class BigBirdConfig(PretrainedConfig):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
-           The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+           The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
...
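A short usage sketch for the config fields documented above; `BigBirdConfig` and the defaults shown (e.g. `max_position_embeddings=4096`) come from the docstring itself, and `transformers` is assumed to be installed.

```python
from transformers import BigBirdConfig

# Set the dropout fields described in the docstring; other values keep their defaults.
config = BigBirdConfig(hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1)
print(config.hidden_dropout_prob)      # 0.1
print(config.max_position_embeddings)  # 4096, the default noted in the docstring
```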
@@ -53,7 +53,7 @@ class BioGptConfig(PretrainedConfig):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
-           The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+           The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (`int`, *optional*, defaults to 1024):
...
@@ -50,7 +50,7 @@ class CanineConfig(PretrainedConfig):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
-           The dropout probabilitiy for all fully connected layers in the embeddings, encoders, and pooler.
+           The dropout probability for all fully connected layers in the embeddings, encoders, and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (`int`, *optional*, defaults to 16384):
...