fix typo examples/community/roberta (#3925)

407aa484 · digger yu · GitHub · e417dd00 · 407aa484 · 407aa484
Unverified Commit 407aa484 authored Jun 08, 2023 by digger yu Committed by GitHub Jun 08, 2023
8 changed files
--- a/examples/community/roberta/README.md
+++ b/examples/community/roberta/README.md
@@ -44,7 +44,7 @@ following the `README.md`, load the h5py generated by preprocess of step 1 to pr
 ## 3. Finetune
-The checkpoint produced by this repo can replace `pytorch_model.bin` from  [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main) directly. Then use transfomers from Hugging Face to finetune downstream application.
+The checkpoint produced by this repo can replace `pytorch_model.bin` from  [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main) directly. Then use transformers from Hugging Face to finetune downstream application.
 ## Contributors
 The example is contributed by AI team from [Moore Threads](https://www.mthreads.com/). If you find any problems for pretraining, please file an issue or send an email to yehua.zhang@mthreads.com. At last, welcome any form of contribution!
--- a/examples/community/roberta/preprocessing/README.md
+++ b/examples/community/roberta/preprocessing/README.md
@@ -25,10 +25,10 @@ Firstly, each file has multiple documents, and each document contains multiple s
 In this example, split 200G Corpus into 100 shard, and each shard is about 2G. The size of the shard is memory-dependent, taking into account the number of servers, the memory used by the tokenizer, and the memory used by the multi-process training to read the shard (n data parallel requires n\*shard_size memory). **To sum up, data preprocessing and model pretraining requires fighting with hardware, not just GPU.**
 ```python
-python sentence_split.py --input_path /orginal_corpus --output_path /shard --shard 100
+python sentence_split.py --input_path /original_corpus --output_path /shard --shard 100
 # This step takes a short time
 ```
-* `--input_path`: all original corpus, e.g., /orginal_corpus/0.json /orginal_corpus/1.json ...
+* `--input_path`: all original corpus, e.g., /original_corpus/0.json /original_corpus/1.json ...
 * `--output_path`: all shard with split sentences, e.g., /shard/0.txt, /shard/1.txt ...
 * `--shard`: Number of shard, e.g., 10, 50, or 100
@@ -76,7 +76,7 @@ make
 * `--input_path`: location of all shard with split sentences, e.g., /shard/0.txt, /shard/1.txt ...
 * `--output_path`: location of all h5 with token_id, input_mask, segment_ids and masked_lm_positions, e.g., /h5/0.h5, /h5/1.h5 ...
-* `--tokenizer_path`: tokenizer path contains huggingface tokenizer.json. Download config.json, special_tokens_map.json, vocab.txt and tokenzier.json from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main)
+* `--tokenizer_path`: tokenizer path contains huggingface tokenizer.json. Download config.json, special_tokens_map.json, vocab.txt and tokenizer.json from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main)
 * `--backend`: python or c++, **specifies c++ can obtain faster preprocess speed**
 * `--dupe_factor`: specifies how many times the preprocessor repeats to create the input from the same article/document
 * `--worker`: number of process

--- a/examples/community/roberta/pretraining/README.md
+++ b/examples/community/roberta/pretraining/README.md
@@ -13,7 +13,7 @@ bash run_pretrain.sh
 * `--bert_config`: config.json which represent model
 * `--mlm`: model type of backbone, bert or deberta_v2
-2. if resume training from earylier checkpoint, run the script below.
+2. if resume training from earlier checkpoint, run the script below.
 ```shell
 bash run_pretrain_resume.sh

--- a/examples/community/roberta/pretraining/arguments.py
+++ b/examples/community/roberta/pretraining/arguments.py
@@ -46,7 +46,7 @@ def parse_args():
                        type=int,
                        default=1,
                        help="This param makes sure that a certain task is repeated for this time steps to \
-        optimise on the back propogation speed with APEX's DistributedDataParallel")
+        optimize on the back propagation speed with APEX's DistributedDataParallel")
    parser.add_argument("--max_predictions_per_seq",
                        "--max_pred",
                        default=80,
@@ -73,12 +73,12 @@ def parse_args():
                        help="location of saving checkpoint, which contains model and optimizer")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--vscode_debug', action='store_true', help="use vscode to debug")
-    parser.add_argument('--load_pretrain_model', default='', type=str, help="location of model's checkpoin")
+    parser.add_argument('--load_pretrain_model', default='', type=str, help="location of model's checkpoint")
    parser.add_argument(
        '--load_optimizer_lr',
        default='',
        type=str,
-        help="location of checkpoint, which contains optimerzier, learning rate, epoch, shard and global_step")
+        help="location of checkpoint, which contains optimizer, learning rate, epoch, shard and global_step")
    parser.add_argument('--resume_train', action='store_true', help="whether resume training from a early checkpoint")
    parser.add_argument('--mlm', default='bert', type=str, help="model type, bert or deberta")
    parser.add_argument('--checkpoint_activations', action='store_true', help="whether to use gradient checkpointing")

--- a/examples/community/roberta/pretraining/model/bert.py
+++ b/examples/community/roberta/pretraining/model/bert.py
@@ -327,7 +327,7 @@ class BertSelfAttention(nn.Module):
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
-                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhld,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

--- a/examples/community/roberta/pretraining/run_pretraining.py
+++ b/examples/community/roberta/pretraining/run_pretraining.py
@@ -78,7 +78,7 @@ def main():
                             default_pg=shard_pg):
            config, model, numel = get_model(args, logger)
-        # asign running configurations
+        # assign running configurations
        gemini_config = None
        if args.distplan.startswith("CAI_ZeRO"):
            optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True)

--- a/examples/community/roberta/pretraining/utils/exp_util.py
+++ b/examples/community/roberta/pretraining/utils/exp_util.py
@@ -97,7 +97,7 @@ def throughput_calculator(numel, args, config, iteration_time, total_iterations,
 def synchronize():
    if not torch.distributed.is_available():
        return
-    if not torch.distributed.is_intialized():
+    if not torch.distributed.is_initialized():
        return
    world_size = torch.distributed.get_world_size()
    if world_size == 1:

--- a/examples/community/roberta/pretraining/utils/global_vars.py
+++ b/examples/community/roberta/pretraining/utils/global_vars.py
@@ -110,7 +110,7 @@ class Timers:
        """Write timers to a tensorboard writer"""
        # currently when using add_scalars,
        # torch.utils.add_scalars makes each timer its own run, which
-        # polutes the runs list, so we just add each as a scalar
+        # pollutes the runs list, so we just add each as a scalar
        assert normalizer > 0.0
        for name in names:
            value = self.timers[name].elapsed(reset=reset) / normalizer