[nfc] fix typo colossalai/cli fx kernel (#3847)

* fix typo colossalai/autochunk auto_parallel amp
* fix typo colossalai/auto_parallel nn utils etc.
* fix typo colossalai/auto_parallel autochunk fx/passes etc.
* fix typo docs/
* change placememt_policy to placement_policy in docs/ and examples/
* fix typo colossalai/ applications/
* fix typo colossalai/cli fx kernel

[nfc] fix typo colossalai/cli fx kernel (#3847)
* fix typo colossalai/autochunk auto_parallel amp
* fix typo colossalai/auto_parallel nn utils etc.
* fix typo colossalai/auto_parallel autochunk fx/passes etc.
* fix typo docs/
* change placememt_policy to placement_policy in docs/ and examples/
* fix typo colossalai/ applications/
* fix typo colossalai/cli fx kernel
70c8cdec · digger yu · GitHub · 281b33f3 · 70c8cdec · 70c8cdec
Unverified Commit 70c8cdec authored Jun 02, 2023 by digger yu Committed by GitHub Jun 02, 2023
11 changed files
--- a/colossalai/cli/launcher/__init__.py
+++ b/colossalai/cli/launcher/__init__.py
@@ -28,7 +28,7 @@ from .run import launch_multi_processes
    type=str,
    default=None,
    help=
-    "Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --includ,"
+    "Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --include,"
    " only effective when used with --hostfile.")
 @click.option("--num_nodes",
              type=int,

--- a/colossalai/cli/launcher/hostinfo.py
+++ b/colossalai/cli/launcher/hostinfo.py
@@ -38,7 +38,7 @@ class HostInfo:

        # socket.getfqdn("127.0.0.1") does not return localhost
        # on some users' machines
-        # thus, we directly return True if hostname is locahost, 127.0.0.1 or 0.0.0.0
+        # thus, we directly return True if hostname is localhost, 127.0.0.1 or 0.0.0.0
        if hostname in ("localhost", "127.0.0.1", "0.0.0.0"):
            return True


--- a/colossalai/cli/launcher/multinode_runner.py
+++ b/colossalai/cli/launcher/multinode_runner.py
@@ -114,7 +114,7 @@ class MultiNodeRunner:
        Receive messages from all hosts

        Returns:
-            msg_from_node (dict): a dictionry which contains messages from each node
+            msg_from_node (dict): a dictionary which contains messages from each node
        """

        msg_from_node = dict()

--- a/colossalai/cli/launcher/run.py
+++ b/colossalai/cli/launcher/run.py
@@ -298,7 +298,7 @@ def launch_multi_processes(args: Config) -> None:
    # receive the stop status
    msg_from_node = runner.recv_from_all()

-    # printe node status
+    # print node status
    click.echo("\n====== Stopping All Nodes =====")
    for hostname, msg in msg_from_node.items():
        click.echo(f"{hostname}: {msg}")

--- a/colossalai/device/alpha_beta_profiler.py
+++ b/colossalai/device/alpha_beta_profiler.py
@@ -197,7 +197,7 @@ class AlphaBetaProfiler:
            dist.broadcast_object_list(broadcast_list, src=process_group[0])
            alpha_beta_dict[process_group] = tuple(broadcast_list)

-        # add symmetry pair to the apha_beta_dict
+        # add symmetry pair to the alpha_beta_dict
        symmetry_ab_dict = {}
        for process_group, alpha_beta_pair in alpha_beta_dict.items():
            symmetry_process_group = (process_group[1], process_group[0])

--- a/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_module/bias_addition_module.py
+++ b/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_module/bias_addition_module.py
@@ -51,7 +51,7 @@ class BiasAdditionModule(ABC):

        For example:
            The kwargs for conv2d module is {} because the attributes like 'padding' or 'groups' are
-            considered during module initilizing. However, we need to consider those attributes as kwargs
+            considered during module initializing. However, we need to consider those attributes as kwargs
            in F.conv2d.
        """
        pass

--- a/colossalai/fx/tracer/experimental.py
+++ b/colossalai/fx/tracer/experimental.py
@@ -295,7 +295,7 @@ class ColoTracer(Tracer):

                @staticmethod
                def forward(ctx, run_function, preserve_rng_state, *args):
-                    # signal that the current tracing occurs within activaton checkpoint part
+                    # signal that the current tracing occurs within activation checkpoint part
                    self.inside_torch_checkpoint_func = True
                    out = run_function(*args)
                    self.inside_torch_checkpoint_func = False

--- a/colossalai/fx/tracer/tracer.py
+++ b/colossalai/fx/tracer/tracer.py
@@ -92,7 +92,7 @@ class ColoTracer(Tracer):
            return proxy

        # if graph is traced for auto parallelism module, some extra node will be added during
-        # graph construction to deal with the compatability between bias addition and all reduce.
+        # graph construction to deal with the compatibility between bias addition and all reduce.

        # if no extra manipulation is applied, we just pass the origin arguments to create_proxy function
        # to create node on computation graph
@@ -208,7 +208,7 @@ class ColoTracer(Tracer):
            self.proxy_cls = ColoProxy
            self.tracer_type = TracerType.META
        else:
-            raise ValueError(f"Unrecognised tracer type {tracer_type}")
+            raise ValueError(f"Unrecognized tracer type {tracer_type}")

    def _meta_data_computing(self, kind, target, args, kwargs):

@@ -445,7 +445,7 @@ class ColoTracer(Tracer):

                @staticmethod
                def forward(ctx, run_function, preserve_rng_state, *args):
-                    # signal that the current tracing occurs within activaton checkpoint part
+                    # signal that the current tracing occurs within activation checkpoint part
                    self.inside_torch_checkpoint_func = True
                    out = run_function(*args)
                    self.inside_torch_checkpoint_func = False

--- a/colossalai/kernel/cuda_native/flash_attention.py
+++ b/colossalai/kernel/cuda_native/flash_attention.py
@@ -138,7 +138,7 @@ if HAS_MEM_EFF_ATTN:
            elif attn_mask_type == AttnMaskType.causal:    # gpt style
                attn_bias = LowerTriangularMask()

-            if bias is not None:    # alibi / relative position emebedding
+            if bias is not None:    # alibi / relative position embedding
                assert allow_alibi, "flash attention with bias is not supported in this system."
                assert attn_mask_type == AttnMaskType.causal, \
                    "attention with bias is only supported for causal attention so far."

--- a/colossalai/kernel/cuda_native/multihead_attention.py
+++ b/colossalai/kernel/cuda_native/multihead_attention.py
@@ -43,7 +43,7 @@ class Config:
    attn_prob_dropout_ratio: float    # attention score dropout ratio
    hidden_dropout_ratio: float    # dropout ration before residual
    norm_first: bool    # norm_first
-    fp16: bool    # fp16 presion
+    fp16: bool    # fp16 precision


 class MultiHeadAttention1DFunc(Function):

--- a/colossalai/kernel/jit/option.py
+++ b/colossalai/kernel/jit/option.py
@@ -43,7 +43,7 @@ def warmup_jit_fusion(batch_size: int,
                      seq_length: int = 512,
                      vocab_size: int = 32768,
                      dtype: torch.dtype = torch.float32):
-    """ Compilie JIT functions before the main training steps """
+    """ Compile JIT functions before the main training steps """

    embed = Embedding(vocab_size, hidden_size).to(get_current_device())
    linear_1 = Linear(hidden_size, hidden_size * 4, skip_bias_add=True).to(get_current_device())