Unverified Commit e4f506a0 authored by hugo-syn's avatar hugo-syn Committed by GitHub
Browse files

chore: Fix multiple typos (#617)


Signed-off-by: default avatarhugo-syn <hugo.vincent@synacktiv.com>
parent 051db0d7
......@@ -37,7 +37,7 @@ $ torchrun --standalone --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) fsd
# [GPU-0] Peak memory use = 3000MiB
# FSDP with deferred initialization:
# Modules initialized with empty paramaters via `device='meta'` option. Zero load on device
# Modules initialized with empty parameters via `device='meta'` option. Zero load on device
# memory until torch.distributed.fsdp.FullyShardedDataParallel mode triggers a reset on
# on already sharded model parameters.
$ torchrun --standalone --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) fsdp.py --defer-init
......
......@@ -250,7 +250,7 @@ class FusedAttnRunner:
self._setup_inputs()
def grad_func(func, *args, **kwargs):
# Gradient is small, use a gradient multiplier to amplify the graident
# Gradient is small, use a gradient multiplier to amplify the gradient
gradient_multiplier = self.valid_len_q * self.num_heads_q
if is_causal_mask(self.attn_mask_type):
gradient_multiplier /= 10
......
......@@ -204,7 +204,7 @@ class TestFP8Functions(unittest.TestCase):
(MeshResource(None, 'tp')),
(MeshResource('dp', 'tp')),
)
# TODO (Ming Huang): Suport multi-GPUs testing. # pylint: disable=fixme
# TODO (Ming Huang): Support multi-GPUs testing. # pylint: disable=fixme
mesh_shape = (1, 1)
devices = np.asarray(jax.devices()[:1]).reshape(*mesh_shape)
with jax.sharding.Mesh(devices, ('dp', 'tp')):
......
......@@ -100,7 +100,7 @@ class TestAttentionTp(unittest.TestCase):
paddle.distributed.all_gather(total_weight, partial_weight, group=tp_group)
if interleave:
# Due to the interleaved qkv layout, need to concat on num_head
# dimention for column parallel linear in MultiHeadAttention layer
# dimension for column parallel linear in MultiHeadAttention layer
assert axis == 0
assert [3 * self.hidden_size // self.world_size,
self.hidden_size] == partial_weight.shape
......
......@@ -101,7 +101,7 @@ class TestTransformerTp(unittest.TestCase):
paddle.distributed.all_gather(total_weight, partial_weight, group=tp_group)
if interleave:
# Due to the interleaved qkv layout, need to concat on num_head
# dimention for column parallel linear in MultiHeadAttention layer
# dimension for column parallel linear in MultiHeadAttention layer
assert axis == 0
assert [3 * self.hidden_size // self.world_size,
self.hidden_size] == partial_weight.shape
......
......@@ -668,7 +668,7 @@ def test_transformer_layer(dtype, model_configs, model, ckpt_attn, qkv_format, f
@pytest.mark.parametrize("model", ["te_1_2", "te_2_0"])
@pytest.mark.parametrize("qkv_format", ["bshd", "sbhd"])
def test_te_layer_misc(dtype, model_configs, model, qkv_format):
"""Test TransformerLayer module with miscellanous settings"""
"""Test TransformerLayer module with miscellaneous settings"""
ckpt_attn = True
fused_qkv_params = True
RoPE = True
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment