fix incorrect sharding without zero (#5545)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

fix incorrect sharding without zero (#5545)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
7e0ec5a8 · Edenzzzz · GitHub · e614aa34 · 7e0ec5a8
Unverified commit 7e0ec5a8, authored Apr 02, 2024 by Edenzzzz; committed by GitHub on Apr 02, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 2 deletions

colossalai/shardformer/shard/shard_config.py +4 -2

No files found.
--- a/colossalai/shardformer/shard/shard_config.py
+++ b/colossalai/shardformer/shard/shard_config.py
@@ -74,8 +74,10 @@ class ShardConfig:
        self.enable_fused_normalization = True
        self.enable_flash_attention = True
        self.enable_jit_fused = True
-        self.enable_sequence_parallelism = True
+        # This can cause non-in-place param sharding when used without ZeRO.
-        self.enable_sequence_overlap = True
+        # It may also slow down training when seq len is small. Plz enable manually.
+        # self.enable_sequence_parallelism = True
+        # self.enable_sequence_overlap = True

    def _infer(self):
        """