Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
8d3250d7
Unverified
Commit
8d3250d7
authored
Mar 21, 2022
by
ver217
Committed by
GitHub
Mar 21, 2022
Browse files
[zero] ZeRO supports pipeline parallel (#477)
parent
7f5e4592
Changes
3
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
113 additions
and
95 deletions
+113
-95
colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
...e/gradient_handler/_pipeline_parallel_gradient_handler.py
+6
-4
colossalai/engine/schedule/_pipeline_schedule.py
colossalai/engine/schedule/_pipeline_schedule.py
+95
-91
colossalai/zero/sharded_model/sharded_model_v2.py
colossalai/zero/sharded_model/sharded_model_v2.py
+12
-0
No files found.
colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
View file @
8d3250d7
#!/usr/bin/env python
#!/usr/bin/env python
import
torch.distributed
as
dist
from
collections
import
defaultdict
from
torch._utils
import
_flatten_dense_tensors
,
_unflatten_dense_tensors
import
torch
import
torch.distributed
as
dist
from
colossalai.core
import
global_context
as
gpc
from
colossalai.core
import
global_context
as
gpc
from
colossalai.registry
import
GRADIENT_HANDLER
from
colossalai.registry
import
GRADIENT_HANDLER
from
torch._utils
import
_flatten_dense_tensors
,
_unflatten_dense_tensors
from
._base_gradient_handler
import
BaseGradientHandler
from
._base_gradient_handler
import
BaseGradientHandler
from
collections
import
defaultdict
@
GRADIENT_HANDLER
.
register_module
@
GRADIENT_HANDLER
.
register_module
...
@@ -35,7 +37,7 @@ class PipelineSharedModuleGradientHandler(BaseGradientHandler):
...
@@ -35,7 +37,7 @@ class PipelineSharedModuleGradientHandler(BaseGradientHandler):
for
group
,
group_buckets
in
buckets
.
items
():
for
group
,
group_buckets
in
buckets
.
items
():
for
tp
,
bucket
in
group_buckets
.
items
():
for
tp
,
bucket
in
group_buckets
.
items
():
grads
=
[
param
.
grad
.
data
for
param
in
bucket
]
grads
=
[
param
.
grad
.
data
for
param
in
bucket
]
coalesced
=
_flatten_dense_tensors
(
grads
)
coalesced
=
_flatten_dense_tensors
(
grads
)
.
to
(
torch
.
cuda
.
current_device
())
dist
.
all_reduce
(
coalesced
,
op
=
dist
.
ReduceOp
.
SUM
,
group
=
group
)
dist
.
all_reduce
(
coalesced
,
op
=
dist
.
ReduceOp
.
SUM
,
group
=
group
)
for
buf
,
synced
in
zip
(
grads
,
_unflatten_dense_tensors
(
coalesced
,
grads
)):
for
buf
,
synced
in
zip
(
grads
,
_unflatten_dense_tensors
(
coalesced
,
grads
)):
buf
.
copy_
(
synced
)
buf
.
copy_
(
synced
)
colossalai/engine/schedule/_pipeline_schedule.py
View file @
8d3250d7
This diff is collapsed.
Click to expand it.
colossalai/zero/sharded_model/sharded_model_v2.py
View file @
8d3250d7
...
@@ -262,3 +262,15 @@ class ShardedModelV2(nn.Module):
...
@@ -262,3 +262,15 @@ class ShardedModelV2(nn.Module):
def load_state_dict(self, state_dict: 'OrderedDict[str, torch.Tensor]', strict: bool = True):
    """Loading a state dict into the sharded model is not implemented.

    Args:
        state_dict: mapping of parameter names to tensors (unused).
        strict: whether to enforce exact key matching (unused).

    Raises:
        NotImplementedError: always; no load path exists for this class yet.
    """
    raise NotImplementedError
def __getitem__(self, idx: int):
    """Delegate indexing to the wrapped module.

    Only valid when the wrapped module is an ``nn.ModuleList``;
    otherwise the assertion fails.
    """
    wrapped = self.module
    assert isinstance(wrapped, nn.ModuleList)
    return wrapped[idx]
def __len__(self):
    """Return the number of submodules in the wrapped ``nn.ModuleList``.

    Asserts that the wrapped module actually is an ``nn.ModuleList``.
    """
    wrapped = self.module
    assert isinstance(wrapped, nn.ModuleList)
    return len(wrapped)
def __iter__(self):
    """Iterate over the submodules of the wrapped ``nn.ModuleList``.

    Asserts that the wrapped module actually is an ``nn.ModuleList``.
    """
    wrapped = self.module
    assert isinstance(wrapped, nn.ModuleList)
    return iter(wrapped)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment