Unverified Commit d24d3de9 authored by Samyam Rajbhandari's avatar Samyam Rajbhandari Committed by GitHub
Browse files

Samyamr/cpu memory bloat fix zero (#233)

* Fix for CPU memory bloating issue caused by PyTorch backward-graph creation in all_gather. Fixed by calling detach on tensors before calling all_gather

* Fix for CPU memory bloating issue caused by PyTorch backward-graph creation in all_gather. Fixed by calling detach on tensors before calling all_gather

* Fix for CPU memory bloating issue caused by PyTorch backward-graph creation in all_gather. Fixed by calling detach on tensors before calling all_gather
parent abe2204d
...@@ -1112,23 +1112,15 @@ class FP16_DeepSpeedZeroOptimizer(object): ...@@ -1112,23 +1112,15 @@ class FP16_DeepSpeedZeroOptimizer(object):
1, 1,
partitioned_params[partition_id].numel() * dp_world_size // partitioned_params[partition_id].numel() * dp_world_size //
self.allgather_bucket_size) self.allgather_bucket_size)
if num_shards == 1:
dist.all_gather(partitioned_params,
partitioned_params[partition_id],
group=self.dp_process_group)
else:
shard_size = partitioned_params[partition_id].numel() // num_shards shard_size = partitioned_params[partition_id].numel() // num_shards
num_elements = shard_size num_elements = shard_size
assert shard_size * num_shards <= partitioned_params[partition_id].numel()
for shard_id in range(num_shards): for shard_id in range(num_shards):
#boundary condition
#TODO: Check correctness of boundary condition
if shard_id == (num_shards - 1): if shard_id == (num_shards - 1):
if shard_size * num_shards >= partitioned_params[
partition_id].numel():
break
else:
num_elements = partitioned_params[partition_id].numel( num_elements = partitioned_params[partition_id].numel(
) - shard_id * shard_size ) - shard_id * shard_size
...@@ -1137,8 +1129,9 @@ class FP16_DeepSpeedZeroOptimizer(object): ...@@ -1137,8 +1129,9 @@ class FP16_DeepSpeedZeroOptimizer(object):
curr_shard = partitioned_params[dp_id].narrow( curr_shard = partitioned_params[dp_id].narrow(
0, 0,
shard_id * shard_size, shard_id * shard_size,
num_elements) num_elements).detach()
shard_list.append(curr_shard) shard_list.append(curr_shard)
dist.all_gather(shard_list, dist.all_gather(shard_list,
shard_list[partition_id], shard_list[partition_id],
group=self.dp_process_group) group=self.dp_process_group)
......
...@@ -331,7 +331,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage1(object): ...@@ -331,7 +331,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage1(object):
list) # [rank] -> [(start,end), (start,end), ...] list) # [rank] -> [(start,end), (start,end), ...]
for idx in range(num_sub_partitions): for idx in range(num_sub_partitions):
rank_id = idx % world_size rank_id = idx % world_size
sub_partition = tensor.narrow(0, start, sub_partition_size) sub_partition = tensor.narrow(0, start, sub_partition_size).detach()
element_intervals[rank_id].append((start, start + sub_partition_size)) element_intervals[rank_id].append((start, start + sub_partition_size))
comm_partitions[comm_id].append(sub_partition) comm_partitions[comm_id].append(sub_partition)
start = start + sub_partition_size start = start + sub_partition_size
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment