"git@developer.sourcefind.cn:OpenDAS/pytorch3d.git" did not exist on "379c8b27803ce527387854ea9f7f612170a5ecbb"
Unverified commit b1d4bd73, authored by Jeff Rasley and committed by GitHub
Browse files

fix for 16GB v100 nodes (#393)

parent 2dea61f2
...@@ -1585,6 +1585,7 @@ class FP16_DeepSpeedZeroOptimizer(object): ...@@ -1585,6 +1585,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
if self.cpu_offload: if self.cpu_offload:
torch.cuda.current_stream().wait_stream(self.migration_stream) torch.cuda.current_stream().wait_stream(self.migration_stream)
# TODO: we need to revisit this and remove the magic 4.5x multiplier here
if self.contiguous_gradients: if self.contiguous_gradients:
self.ipg_buffer = [] self.ipg_buffer = []
buf_0 = torch.empty(int(self.reduce_bucket_size * 4.5), buf_0 = torch.empty(int(self.reduce_bucket_size * 4.5),
......
...@@ -445,7 +445,9 @@ def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload): ...@@ -445,7 +445,9 @@ def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload):
}, },
"zero_optimization": { "zero_optimization": {
"stage": zero_stage, "stage": zero_stage,
"cpu_offload": use_cpu_offload "cpu_offload": use_cpu_offload,
"reduce_bucket_size": 100,
"allgather_bucket_size": 100
} }
} }
args = args_from_dict(tmpdir, config_dict) args = args_from_dict(tmpdir, config_dict)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment