"git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "aa31740b1701dd389ac76ae309a2ce1c449c6a59"
Unverified commit 512a0d4d, authored by Olatunji Ruwase and committed by GitHub
Browse files

Fix index out of range error when parameter count is not multiple of ranks (#202)

parent c014a55b
...@@ -264,6 +264,13 @@ class FP16_DeepSpeedZeroOptimizer(object): ...@@ -264,6 +264,13 @@ class FP16_DeepSpeedZeroOptimizer(object):
flat_tensor_list = [] flat_tensor_list = []
current_size = 0 current_size = 0
if not tensor_list:
flat_tensor_list.append(
torch.zeros(int(partition_size),
dtype=dtype,
device=torch.cuda.current_device()))
return _flatten_dense_tensors(flat_tensor_list)
if dtype is None: if dtype is None:
dtype = tensor_list[0].dtype dtype = tensor_list[0].dtype
......
...@@ -315,3 +315,33 @@ def test_zero_allow_untested_optimizer(tmpdir): ...@@ -315,3 +315,33 @@ def test_zero_allow_untested_optimizer(tmpdir):
model_parameters=model.parameters()) model_parameters=model.parameters())
_test_zero_allow_untested_optimizer(args) _test_zero_allow_untested_optimizer(args)
def test_zero_empty_partition(tmpdir):
    """Regression test for ZeRO when a rank's parameter partition is empty.

    A model with two parameters is initialized across three distributed
    workers, so there are fewer parameters than ranks. The test passes when
    engine construction and a single ``step()`` call finish without raising.

    Args:
        tmpdir: Temporary directory handed to ``args_from_dict`` together
            with the DeepSpeed configuration.
    """
    fp16_section = {"enabled": True}
    adam_section = {"type": "Adam", "params": {"lr": 0.00015}}
    ds_config = {
        "train_batch_size": 3,
        "fp16": fp16_section,
        "optimizer": adam_section,
        "zero_optimization": True,
    }
    args = args_from_dict(tmpdir, ds_config)

    @distributed_test(world_size=[3])
    def _test_zero_empty_partition(args):
        hidden_dim = 1
        net = SimpleModel(hidden_dim)

        # Ensure model has 2 parameters, to cause empty partition with DP=3
        param_count = sum(1 for _ in net.parameters())
        assert param_count == 2

        # Only the first of the four values returned by initialize() is used.
        engine, _, _, _ = deepspeed.initialize(
            args=args,
            model=net,
            model_parameters=net.parameters(),
        )
        engine.step()

    _test_zero_empty_partition(args)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment