Unverified Commit 3059d80d authored by Olatunji Ruwase, committed by GitHub

[DeepSpeed ZeRO3] Fix performance degradation in sharded models (#18911)



* [DeepSpeed] Fix performance degradation in sharded models

* style

* polish
Co-authored-by: Stas Bekman <stas@stason.org>
parent 10c774cf
@@ -421,12 +421,17 @@ def _load_state_dict_into_model(model_to_load, state_dict, start_prefix):
         if is_deepspeed_zero3_enabled():
             import deepspeed

-            # because zero3 puts placeholders in model params, this context
-            # manager gathers (unpartitions) the params of the current layer, then loads from
-            # the state dict and then re-partitions them again
-            with deepspeed.zero.GatheredParameters(list(module.parameters(recurse=False)), modifier_rank=0):
-                if torch.distributed.get_rank() == 0:
-                    module._load_from_state_dict(*args)
+            # In sharded models, each shard has only part of the full state_dict, so only gather
+            # parameters that are in the current state_dict.
+            named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
+            params_to_gather = [named_parameters[k] for k in state_dict.keys() if k in named_parameters]
+            if len(params_to_gather) > 0:
+                # because zero3 puts placeholders in model params, this context
+                # manager gathers (unpartitions) the params of the current layer, then loads from
+                # the state dict and then re-partitions them again
+                with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
+                    if torch.distributed.get_rank() == 0:
+                        module._load_from_state_dict(*args)
         else:
             module._load_from_state_dict(*args)
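The diff turns an unconditional all-parameter gather into a conditional gather of only the parameters the current checkpoint shard actually contains. With sharded checkpoints, this loading routine runs once per shard file, so the old code triggered a ZeRO-3 gather (a collective operation) for every module on every shard, even when the shard held none of that module's weights. Below is a minimal standalone sketch of the same pattern; it assumes a ZeRO-3 run is already initialized, and `load_module_shard`, together with its `module`/`state_dict`/`prefix` arguments, is a hypothetical helper mirroring the names in the diff, not an actual transformers or DeepSpeed API.

```python
# Minimal sketch of the "gather only what this shard provides" pattern under ZeRO-3.
# Assumes DeepSpeed ZeRO-3 is already initialized (e.g. via deepspeed.initialize);
# `load_module_shard` and its arguments are hypothetical names chosen to mirror the diff.
import deepspeed
import torch


def load_module_shard(module: torch.nn.Module, state_dict: dict, prefix: str = "") -> None:
    """Load one module's weights from a (possibly partial) state_dict under ZeRO-3."""
    # Positional args expected by nn.Module._load_from_state_dict:
    # (state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
    args = (state_dict, prefix, {}, True, [], [], [])

    # Only gather the parameters this shard actually provides. Gathering everything
    # (the old behavior) forces a collective per module on every shard, even when the
    # shard contributes nothing to that module, which is what degraded performance.
    named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
    params_to_gather = [named_parameters[k] for k in state_dict if k in named_parameters]

    if len(params_to_gather) > 0:
        # GatheredParameters un-partitions the listed params, lets rank 0 copy the
        # shard's weights in, then re-partitions them when the context exits.
        with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
            if torch.distributed.get_rank() == 0:
                module._load_from_state_dict(*args)
```

Note that `deepspeed.zero.GatheredParameters` is a collective, so when it does run every rank must enter the context; the performance win in this fix comes from skipping the context entirely when `params_to_gather` is empty, which, with sharded checkpoints, is the common case for most modules on any given shard.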