Unverified Commit 645f1742 authored by Sylvain Gugger, committed by GitHub

Exit early in load if no weights are in the sharded state dict (#18937)

parent 660e0b97
```diff
@@ -418,22 +418,25 @@ def _load_state_dict_into_model(model_to_load, state_dict, start_prefix):
     def load(module: nn.Module, state_dict, prefix=""):
         local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
         args = (state_dict, prefix, local_metadata, True, [], [], error_msgs)
-        if is_deepspeed_zero3_enabled():
-            import deepspeed
+        # Parameters of module and children will start with prefix. We can exit early if there are none in this
+        # state_dict
+        if len([key for key in state_dict if key.startswith(prefix)]) > 0:
+            if is_deepspeed_zero3_enabled():
+                import deepspeed
 
-            # In sharded models, each shard has only part of the full state_dict, so only gather
-            # parameters that are in the current state_dict.
-            named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
-            params_to_gather = [named_parameters[k] for k in state_dict.keys() if k in named_parameters]
-            if len(params_to_gather) > 0:
-                # because zero3 puts placeholders in model params, this context
-                # manager gathers (unpartitions) the params of the current layer, then loads from
-                # the state dict and then re-partitions them again
-                with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
-                    if torch.distributed.get_rank() == 0:
-                        module._load_from_state_dict(*args)
-        else:
-            module._load_from_state_dict(*args)
+                # In sharded models, each shard has only part of the full state_dict, so only gather
+                # parameters that are in the current state_dict.
+                named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
+                params_to_gather = [named_parameters[k] for k in state_dict.keys() if k in named_parameters]
+                if len(params_to_gather) > 0:
+                    # because zero3 puts placeholders in model params, this context
+                    # manager gathers (unpartitions) the params of the current layer, then loads from
+                    # the state dict and then re-partitions them again
+                    with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
+                        if torch.distributed.get_rank() == 0:
+                            module._load_from_state_dict(*args)
+            else:
+                module._load_from_state_dict(*args)
 
         for name, child in module._modules.items():
             if child is not None:
```
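For context, here is the pattern the commit optimizes as a minimal, self-contained sketch in plain PyTorch (no DeepSpeed): recursion over child modules with the new prefix-based early exit. The helper name `load_partial`, the toy model, and the shard contents are hypothetical illustrations, not part of the transformers codebase.

```python
import torch
import torch.nn as nn


def load_partial(module: nn.Module, state_dict: dict, prefix: str = "") -> None:
    # Keys for this module and all of its children start with `prefix`; if the
    # shard's state_dict has no such key, the whole subtree can be skipped.
    if not any(key.startswith(prefix) for key in state_dict):
        return
    # Load only this module's own parameters/buffers (non-recursive call).
    module._load_from_state_dict(state_dict, prefix, {}, True, [], [], [])
    for name, child in module._modules.items():
        if child is not None:
            load_partial(child, state_dict, prefix + name + ".")


# Usage: a shard that only contains the weights of the second layer.
model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))
shard = {"1.weight": torch.zeros(2, 4), "1.bias": torch.zeros(2)}
load_partial(model, shard)          # the "0." subtree is skipped entirely
print(model[1].weight.abs().sum())  # tensor(0.), so the shard was applied
```

With sharded checkpoints, each shard contains only a subset of the model's keys, so most subtrees fail the prefix check immediately and their `_load_from_state_dict` calls are skipped altogether.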
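The DeepSpeed branch itself is unchanged apart from the extra indentation. As a sketch under the assumption that deepspeed is installed and `torch.distributed` is already initialized inside a ZeRO-3 run (e.g. via the deepspeed launcher), its gather-modify-repartition step looks like this in isolation; `load_zero3_layer` is a hypothetical wrapper around the code in the diff, with `module`, `state_dict`, `prefix`, and `args` as defined there.

```python
import deepspeed
import torch


def load_zero3_layer(module, state_dict, prefix, args):
    # Each shard holds only part of the full state_dict, so gather only the
    # parameters of this layer that the current shard actually provides.
    named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
    params_to_gather = [named_parameters[k] for k in state_dict if k in named_parameters]
    if len(params_to_gather) > 0:
        # GatheredParameters un-partitions the listed params, lets rank 0
        # modify them, and re-partitions them on exit (modifier_rank=0
        # broadcasts rank 0's values to the other ranks).
        with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
            if torch.distributed.get_rank() == 0:
                module._load_from_state_dict(*args)
```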