# It seems that `reduce_scatter` needs contiguous tensors: https://github.com/pytorch/pytorch/blob/2b267fa7f28e18ca6ea1de4201d2541a40411457/torch/distributed/nn/functional.py#L305
# We set grad_input to be contiguous in case it isn't already.
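# A minimal, self-contained sketch (assumed names, not the surrounding autograd code) of the
# pattern the two comments above describe: torch.distributed's reduce_scatter collectives expect
# contiguous inputs, so the gradient is made contiguous before the collective call. Assumes an
# initialized process group and that dim 0 is divisible by the world size.
import torch
import torch.distributed as dist


def _reduce_scatter_contiguous(grad_input: torch.Tensor, group=None) -> torch.Tensor:
    # .contiguous() is a no-op if the tensor is already contiguous.
    grad_input = grad_input.contiguous()
    world_size = dist.get_world_size(group)
    # Each rank receives a 1/world_size slice of the summed gradient along dim 0.
    output = torch.empty(
        (grad_input.shape[0] // world_size, *grad_input.shape[1:]),
        dtype=grad_input.dtype,
        device=grad_input.device,
    )
    dist.reduce_scatter_tensor(output, grad_input, op=dist.ReduceOp.SUM, group=group)
    return output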
f"[S3] Successfully removed {len(local_files)} local files ({all_removed}) from {self.local_path} (uploaded to {self.s3_path_direct_link}) in {total_time}"
)
return success

def update(self) -> (str, str):
    """Update the state of the mover: UPLOADING => REMOVING_DUPLICATED => DUPLICATING => REMOVING_CHECKPOINT => IDLE

    Returns:
        (str, str): The state and the stdout of the process if any
    """
    if self.process is None:
        self._reset_state()
        return self.state, self.stdout
    return_code = self.process.poll()
    if return_code is None:
        # Still running
        return self.state, self.stdout
    if return_code != 0:
        self.get_current_stdout()
        self._warning(
            f"[S3] Error running command {self.cmd} during process {self.state.value}, "
f"Process rank {dist.get_rank(parallel_context.world_pg)}/{parallel_context.world_pg.size()}: {name} still has a gradient despite having run the optimizer",
msg=lambda msg: f"tensor at {current_state_dict['names'][index]} doesn't match our reference. Optimizer key: {name}\nCur: {tensor}\nRef: {reference_tensor}\n{msg}",
"You are resuming in a different PP size, so optimizer states need to be checked. Feel free to open a PR if you work on this!"
)
assert (
    param_shard_metadata is not None
), f"You have to pass how the original parameters are sharded in order to resume in a different tensor parallel size, ckp_tp_size: {ckp_tp_size}, current tp_size: {parallel_context.tp_pg.size()}"
assert (
    model is not None
), "You have to pass the model in order to adjust the optimizer states according to how the current parameters are sharded"
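
# A minimal sketch (hypothetical names, not the real loader) of the resume contract the asserts
# above enforce: resuming with a different tensor-parallel size is only possible when both the
# shard metadata of the original parameters and the current model are available, so the
# optimizer states can be re-sharded to match the new layout.
from typing import Any, Optional


def check_resume_topology(
    ckp_tp_size: int,
    current_tp_size: int,
    param_shard_metadata: Optional[dict] = None,
    model: Optional[Any] = None,
) -> None:
    """Validate that everything needed to re-shard optimizer states is available."""
    if ckp_tp_size == current_tp_size:
        return  # Same topology, nothing to re-shard.
    assert param_shard_metadata is not None, (
        f"Shard metadata of the original parameters is required to resume with a different "
        f"tensor parallel size (checkpoint tp={ckp_tp_size}, current tp={current_tp_size})"
    )
    assert model is not None, (
        "The model is required so optimizer states can be adjusted to the current parameter sharding"
    )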