Commit b49ea81c authored by Spencer Poff's avatar Spencer Poff Committed by Facebook Github Bot
Browse files

check save_dir before beginning training

Summary: I sadly discovered, after 8 hours of training, that my checkpoint directory wasn't globally readable. Adding this check at the beginning of the train loop to keep that from happening again!

Reviewed By: myleott

Differential Revision: D16455394

fbshipit-source-id: 35959aa058150b2afb63710c468d01ebc8a12b0c
parent 208295df
......@@ -358,3 +358,15 @@ def load_pretrained_component_from_model(
component_state_dict[component_subkey] = state["model"][key]
component.load_state_dict(component_state_dict, strict=True)
return component
def verify_checkpoint_directory(save_dir: str) -> None:
    """Fail fast if checkpoints cannot be written to ``save_dir``.

    The directory is created when it does not exist yet, then a uniquely
    named probe file is created inside it and removed again.

    Args:
        save_dir: Directory in which checkpoints will be saved.

    Raises:
        OSError: If the directory cannot be created or a file cannot be
            created inside it. A one-line message is printed before the
            original exception is re-raised.
    """
    # Imported locally so this function does not rely on a module-level
    # ``tempfile`` import being present.
    import tempfile

    try:
        # An empty path means "current directory" for the probe below, but
        # os.makedirs('') raises, so only create non-empty paths.
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        # A unique name guarantees that an existing file (for example one
        # literally called 'dummy') is never truncated or deleted.
        probe_fd, probe_path = tempfile.mkstemp(prefix='dummy', dir=save_dir)
    except OSError:
        print('| Unable to access checkpoint save directory: {}'.format(save_dir))
        raise
    else:
        os.close(probe_fd)
        os.remove(probe_path)
......@@ -35,6 +35,9 @@ def main(args, init_distributed=False):
if init_distributed:
args.distributed_rank = distributed_utils.distributed_init(args)
if distributed_utils.is_master(args):
checkpoint_utils.verify_checkpoint_directory(args.save_dir)
# Print args
print(args)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment