Unverified commit 1f972427, authored by Arash Ashari, committed by GitHub
Browse files

Only global rank 0 logs TensorBoard data; avoid a multi-GPU/multi-node race for the log directory (#296)

parent 376818ef
...@@ -144,12 +144,12 @@ class DeepSpeedLight(Module): ...@@ -144,12 +144,12 @@ class DeepSpeedLight(Module):
self._configure_with_arguments(args, mpu) self._configure_with_arguments(args, mpu)
self._do_sanity_check() self._do_sanity_check()
self._init_distributed(dist_init_required)
self.sample_count = 0 self.sample_count = 0
if self.tensorboard_enabled(): if self.tensorboard_enabled() and self.global_rank == 0:
self.summary_writer = self.get_summary_writer() self.summary_writer = self.get_summary_writer()
self._init_distributed(dist_init_required)
# Configure distributed model # Configure distributed model
self._configure_distributed_model(model) self._configure_distributed_model(model)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment