Unverified commit a937425e, authored by Alessio Serra, committed by GitHub
Browse files

Prevent MLflow exception from disrupting training (#28779)



Changed MLflow metric logging from synchronous to asynchronous by passing `synchronous=False` to `log_metrics()`.
Co-authored-by: codiceSpaghetti <alessio.ser@hotmail.it>
parent d703eaae
...@@ -1036,7 +1036,7 @@ class MLflowCallback(TrainerCallback): ...@@ -1036,7 +1036,7 @@ class MLflowCallback(TrainerCallback):
f'Trainer is attempting to log a value of "{v}" of type {type(v)} for key "{k}" as a metric. ' f'Trainer is attempting to log a value of "{v}" of type {type(v)} for key "{k}" as a metric. '
"MLflow's log_metric() only accepts float and int types so we dropped this attribute." "MLflow's log_metric() only accepts float and int types so we dropped this attribute."
) )
self._ml_flow.log_metrics(metrics=metrics, step=state.global_step) self._ml_flow.log_metrics(metrics=metrics, step=state.global_step, synchronous=False)
def on_train_end(self, args, state, control, **kwargs): def on_train_end(self, args, state, control, **kwargs):
if self._initialized and state.is_world_process_zero: if self._initialized and state.is_world_process_zero:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment