Unverified Commit 4c572f04 authored by Paul Gibbons's avatar Paul Gibbons Committed by GitHub
Browse files

[PyTorch Debug] Fix issue with start_end_list logging feature (#2252)



* fixes for start_end_list usage in TE debug
Signed-off-by: Paul Gibbons <pgibbons@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



---------
Signed-off-by: Paul Gibbons <pgibbons@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Paweł Gadziński <62263673+pggPL@users.noreply.github.com>
parent fd2f589f
......@@ -290,10 +290,16 @@ class LogFp8TensorStats(BaseLogTensorStats):
for stat in config["stats"]:
self.check_if_stat_is_supported(stat, recipe_name)
start_step = config.get("start_step", None)
end_step = config.get("end_step", None)
start_end_list = config.get("start_end_list", None)
if start_end_list is not None:
start_end_list = tuple(tuple(int(x) for x in interval) for interval in start_end_list)
options = (
config.get("start_step", None),
config.get("end_step", None),
config.get("start_end_list", None),
start_step,
end_step,
start_end_list,
"fp8",
)
......
......@@ -130,10 +130,16 @@ class LogTensorStats(BaseLogTensorStats):
" log_tensor_stats. Use log_fp8_tensor_stats for FP8 tensors."
)
start_step = config.get("start_step", None)
end_step = config.get("end_step", None)
start_end_list = config.get("start_end_list", None)
if start_end_list is not None:
start_end_list = tuple(tuple(int(x) for x in interval) for interval in start_end_list)
options = (
config.get("start_step", None),
config.get("end_step", None),
config.get("start_end_list", None),
start_step,
end_step,
start_end_list,
)
skip_reduction, reduction_group, reduce_within_microbatch = get_reduction_params(
......
......@@ -172,11 +172,19 @@ class StatsBuffers:
if self.at_least_one_layer_fed:
return True
iteration = TEDebugState.get_iteration()
for _, next_iter in self.layers_to_next_iter.items():
layers_to_remove = []
for layer_name, next_iter in self.layers_to_next_iter.items():
# When next_iter is None the feature will no longer run.
if next_iter is None:
layers_to_remove.append(layer_name)
continue
# Note that a layer may not run for many iterations;
# in this case we will synchronize every step until we get any information from it.
if iteration >= next_iter:
return True
for layer_name in layers_to_remove:
self.layers_to_next_iter.pop(layer_name, None)
return False
def reset(self):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment