# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

"""Summarize megatron-LM CI logs and plot loss/step-time history of recent runs."""

import os
import re
import glob
import datetime

from prettytable import PrettyTable
from matplotlib import pyplot as plt

NUM_MOST_RECENT_RUNS = 100

te_path = os.getenv("TE_PATH", "/opt/transformerengine")
mlm_log_dir = os.path.join(te_path, "ci_logs")
te_ci_log_dir = "/data/transformer_engine_ci_logs"
te_ci_plot_dir = os.path.join(te_ci_log_dir, "plots")

# Raw strings keep the regex escapes (\d, \+, \() from being treated as
# (invalid) Python string escapes; the pattern text itself is unchanged.
convergence_pattern = (
    r"validation loss at iteration \d* on validation set | lm loss"
    r" value: ([\d.]*)E\+(\d*) | lm loss PPL: ([\d.]*)E\+(\d*)"
)
perf_pattern = r"elapsed time per iteration \(ms\): ([\d.]*)"


def get_output_file():
    """Return the path of the raw-metrics file for the current run.

    The file lives in ``te_ci_log_dir`` and is named after the
    ``CI_PIPELINE_ID`` environment variable; when that variable is unset a
    name derived from the current local date and time is used instead.
    """
    now = datetime.datetime.now()
    default_fname = f"unknown_pipeline_id_{now.month}_{now.day}_{now.year}_{now.hour}_{now.minute}"
    fname = f"{os.getenv('CI_PIPELINE_ID', default_fname)}.txt"
    return os.path.join(te_ci_log_dir, fname)


def get_run_metrics(filename):
    """Return the loss, perplexity, and step time for a given megatron-LM logfile.

    Args:
        filename: Path of the log file to parse.

    Returns:
        A ``(loss, ppl, avg_step_time)`` tuple, each rounded to 2 decimals.
        ``avg_step_time`` is the mean of every "elapsed time per iteration"
        value found in the log.

    Raises:
        ValueError: If the log does not contain the expected validation
            loss/perplexity entries or any step-time entries.
    """
    with open(filename, "r") as f:
        data = f.read()

    # Loss and PPL. Each alternative of the pattern yields its own match, so
    # a validation report produces three consecutive matches: the header
    # (index 0, no groups filled), the loss (index 1, groups 0-1) and the
    # perplexity (index 2, groups 2-3).
    convergence_matches = re.findall(convergence_pattern, data)
    if len(convergence_matches) < 3:
        raise ValueError(f"No validation loss/perplexity found in {filename}")
    loss_mantissa, loss_exponent = convergence_matches[1][0], convergence_matches[1][1]
    ppl_mantissa, ppl_exponent = convergence_matches[2][2], convergence_matches[2][3]
    loss = round(float(loss_mantissa) * (10 ** int(loss_exponent)), 2)
    ppl = round(float(ppl_mantissa) * (10 ** int(ppl_exponent)), 2)

    # Step time.
    step_times = [float(x) for x in re.findall(perf_pattern, data)]
    if not step_times:
        raise ValueError(f"No step-time entries found in {filename}")
    avg_step_time = round(sum(step_times) / len(step_times), 2)

    return loss, ppl, avg_step_time


def print_run_logs():
    """Print one metrics table per model config and save the raw metrics.

    Every entry of ``mlm_log_dir`` is treated as a model-config directory and
    every file inside it as one experiment log. The raw metrics are written,
    one space-separated line per experiment, to the file returned by
    ``get_output_file()``; the tables are printed to stdout.
    """
    tables = []
    raw_logs = []

    # Sorted so that tables and the raw-metrics file are deterministic
    # (os.listdir returns entries in arbitrary order).
    for model_config in sorted(os.listdir(mlm_log_dir)):
        model_config_dir = os.path.join(mlm_log_dir, model_config)
        table = PrettyTable()
        table.title = model_config
        table.field_names = ["Config", "Loss", "Perplexity", "Avg time per step (ms)"]
        for exp in sorted(os.listdir(model_config_dir)):
            filename = os.path.join(model_config_dir, exp)
            loss, ppl, time_per_step = get_run_metrics(filename)
            # Drop the last four characters of the file name
            # (presumably a ".xxx" extension -- confirm against the CI job).
            exp_name = exp[:-4]
            table.add_row([exp_name, loss, ppl, time_per_step])
            raw_logs.append(f"{model_config} {exp_name} {loss} {ppl} {time_per_step}\n")
        tables.append(table)

    # The log directory may not exist yet on a fresh machine.
    os.makedirs(te_ci_log_dir, exist_ok=True)
    with open(get_output_file(), "w") as f:
        f.writelines(raw_logs)

    for table in tables:
        print(table)


def save_plot(title, legend, data, filename, ylabel):
    """Plot one line per series and save the figure under ``te_ci_plot_dir``.

    Args:
        title: Figure title.
        legend: Labels, one per series in ``data``.
        data: Sequence of series (each a sequence of numbers) to plot.
        filename: File name of the image, relative to ``te_ci_plot_dir``.
        ylabel: Label of the y axis.
    """
    os.makedirs(te_ci_plot_dir, exist_ok=True)
    fig = plt.figure()
    try:
        for label, y in zip(legend, data):
            # Each series gets its own x range (numbered from 1) so that
            # series of different lengths can share one figure.
            x = list(range(1, len(y) + 1))
            plt.plot(x, y, "-o", label=label)
        plt.title(title)
        plt.legend()
        plt.xlabel(f"Last {NUM_MOST_RECENT_RUNS} runs")
        plt.ylabel(ylabel)
        plt.savefig(os.path.join(te_ci_plot_dir, filename))
    finally:
        # Release the figure; otherwise every call leaves one open in pyplot.
        plt.close(fig)


def perf_and_loss_plots():
    """Plot loss and step time over the most recent runs, per model config.

    Reads the ``NUM_MOST_RECENT_RUNS`` most recent ``*.txt`` files (ordered
    by ``os.path.getctime``) in ``te_ci_log_dir`` and saves
    ``<model_config>_loss.png`` and ``<model_config>_perf.png`` via
    ``save_plot``.

    Raises:
        ValueError: If a non-blank line does not consist of exactly five
            space-separated fields, or its loss/time fields are not numbers.
    """
    files = glob.glob(os.path.join(te_ci_log_dir, "*.txt"))
    files.sort(key=os.path.getctime)
    files = files[-NUM_MOST_RECENT_RUNS:]

    # data[model_config][exp_name] -> {"loss": [...], "perf": [...]}
    data = {}
    for filename in files:
        with open(filename) as file:
            for line in file:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                model_config, exp_name, loss, _, time_per_step = line.split(" ")
                experiments = data.setdefault(model_config, {})
                series = experiments.setdefault(exp_name, {"loss": [], "perf": []})
                series["loss"].append(float(loss))
                series["perf"].append(float(time_per_step))

    for model_config, experiments in data.items():
        legend = list(experiments)
        lm_loss_data = [lm_data["loss"] for lm_data in experiments.values()]
        lm_perf_data = [lm_data["perf"] for lm_data in experiments.values()]
        save_plot(
            model_config + " loss",
            legend,
            lm_loss_data,
            model_config + "_loss.png",
            "LM-Loss",
        )
        save_plot(
            model_config + " perf",
            legend,
            lm_perf_data,
            model_config + "_perf.png",
            "Time per step (ms)",
        )


if __name__ == "__main__":
    print_run_logs()
    perf_and_loss_plots()