# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import sys

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D


def generate_perf_report(result_file_name):
    """
    Generate a performance report.

    The input result_file_name should be the path of the "results.csv" generated by
    automlbenchmark. This function outputs 1) a formatted report named "performances.txt"
    in the "reports/" directory located in the same parent directory as "results.csv",
    and 2) a report named "rankings.txt" in the same directory ranking the tuners
    contained in "results.csv".
    """
    result = pd.read_csv(result_file_name)
    task_ids = result['id'].unique()
    tuners = result['framework'].unique()

    metric_types = ['rmse', 'auc', 'logloss']
    metric2taskres = {m: [] for m in metric_types}
    keep_parameters = ['framework', 'constraint', 'result', 'metric',
                       'params', 'utc', 'duration'] + list(result.columns[16:])

    # performance report: one table per task
    with open(result_file_name.replace('results.csv', 'reports/performances.txt'), 'w') as out_f:
        for task_id in task_ids:
            task_results = result[result['id'] == task_id]
            task_name = task_results.task.unique()[0]

            out_f.write("====================================================\n")
            out_f.write("Task ID: {}\n".format(task_id))
            out_f.write("Task Name: {}\n".format(task_name))

            folds = task_results['fold'].unique()
            for fold in folds:
                out_f.write("Fold {}:\n".format(fold))
                res = task_results[task_results['fold'] == fold][keep_parameters]
                out_f.write(res.to_string())
                out_f.write('\n')

                # save results for the next step
                res_list = []
                for _, row in res.iterrows():
                    res_list.append([row['framework'], row['result']])
                metric2taskres[res['metric'].unique()[0]].append(res_list)
            out_f.write('\n')

    # rankings report: per task and per tuner
    with open(result_file_name.replace('results.csv', 'reports/rankings.txt'), 'w') as out_f:
        # generate reports per task
        ranking_aggs = {}
        for metric_type in metric_types:
            sorted_lists = []
            if metric_type in ['auc']:
                for l in metric2taskres[metric_type]:
                    l_sorted = sorted(l, key=(lambda x: x[-1]), reverse=True)
                    l_sorted = [[x[0], x[1], i + 1] for (i, x) in enumerate(l_sorted)]
                    sorted_lists.append(l_sorted)
            elif metric_type in ['rmse', 'logloss']:
                for l in metric2taskres[metric_type]:
                    l_sorted = sorted(l, key=(lambda x: x[-1]))
                    l_sorted = [[x[0], x[1], i + 1] for (i, x) in enumerate(l_sorted)]
                    sorted_lists.append(l_sorted)
            metric2taskres[metric_type] = sorted_lists

            out_f.write("====================================================\n")
            out_f.write("Average rankings for metric {}:\n".format(metric_type))
            ranking_agg = [[t, 0] for t in tuners]
            for i, tuner in enumerate(tuners):
                for trial_res in metric2taskres[metric_type]:
                    for t, s, r in trial_res:
                        if t == tuner:
                            ranking_agg[i][-1] += r
            ranking_agg = [[x[0], x[1] / float(len(metric2taskres[metric_type]))] for x in ranking_agg]
            ranking_agg = sorted(ranking_agg, key=(lambda x: x[-1]))
            for t, r in ranking_agg:
                out_f.write('{:<12} {:.2f}\n'.format(t, r))
            ranking_aggs[metric_type] = ranking_agg
            out_f.write('\n')

        # generate reports per tuner
        out_f.write("====================================================\n")
        out_f.write("Average rankings for tuners:\n")
        header_string = '{:<12}'
        for _ in metric_types:
            header_string += ' {:<12}'
        header_string += '\n'
        out_f.write(header_string.format("Tuner", *metric_types))

        for tuner in tuners:
            tuner_ranks = []
            for m in metric_types:
                for t, r in ranking_aggs[m]:
                    if t == tuner:
                        tuner_ranks.append('{:.2f}'.format(r))
                        break
            out_f.write(header_string.format(tuner, *tuner_ranks))
        out_f.write('\n')


def generate_graphs(result_file_name):
    """
    Generate graphs describing performance statistics.

    The input result_file_name should be the path of the "results.csv" generated by
    automlbenchmark. For each task, this function outputs two graphs in the "reports/"
    directory located in the same parent directory as "results.csv". The graph named
    <task>_fold<x>_1.jpg summarizes the best score each tuner has obtained after n trials.
    The graph named <task>_fold<x>_2.jpg summarizes the score each tuner gets in each trial.
    """
    markers = list(Line2D.markers.keys())

    result = pd.read_csv(result_file_name)
    scorelog_dir = result_file_name.replace('results.csv', 'scorelogs/')
    output_dir = result_file_name.replace('results.csv', 'reports/')
    task_ids = result['id'].unique()

    for task_id in task_ids:
        task_results = result[result['id'] == task_id]
        task_name = task_results.task.unique()[0]
        folds = task_results['fold'].unique()

        for fold in folds:
            # load scorelog files
            trial_scores, best_scores = [], []
            tuners = list(task_results[task_results.fold == fold]['framework'].unique())
            for tuner in tuners:
                scorelog_name = '{}_{}_{}.csv'.format(tuner.lower(), task_name, fold)
                intermediate_scores = pd.read_csv(scorelog_dir + scorelog_name)
                bs = list(intermediate_scores['best_score'])
                ts = [(i + 1, x) for i, x in enumerate(list(intermediate_scores['trial_score'])) if x != 0]
                best_scores.append([tuner, bs])
                trial_scores.append([tuner, ts])

            # generate the best score graph
            plt.figure(figsize=(16, 8))
            for i, (tuner, score) in enumerate(best_scores):
                plt.plot(score, label=tuner, marker=markers[i])
            plt.title('{} Fold {}'.format(task_name, fold))
            plt.xlabel("Number of Trials")
            plt.ylabel("Best Score")
            plt.legend()
            plt.savefig(output_dir + '{}_fold{}_1.jpg'.format(task_name, fold))
            plt.close()

            # generate the trial score graph
            plt.figure(figsize=(16, 8))
            for i, (tuner, score) in enumerate(trial_scores):
                x = [l[0] for l in score]
                y = [l[1] for l in score]
                plt.plot(x, y, label=tuner)  # , marker=markers[i])
            plt.title('{} Fold {}'.format(task_name, fold))
            plt.xlabel("Trial Number")
            plt.ylabel("Trial Score")
            plt.legend()
            plt.savefig(output_dir + '{}_fold{}_2.jpg'.format(task_name, fold))
            plt.close()


def main():
    if len(sys.argv) != 2:
        print("Usage: python parse_result_csv.py <path to results.csv>")
        exit(0)
    generate_perf_report(sys.argv[1])
    generate_graphs(sys.argv[1])


if __name__ == '__main__':
    main()
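
# Example invocation (illustrative; "nni_results/" below is a placeholder for wherever
# automlbenchmark wrote its output, and the "reports/" and "scorelogs/" directories are
# assumed to already exist next to "results.csv", as the functions above expect):
#
#   python parse_result_csv.py nni_results/results.csv
#
# This writes nni_results/reports/performances.txt, nni_results/reports/rankings.txt,
# and a pair of <task>_fold<x>_{1,2}.jpg graphs per task and fold into nni_results/reports/.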