# parse_result_csv.py

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.lines import Line2D


def _format_avg_rank(avg_rank):
    """Format an average rank with two decimals, or 'N/A' when there is none."""
    return 'N/A' if avg_rank is None else '{:.2f}'.format(avg_rank)


def generate_perf_report(result_file_name):
    """
    Generate a performance report and a ranking report.

    The input result_file_name should be the path of the "results.csv" generated by automlbenchmark.
    The file must contain the columns 'id', 'task', 'fold', 'framework', 'constraint', 'result',
    'metric', 'params', 'utc' and 'duration'; every column from index 16 onwards is also copied
    into the performance tables.

    This function writes two files into the "reports/" directory located next to the input file
    (the directory is created when it does not exist):
    1) "performances.txt": one table per task and fold listing the result of every tuner;
    2) "rankings.txt": the average rank of every tuner, per metric and per tuner.

    Ranks are computed per (task, fold) table. For 'auc' a higher score ranks first; for 'rmse'
    and 'logloss' a lower score ranks first. Folds whose metric is none of these three are still
    listed in "performances.txt" but are left out of the rankings. A metric without any fold is
    reported as 'N/A'.

    NOTE: the average rank divides a tuner's rank sum by the number of folds using the metric,
    so a tuner that is missing from a fold contributes nothing for that fold.
    """
    result = pd.read_csv(result_file_name)
    task_ids = result['id'].unique()
    tuners = result['framework'].unique()
    metric_types = ['rmse', 'auc', 'logloss']
    # Metrics for which a larger score is better; all others are ranked ascending.
    higher_is_better = {'auc'}
    metric2taskres = {m: [] for m in metric_types}
    keep_parameters = (['framework', 'constraint', 'result', 'metric', 'params', 'utc', 'duration']
                       + list(result.columns[16:]))
    separator = "====================================================\n"

    # Derive the output directory from the input's parent directory instead of
    # substituting the file name, so an input that is not literally named
    # "results.csv" can never be opened for writing and truncated.
    report_dir = os.path.join(os.path.dirname(result_file_name), 'reports')
    os.makedirs(report_dir, exist_ok=True)

    # performance report: one table per task
    with open(os.path.join(report_dir, 'performances.txt'), 'w') as out_f:
        for task_id in task_ids:
            task_results = result[result['id'] == task_id]
            task_name = task_results['task'].unique()[0]
            out_f.write(separator)
            out_f.write("Task ID: {}\n".format(task_id))
            out_f.write("Task Name: {}\n".format(task_name))
            for fold in task_results['fold'].unique():
                out_f.write("Fold {}:\n".format(fold))
                res = task_results[task_results['fold'] == fold][keep_parameters]
                out_f.write(res.to_string())
                out_f.write('\n')
                # save [tuner, score] pairs for the ranking step; the metric of
                # the first row is taken as the metric of the whole fold
                metric = res['metric'].unique()[0]
                if metric in metric2taskres:
                    metric2taskres[metric].append(
                        [[tuner, score] for tuner, score in zip(res['framework'], res['result'])])
            out_f.write('\n')

    # rankings report: per task and per tuner
    with open(os.path.join(report_dir, 'rankings.txt'), 'w') as out_f:
        # generate reports per metric
        avg_ranks = {}
        for metric_type in metric_types:
            descending = metric_type in higher_is_better
            rank_sums = {tuner: 0 for tuner in tuners}
            fold_tables = metric2taskres[metric_type]
            for fold_scores in fold_tables:
                ordered = sorted(fold_scores, key=(lambda x: x[-1]), reverse=descending)
                for rank, (tuner, _) in enumerate(ordered, start=1):
                    if tuner in rank_sums:
                        rank_sums[tuner] += rank

            num_folds = len(fold_tables)
            if num_folds:
                ranking_agg = sorted(
                    ([tuner, total / float(num_folds)] for tuner, total in rank_sums.items()),
                    key=(lambda x: x[-1]))
            else:
                # no task uses this metric: there is nothing to average
                ranking_agg = [[tuner, None] for tuner in rank_sums]

            out_f.write(separator)
            out_f.write("Average rankings for metric {}:\n".format(metric_type))
            for tuner, avg_rank in ranking_agg:
                out_f.write('{:<12} {}\n'.format(tuner, _format_avg_rank(avg_rank)))
            avg_ranks[metric_type] = {tuner: avg_rank for tuner, avg_rank in ranking_agg}
            out_f.write('\n')

        # generate reports per tuner
        out_f.write(separator)
        out_f.write("Average rankings for tuners:\n")
        row_format = ' '.join(['{:<12}'] * (len(metric_types) + 1)) + '\n'
        out_f.write(row_format.format("Tuner", *metric_types))
        for tuner in tuners:
            tuner_ranks = [_format_avg_rank(avg_ranks[m].get(tuner)) for m in metric_types]
            out_f.write(row_format.format(tuner, *tuner_ranks))
        out_f.write('\n')


def generate_graphs(result_file_name):
    """
    Generate graphs describing performance statistics.

    The input result_file_name should be the path of the "results.csv" generated by automlbenchmark.
    For every tuner of every task and fold, a score log named "<tuner in lower case>_<task>_<fold>.csv"
    with the columns 'best_score' and 'trial_score' is read from the "scorelogs/" directory located
    next to the input file; pandas raises FileNotFoundError if such a file is missing.

    For each task and fold, this function outputs two graphs in the "reports/" directory located in
    the same parent directory as the input file (the directory is created when it does not exist).
    The graph named <task>_fold<x>_1.jpg summarizes the best score each tuner gets after n trials.
    The graph named <task>_fold<x>_2.jpg summarizes the score each tuner gets in each trial; trials
    whose recorded score is 0 are left out of this graph.
    """
    markers = list(Line2D.markers.keys())
    result = pd.read_csv(result_file_name)
    # Resolve both directories relative to the input's parent directory rather
    # than by substituting the file name, so any input file name works.
    base_dir = os.path.dirname(result_file_name)
    scorelog_dir = os.path.join(base_dir, 'scorelogs')
    output_dir = os.path.join(base_dir, 'reports')
    os.makedirs(output_dir, exist_ok=True)

    for task_id in result['id'].unique():
        task_results = result[result['id'] == task_id]
        task_name = task_results['task'].unique()[0]

        for fold in task_results['fold'].unique():
            # load scorelog files
            trial_scores, best_scores = [], []
            fold_results = task_results[task_results['fold'] == fold]
            for tuner in fold_results['framework'].unique():
                scorelog_name = '{}_{}_{}.csv'.format(tuner.lower(), task_name, fold)
                intermediate_scores = pd.read_csv(os.path.join(scorelog_dir, scorelog_name))
                best_scores.append([tuner, list(intermediate_scores['best_score'])])
                # keep the 1-based trial number next to each score so that the
                # x axis stays correct after zero scores are dropped
                # (presumably 0 marks a trial without a usable score -- TODO confirm)
                kept_trials = [(trial_no, score)
                               for trial_no, score in enumerate(intermediate_scores['trial_score'], start=1)
                               if score != 0]
                trial_scores.append([tuner, kept_trials])

            title = '{} Fold {}'.format(task_name, fold)

            # generate the best score graph
            plt.figure(figsize=(16, 8))
            for i, (tuner, scores) in enumerate(best_scores):
                # cycle through the marker styles so that having more tuners
                # than available markers cannot raise an IndexError
                plt.plot(scores, label=tuner, marker=markers[i % len(markers)])
            plt.title(title)
            plt.xlabel("Number of Trials")
            plt.ylabel("Best Score")
            plt.legend()
            plt.savefig(os.path.join(output_dir, '{}_fold{}_1.jpg'.format(task_name, fold)))
            plt.close()

            # generate the trial score graph
            plt.figure(figsize=(16, 8))
            for tuner, points in trial_scores:
                trial_numbers = [p[0] for p in points]
                scores = [p[1] for p in points]
                plt.plot(trial_numbers, scores, label=tuner)
            plt.title(title)
            plt.xlabel("Trial Number")
            plt.ylabel("Trial Score")
            plt.legend()
            plt.savefig(os.path.join(output_dir, '{}_fold{}_2.jpg'.format(task_name, fold)))
            plt.close()

            
def main():
    """
    Command-line entry point.

    Expects exactly one argument, the path of the result csv file, and generates
    the performance/ranking reports followed by the graphs for it. With any other
    number of arguments, the usage message is printed to stderr and the process
    exits with status 1.
    """
    if len(sys.argv) != 2:
        # A wrong invocation is an error: report it on stderr with a non-zero
        # status, and use sys.exit rather than the site-provided exit() helper,
        # which is not guaranteed to be available.
        print("Usage: python parse_result_csv.py <result.csv file>", file=sys.stderr)
        sys.exit(1)
    generate_perf_report(sys.argv[1])
    generate_graphs(sys.argv[1])


if __name__ == '__main__':
    main()