Unverified commit 4c49db12, authored by Bill Wu, committed by GitHub

HPO Benchmark (#3644)

parent 4ccc9402
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
import sklearn
import time
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score
from amlb.benchmark import TaskConfig
from amlb.data import Dataset
from amlb.datautils import impute
from amlb.utils import Timer
from amlb.results import save_predictions_to_file
SEARCH_SPACE = {
"n_estimators": {"_type":"randint", "_value": [8, 512]},
"max_depth": {"_type":"choice", "_value": [4, 8, 16, 32, 64, 128, 256, 0]}, # 0 for None
"min_samples_leaf": {"_type":"randint", "_value": [1, 8]},
"min_samples_split": {"_type":"randint", "_value": [2, 16]},
"max_leaf_nodes": {"_type":"randint", "_value": [0, 4096]} # 0 for None
}
SEARCH_SPACE_CHOICE = {
"n_estimators": {"_type":"choice", "_value": [8, 16, 32, 64, 128, 256, 512]},
"max_depth": {"_type":"choice", "_value": [4, 8, 16, 32, 64, 128, 0]}, # 0 for None
"min_samples_leaf": {"_type":"choice", "_value": [1, 2, 4, 8]},
"min_samples_split": {"_type":"choice", "_value": [2, 4, 8, 16]},
"max_leaf_nodes": {"_type":"choice", "_value": [8, 32, 128, 512, 0]} # 0 for None
}
SEARCH_SPACE_SIMPLE = {
"n_estimators": {"_type":"choice", "_value": [10]},
"max_depth": {"_type":"choice", "_value": [5]},
"min_samples_leaf": {"_type":"choice", "_value": [8]},
"min_samples_split": {"_type":"choice", "_value": [16]},
"max_leaf_nodes": {"_type":"choice", "_value": [64]}
}
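# For illustration only: a configuration sampled from SEARCH_SPACE might look like
#     {"n_estimators": 137, "max_depth": 32, "min_samples_leaf": 3,
#      "min_samples_split": 9, "max_leaf_nodes": 0}
# The 0 sentinels for "max_depth" and "max_leaf_nodes" are dropped again in
# run_random_forest() below, so that scikit-learn falls back to its default (None).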
def preprocess_random_forest(dataset, log):
'''
For random forest:
- Do nothing for numerical features except null imputation.
- For categorical features, use ordinal encoding to map them into integers.
'''
cat_columns, num_columns = [], []
shift_amount = 0
for i, f in enumerate(dataset.features):
if f.is_target:
shift_amount += 1
continue
elif f.is_categorical():
cat_columns.append(i - shift_amount)
else:
num_columns.append(i - shift_amount)
cat_pipeline = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),
('ordinal_encoder', OrdinalEncoder()),
])
num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='mean')),
])
data_pipeline = ColumnTransformer([
('categorical', cat_pipeline, cat_columns),
('numerical', num_pipeline, num_columns),
])
data_pipeline.fit(np.concatenate([dataset.train.X, dataset.test.X], axis=0))
X_train = data_pipeline.transform(dataset.train.X)
X_test = data_pipeline.transform(dataset.test.X)
return X_train, X_test
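# Illustrative example (values are made up): a raw row such as
#     ['red', None, 3.5]        # one categorical column, two numerical columns
# comes out of the pipeline roughly as
#     [2.0, <column mean>, 3.5]
# i.e. the category is ordinal-encoded and the missing numerical value is mean-imputed,
# with categorical columns ordered before numerical ones by the ColumnTransformer.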
def run_random_forest(dataset, config, tuner, log):
"""
Using the given tuner, tune a random forest within the given time constraint.
This function uses cross validation score as the feedback score to the tuner.
The search space on which tuners search on is defined above empirically as a global variable.
"""
limit_type, trial_limit = config.framework_params['limit_type'], None
if limit_type == 'ntrials':
trial_limit = int(config.framework_params['trial_limit'])
X_train, X_test = preprocess_random_forest(dataset, log)
y_train, y_test = dataset.train.y, dataset.test.y
is_classification = config.type == 'classification'
estimator = RandomForestClassifier if is_classification else RandomForestRegressor
best_score, best_params, best_model = None, None, None
score_higher_better = True
tuner.update_search_space(SEARCH_SPACE)
start_time = time.time()
trial_count = 0
intermediate_scores = []
intermediate_best_scores = [] # should be monotonically increasing
while True:
try:
trial_count += 1
param_idx, cur_params = tuner.generate_parameters()
train_params = cur_params.copy()
if 'TRIAL_BUDGET' in cur_params:
train_params.pop('TRIAL_BUDGET')
if cur_params['max_leaf_nodes'] == 0:
train_params.pop('max_leaf_nodes')
if cur_params['max_depth'] == 0:
train_params.pop('max_depth')
log.info("Trial {}: \n{}\n".format(param_idx, cur_params))
cur_model = estimator(random_state=config.seed, **train_params)
            # cross_val_score uses the estimator's default scorer (its score() method);
            # the per-fold scores are averaged into a single feedback value
            cur_score = cross_val_score(cur_model, X_train, y_train)
            cur_score = sum(cur_score) / float(len(cur_score))
if np.isnan(cur_score):
cur_score = 0
log.info("Score: {}\n".format(cur_score))
if best_score is None or (score_higher_better and cur_score > best_score) or (not score_higher_better and cur_score < best_score):
best_score, best_params, best_model = cur_score, cur_params, cur_model
intermediate_scores.append(cur_score)
intermediate_best_scores.append(best_score)
tuner.receive_trial_result(param_idx, cur_params, cur_score)
if limit_type == 'time':
current_time = time.time()
elapsed_time = current_time - start_time
if elapsed_time >= config.max_runtime_seconds:
break
elif limit_type == 'ntrials':
if trial_count >= trial_limit:
break
        except Exception as e:
            # a trial may fail (e.g., an invalid parameter combination or an exhausted tuner);
            # stop tuning in that case
            log.info("Tuning stopped: {}".format(e))
            break
# This line is required to fully terminate some advisors
tuner.handle_terminate()
log.info("Tuning done, the best parameters are:\n{}\n".format(best_params))
# retrain on the whole dataset
with Timer() as training:
best_model.fit(X_train, y_train)
predictions = best_model.predict(X_test)
probabilities = best_model.predict_proba(X_test) if is_classification else None
return probabilities, predictions, training, y_test, intermediate_scores, intermediate_best_scores
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
from .tuners import NNITuner
from .run_experiment import *
from amlb.benchmark import TaskConfig
from amlb.data import Dataset
from amlb.results import save_predictions_to_file
from amlb.utils import Timer
log = logging.getLogger(__name__)
def validate_config(config: TaskConfig):
if 'tuner_type' not in config.framework_params:
raise RuntimeError('framework.yaml does not have a "tuner_type" field.')
if 'limit_type' not in config.framework_params:
raise RuntimeError('framework.yaml does not have a "limit_type" field.')
if config.framework_params['limit_type'] not in ['time', 'ntrials']:
raise RuntimeError('"limit_type" field must be "time" or "ntrials".')
if config.framework_params['limit_type'] == 'ntrials':
if 'trial_limit' not in config.framework_params:
            raise RuntimeError('framework.yaml does not have a "trial_limit" field.')
else:
try:
_ = int(config.framework_params['trial_limit'])
            except (TypeError, ValueError):
raise RuntimeError('"trial_limit" field must be an integer.')
def save_scores_to_file(intermediate_scores, intermediate_best_scores, out_file):
"""
Save statistics of every trial to a log file for generating reports.
"""
with open(out_file, 'w') as f:
f.write('ntrials,trial_score,best_score\n')
for i, (trial_score, best_score) in enumerate(zip(intermediate_scores, intermediate_best_scores)):
f.write('{},{},{}\n'.format(i+1, trial_score, best_score))
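# The resulting score log is a small CSV (values below are illustrative):
#     ntrials,trial_score,best_score
#     1,0.81,0.81
#     2,0.79,0.81
#     3,0.84,0.84
# generate_graphs() in parse_result_csv.py reads these files back to plot the
# per-trial and best-so-far curves.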
def run(dataset: Dataset, config: TaskConfig):
validate_config(config)
tuner = NNITuner(config)
if config.framework_params['limit_type'] == 'time':
log.info("Tuning {} with NNI {} with a maximum time of {}s\n"
.format(config.framework_params['arch_type'], tuner.description, config.max_runtime_seconds))
elif config.framework_params['limit_type'] == 'ntrials':
log.info("Tuning {} with NNI {} with a maximum number of trials of {}\n"
.format(config.framework_params['arch_type'], tuner.description, config.framework_params['trial_limit']))
log.info("Note: any time constraints are ignored.")
probabilities, predictions, train_timer, y_test, intermediate_scores, intermediate_best_scores = run_experiment(dataset, config, tuner, log)
save_predictions_to_file(dataset=dataset,
output_file=config.output_predictions_file,
probabilities=probabilities,
predictions=predictions,
truth=y_test)
    # same file name as the predictions file, under a 'scorelogs' directory three levels up
    scores_file = '/'.join(config.output_predictions_file.split('/')[:-3]) + '/scorelogs/' + config.output_predictions_file.split('/')[-1]
assert(len(intermediate_scores) == len(intermediate_best_scores))
save_scores_to_file(intermediate_scores, intermediate_best_scores, scores_file)
return dict(
models_count=1,
training_duration=train_timer.duration
)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from .architectures.run_random_forest import *
def run_experiment(dataset, config, tuner, log):
if 'arch_type' not in config.framework_params:
        raise RuntimeError('framework.yaml does not have an "arch_type" field.')
if config.framework_params['arch_type'] == 'random_forest':
return run_random_forest(dataset, config, tuner, log)
else:
raise RuntimeError('The requested arch type in framework.yaml is unavailable.')
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import os
import yaml
import importlib
import nni
from nni.runtime.config import get_config_file
from nni.utils import MetricType
from nni.tuner import Tuner
from nni.runtime.msg_dispatcher_base import MsgDispatcherBase
from amlb.benchmark import TaskConfig
def get_tuner_class_dict():
config_file = str(get_config_file('registered_algorithms.yml'))
if os.path.exists(config_file):
with open(config_file, 'r') as f:
config = yaml.load(f, Loader=yaml.SafeLoader)
else:
config = {}
ret = {}
for t in ['tuners', 'advisors']:
        for entry in config.get(t, []):
ret[entry['builtinName']] = entry['className']
return ret
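# For reference, registered_algorithms.yml lists builtin algorithms under 'tuners'
# and 'advisors'; a (hypothetical) entry looks roughly like
#     tuners:
#       - builtinName: SomeTuner
#         className: some.package.some_module.SomeTuner
# so the returned dict maps 'SomeTuner' -> 'some.package.some_module.SomeTuner'.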
def get_tuner(config: TaskConfig):
name2tuner = get_tuner_class_dict()
if config.framework_params['tuner_type'] not in name2tuner:
raise RuntimeError('The requested tuner type is unavailable.')
else:
module_name = name2tuner[config.framework_params['tuner_type']]
tuner_name = module_name.split('.')[-1]
module_name = '.'.join(module_name.split('.')[:-1])
tuner_type = getattr(importlib.import_module(module_name), tuner_name)
# special handlings for tuner initialization
tuner = None
if config.framework_params['tuner_type'] == 'TPE':
tuner = tuner_type('tpe')
elif config.framework_params['tuner_type'] == 'Random':
tuner = tuner_type('random_search')
elif config.framework_params['tuner_type'] == 'Anneal':
tuner = tuner_type('anneal')
elif config.framework_params['tuner_type'] == 'Hyperband':
if 'max_resource' in config.framework_params:
tuner = tuner_type(R=config.framework_params['max_resource'])
else:
tuner = tuner_type()
elif config.framework_params['tuner_type'] == 'BOHB':
if 'max_resource' in config.framework_params:
tuner = tuner_type(max_budget=config.framework_params['max_resource'])
else:
tuner = tuner_type(max_budget=60)
else:
tuner = tuner_type()
assert(tuner is not None)
return tuner, config.framework_params['tuner_type']
class NNITuner:
'''
A specialized wrapper for the automlbenchmark framework.
Abstracts the different behaviors of tuners and advisors into a tuner API.
'''
def __init__(self, config: TaskConfig):
self.config = config
self.core, self.description = get_tuner(config)
# 'tuner' or 'advisor'
self.core_type = None
if isinstance(self.core, Tuner):
self.core_type = 'tuner'
elif isinstance(self.core, MsgDispatcherBase):
self.core_type = 'advisor'
else:
raise RuntimeError('Unsupported tuner or advisor type')
# note: tuners and advisors use this variable differently
self.cur_param_id = 0
def __del__(self):
self.handle_terminate()
def update_search_space(self, search_space):
if self.core_type == 'tuner':
self.core.update_search_space(search_space)
elif self.core_type == 'advisor':
self.core.handle_update_search_space(search_space)
# special initializations for BOHB Advisor
from nni.algorithms.hpo.hyperband_advisor import Hyperband
if isinstance(self.core, Hyperband):
pass
else:
from nni.algorithms.hpo.bohb_advisor.bohb_advisor import BOHB
from nni.algorithms.hpo.bohb_advisor.config_generator import CG_BOHB
if isinstance(self.core, BOHB):
self.core.cg = CG_BOHB(configspace=self.core.search_space,
min_points_in_model=self.core.min_points_in_model,
top_n_percent=self.core.top_n_percent,
num_samples=self.core.num_samples,
random_fraction=self.core.random_fraction,
bandwidth_factor=self.core.bandwidth_factor,
min_bandwidth=self.core.min_bandwidth)
self.core.generate_new_bracket()
def generate_parameters(self):
self.cur_param_id += 1
if self.core_type == 'tuner':
self.cur_param = self.core.generate_parameters(self.cur_param_id-1)
return self.cur_param_id-1, self.cur_param
elif self.core_type == 'advisor':
self.cur_param = self.core._get_one_trial_job()
hyperparams = self.cur_param['parameters'].copy()
            # TRIAL_BUDGET stays in the returned parameters; the caller strips it before training
return self.cur_param['parameter_id'], hyperparams
def receive_trial_result(self, parameter_id, parameters, value):
if self.core_type == 'tuner':
return self.core.receive_trial_result(parameter_id, parameters, value)
elif self.core_type == 'advisor':
metric_report = {}
metric_report['parameter_id'] = parameter_id
metric_report['trial_job_id'] = self.cur_param_id
metric_report['type'] = MetricType.FINAL
metric_report['value'] = str(value)
metric_report['sequence'] = self.cur_param_id
return self.core.handle_report_metric_data(metric_report)
def handle_terminate(self):
if self.core_type == 'tuner':
pass
elif self.core_type == 'advisor':
self.core.stopping = True
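# Typical use of this wrapper, as a sketch (run_random_forest() follows this pattern):
#     tuner = NNITuner(config)
#     tuner.update_search_space(SEARCH_SPACE)
#     param_id, params = tuner.generate_parameters()
#     score = ...                                   # train a model with params and evaluate it
#     tuner.receive_trial_result(param_id, params, score)
#     tuner.handle_terminate()                      # required to fully stop some advisors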
---
NNI:
module: extensions.NNI
version: 'stable'
project: https://github.com/microsoft/nni
# tuner_type in ['TPE', 'Random', 'Anneal', 'Evolution', 'SMAC', 'GPTuner', 'MetisTuner', 'Hyperband', 'BOHB']
# arch_type in ['random_forest']
# limit_type in ['time', 'ntrials']
# trial_limit must be an integer
TPE:
extends: NNI
params:
tuner_type: 'TPE'
arch_type: 'random_forest'
limit_type: 'ntrials'
trial_limit: 10
Random:
extends: NNI
params:
tuner_type: 'Random'
arch_type: 'random_forest'
limit_type: 'ntrials'
trial_limit: 10
Anneal:
extends: NNI
params:
tuner_type: 'Anneal'
arch_type: 'random_forest'
limit_type: 'ntrials'
trial_limit: 10
Evolution:
extends: NNI
params:
tuner_type: 'Evolution'
arch_type: 'random_forest'
limit_type: 'ntrials'
trial_limit: 10
SMAC:
extends: NNI
params:
tuner_type: 'SMAC'
arch_type: 'random_forest'
limit_type: 'ntrials'
trial_limit: 10
GPTuner:
extends: NNI
params:
tuner_type: 'GPTuner'
arch_type: 'random_forest'
limit_type: 'ntrials'
trial_limit: 10
MetisTuner:
extends: NNI
params:
tuner_type: 'MetisTuner'
arch_type: 'random_forest'
limit_type: 'ntrials'
trial_limit: 10
Hyperband:
extends: NNI
params:
tuner_type: 'Hyperband'
arch_type: 'random_forest'
max_resource: 60
limit_type: 'ntrials'
trial_limit: 10
BOHB:
extends: NNI
params:
tuner_type: 'BOHB'
arch_type: 'random_forest'
max_resource: 60
limit_type: 'ntrials'
trial_limit: 10
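# Any of the frameworks above can be selected by name when launching automlbenchmark
# with this extension, e.g.:
#     python automlbenchmark/runbenchmark.py TPE nnivalid -u nni
# (see the benchmark launch script below for the full set of invocations).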
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import pandas as pd
import sys
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
def generate_perf_report(result_file_name):
"""
Generate a performance report.
The input result_file_name should be the path of the "results.csv" generated by automlbenchmark.
This function outputs 1) a formatted report named "performances.txt" in the "reports/" directory
located in the same parent directory as "results.csv" and 2) a report named "rankings.txt" in the
same directory ranking the tuners contained in "results.csv".
"""
result = pd.read_csv(result_file_name)
task_ids = result['id'].unique()
tuners = result['framework'].unique()
metric_types = ['rmse', 'auc', 'logloss']
metric2taskres = {}
for m in metric_types:
metric2taskres[m] = []
keep_parameters = ['framework', 'constraint', 'result', 'metric', 'params', 'utc', 'duration'] + list(result.columns[16:])
# performance report: one table per task
with open(result_file_name.replace('results.csv', 'reports/performances.txt'), 'w') as out_f:
for task_id in task_ids:
task_results = result[result['id'] == task_id]
task_name = task_results.task.unique()[0]
out_f.write("====================================================\n")
out_f.write("Task ID: {}\n".format(task_id))
out_f.write("Task Name: {}\n".format(task_name))
folds = task_results['fold'].unique()
for fold in folds:
out_f.write("Fold {}:\n".format(fold))
res = task_results[task_results['fold'] == fold][keep_parameters]
out_f.write(res.to_string())
out_f.write('\n')
# save results for the next step
res_list = []
for _, row in res.iterrows():
res_list.append([row['framework'], row['result']])
metric2taskres[res['metric'].unique()[0]].append(res_list)
out_f.write('\n')
# rankings report: per task and per tuner
with open(result_file_name.replace('results.csv', 'reports/rankings.txt'), 'w') as out_f:
# generate reports per task
ranking_aggs = {}
for metric_type in metric_types:
sorted_lists = []
if metric_type in ['auc']:
for l in metric2taskres[metric_type]:
l_sorted = sorted(l, key=(lambda x: x[-1]), reverse=True)
l_sorted = [[x[0], x[1], i+1] for (i, x) in enumerate(l_sorted)]
sorted_lists.append(l_sorted)
elif metric_type in ['rmse', 'logloss']:
for l in metric2taskres[metric_type]:
l_sorted = sorted(l, key=(lambda x: x[-1]))
l_sorted = [[x[0], x[1], i+1] for (i, x) in enumerate(l_sorted)]
sorted_lists.append(l_sorted)
metric2taskres[metric_type] = sorted_lists
out_f.write("====================================================\n")
out_f.write("Average rankings for metric {}:\n".format(metric_type))
ranking_agg = [[t, 0] for t in tuners]
for i, tuner in enumerate(tuners):
for trial_res in metric2taskres[metric_type]:
for t, s, r in trial_res:
if t == tuner:
ranking_agg[i][-1] += r
ranking_agg = [[x[0], x[1]/float(len(metric2taskres[metric_type]))] for x in ranking_agg]
ranking_agg = sorted(ranking_agg, key=(lambda x: x[-1]))
for t, r in ranking_agg:
out_f.write('{:<12} {:.2f}\n'.format(t, r))
ranking_aggs[metric_type] = ranking_agg
out_f.write('\n')
# generate reports per tuner
out_f.write("====================================================\n")
out_f.write("Average rankings for tuners:\n")
header_string = '{:<12}'
for _ in metric_types:
header_string += ' {:<12}'
header_string += '\n'
out_f.write(header_string.format("Tuner", *metric_types))
for tuner in tuners:
tuner_ranks = []
for m in metric_types:
for t, r in ranking_aggs[m]:
if t == tuner:
tuner_ranks.append('{:.2f}'.format(r))
break
out_f.write(header_string.format(tuner, *tuner_ranks))
out_f.write('\n')
def generate_graphs(result_file_name):
"""
Generate graphs describing performance statistics.
The input result_file_name should be the path of the "results.csv" generated by automlbenchmark.
For each task, this function outputs two graphs in the "reports/" directory located in the same
parent directory as "results.csv".
    The graph named <task>_fold<x>_1.jpg summarizes the best score each tuner has obtained after n trials.
    The graph named <task>_fold<x>_2.jpg shows the score each tuner obtains in each individual trial.
"""
markers = list(Line2D.markers.keys())
result = pd.read_csv(result_file_name)
scorelog_dir = result_file_name.replace('results.csv', 'scorelogs/')
output_dir = result_file_name.replace('results.csv', 'reports/')
task_ids = result['id'].unique()
for task_id in task_ids:
task_results = result[result['id'] == task_id]
task_name = task_results.task.unique()[0]
folds = task_results['fold'].unique()
for fold in folds:
# load scorelog files
trial_scores, best_scores = [], []
tuners = list(task_results[task_results.fold == fold]['framework'].unique())
for tuner in tuners:
scorelog_name = '{}_{}_{}.csv'.format(tuner.lower(), task_name, fold)
intermediate_scores = pd.read_csv(scorelog_dir + scorelog_name)
bs = list(intermediate_scores['best_score'])
ts = [(i+1, x) for i, x in enumerate(list(intermediate_scores['trial_score'])) if x != 0]
best_scores.append([tuner, bs])
trial_scores.append([tuner, ts])
# generate the best score graph
plt.figure(figsize=(16, 8))
for i, (tuner, score) in enumerate(best_scores):
plt.plot(score, label=tuner, marker=markers[i])
plt.title('{} Fold {}'.format(task_name, fold))
plt.xlabel("Number of Trials")
plt.ylabel("Best Score")
plt.legend()
plt.savefig(output_dir + '{}_fold{}_1.jpg'.format(task_name, fold))
plt.close()
# generate the trial score graph
plt.figure(figsize=(16, 8))
for i, (tuner, score) in enumerate(trial_scores):
x = [l[0] for l in score]
y = [l[1] for l in score]
plt.plot(x, y, label=tuner) #, marker=markers[i])
plt.title('{} Fold {}'.format(task_name, fold))
plt.xlabel("Trial Number")
plt.ylabel("Trial Score")
plt.legend()
plt.savefig(output_dir + '{}_fold{}_2.jpg'.format(task_name, fold))
plt.close()
def main():
if len(sys.argv) != 2:
print("Usage: python parse_result_csv.py <result.csv file>")
exit(0)
generate_perf_report(sys.argv[1])
generate_graphs(sys.argv[1])
if __name__ == '__main__':
main()
pandas>=1.2.0
pyyaml>=5.4.1
matplotlib>=3.4.1
#!/bin/bash
time=$(date "+%Y%m%d%H%M%S")
installation='automlbenchmark'
outdir="results_$time"
benchmark='nnivalid' # 'nnismall'
serialize=true # if false, run all experiments in parallel in the background
mkdir $outdir $outdir/scorelogs $outdir/reports
if [ "$#" -eq 0 ]; then
tuner_array=('TPE' 'Random' 'Anneal' 'Evolution' 'GPTuner' 'MetisTuner' 'Hyperband')
else
tuner_array=( "$@" )
fi
if [ "$serialize" = true ]; then
# run tuners serially
for tuner in ${tuner_array[*]}; do
echo "python $installation/runbenchmark.py $tuner $benchmark -o $outdir -u nni"
python $installation/runbenchmark.py $tuner $benchmark -o $outdir -u nni
done
# parse final results
echo "python parse_result_csv.py $outdir/results.csv"
python parse_result_csv.py "$outdir/results.csv"
else
# run all the tuners in background
for tuner in ${tuner_array[*]}; do
mkdir "$outdir/$tuner" "$outdir/$tuner/scorelogs"
echo "python $installation/runbenchmark.py $tuner $benchmark -o $outdir/$tuner -u nni &"
python $installation/runbenchmark.py $tuner $benchmark -o $outdir/$tuner -u nni &
done
wait
# aggregate results
touch "$outdir/results.csv"
let i=0
for tuner in ${tuner_array[*]}; do
cp "$outdir/$tuner/scorelogs"/* $outdir/scorelogs
if [ $i -eq 0 ]; then
cp "$outdir/$tuner/results.csv" "$outdir/results.csv"
else
let nlines=`cat "$outdir/$tuner/results.csv" | wc -l`
((nlines=nlines-1))
tail -n $nlines "$outdir/$tuner/results.csv" >> "$outdir/results.csv"
fi
((i=i+1))
done
# parse final results
echo "python parse_result_csv.py $outdir/results.csv"
python parse_result_csv.py "$outdir/results.csv"
fi
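# Usage examples (illustrative; assumes this script is saved as, e.g., run_benchmark.sh):
#     ./run_benchmark.sh                # tune with the default list of tuners
#     ./run_benchmark.sh TPE Random     # tune only with the tuners given as arguments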
#!/bin/bash
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# download automlbenchmark repository
if [ ! -d './automlbenchmark' ] ; then
git clone https://github.com/openml/automlbenchmark.git --branch stable --depth 1
fi
# install dependencies
pip3 install -r automlbenchmark/requirements.txt
pip3 install -r requirements.txt --ignore-installed