Analyzer: Add Feature - Add basic analysis features (#248)

**Description**
Add basic analysis features.

**Major Revision**
- Add statistics and correlations of the raw data.
- Add numeric outlier detection (interquartile range).
- Add a boxplot for the selected metrics.

Analyzer: Add Feature - Add basic analysis features (#248)
**Description**
Add basic analysis features.

**Major Revision**
- Add statistics and correlations of the raw data.
- Add numeric outlier detection (interquartile range).
- Add a boxplot for the selected metrics.
c2f942cb · Yuting Jiang · GitHub · 6e357fb9 · c2f942cb · c2f942cb
Unverified Commit c2f942cb authored Dec 10, 2021 by Yuting Jiang Committed by GitHub Dec 10, 2021
4 changed files
--- a/.azure-pipelines/cuda-unit-test.yml
+++ b/.azure-pipelines/cuda-unit-test.yml
@@ -30,7 +30,7 @@ steps:
  - script: |
      SB_MICRO_PATH=$PWD python3 setup.py test
    displayName: Run unit tests
-    timeoutInMinutes: 10
+    timeoutInMinutes: 15
  - script: |
      bash <(curl -s https://codecov.io/bash) -cF cuda-unit-test
    displayName: Report coverage results

--- a/setup.py
+++ b/setup.py
@@ -140,11 +140,13 @@ setup(
        'joblib>=1.0.1',
        'jsonlines>=2.0.0',
        'knack>=0.7.2',
+        'matplotlib>=3.0.0',
        'natsort>=7.1.1',
        'openpyxl>=3.0.7',
        'omegaconf==2.0.6',
        'pandas>=1.1.5',
        'pyyaml>=5.3',
+        'seaborn>=0.11.2',
        'tcping>=0.1.1rc1',
        'xlrd>=2.0.1',
        'xlsxwriter>=1.3.8',

--- a/superbench/analyzer/data_analysis.py
+++ b/superbench/analyzer/data_analysis.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""A module for data analysis."""
+
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+from superbench.common.utils import logger
+
+
+def statistic(raw_data_df):
+    """Get the statistics of the raw data.
+
+    The statistics include count, mean, std, min, max, 1%, 5%, 25%, 50%, 75%, 95%, 99%.
+
+    Args:
+        raw_data_df (DataFrame): raw data
+
+    Returns:
+        DataFrame: data statistics
+    """
+    data_statistics_df = pd.DataFrame()
+    if not isinstance(raw_data_df, pd.DataFrame):
+        logger.error('DataAnalyzer: the type of raw data is not pd.DataFrame')
+        return data_statistics_df
+    if len(raw_data_df) == 0:
+        logger.warning('DataAnalyzer: empty data.')
+        return data_statistics_df
+    try:
+        data_statistics_df = raw_data_df.describe()
+        data_statistics_df.loc['1%'] = raw_data_df.quantile(0.01)
+        data_statistics_df.loc['5%'] = raw_data_df.quantile(0.05)
+        data_statistics_df.loc['95%'] = raw_data_df.quantile(0.95)
+        data_statistics_df.loc['99%'] = raw_data_df.quantile(0.99)
+        statistics_error = []
+        for column in list(raw_data_df.columns):
+            if column not in list(data_statistics_df.columns) and not raw_data_df[column].isnull().all():
+                statistics_error.append(column)
+        if statistics_error:
+            logger.warning(
+                'DataAnalyzer: [{}] is missing in statistics results.'.format(
+                    ','.join(str(x) for x in statistics_error)
+                )
+            )
+    except Exception as e:
+        logger.error('DataAnalyzer: statistic failed, msg: {}'.format(str(e)))
+    return data_statistics_df
+
+
+def interquartile_range(raw_data_df):
+    """Get outlier detection bounds using IQR method.
+
+     The reference of IQR is https://en.wikipedia.org/wiki/Interquartile_range.
+     Get the mild and extreme outlier upper and lower value and bound.
+     values:
+        Mild Outlier: A point beyond inner whiskers on either side
+            lower whisker: Q1 - 1.5*IQR
+            upper whisker : Q3 + 1.5*IQR
+        Extreme Outlier: A point beyond outer whiskers on either side
+            lower whisker : Q1 - 3*IQR
+            upper whisker : Q3 + 3*IQR
+     bounds:
+        (values - mean) / mean
+
+    Args:
+        raw_data_df (DataFrame): raw data
+
+    Returns:
+        DataFrame: data statistics and IQR bound
+    """
+    if not isinstance(raw_data_df, pd.DataFrame):
+        logger.error('DataAnalyzer: the type of raw data is not pd.DataFrame')
+        return pd.DataFrame()
+    if len(raw_data_df) == 0:
+        logger.warning('DataAnalyzer: empty data.')
+        return pd.DataFrame()
+    try:
+        data_statistics_df = statistic(raw_data_df)
+        data_statistics_df.loc['mild_outlier_upper'] = data_statistics_df.loc[
+            '75%'] + 1.5 * (data_statistics_df.loc['75%'] - data_statistics_df.loc['25%'])
+        data_statistics_df.loc['extreme_outlier_upper'] = data_statistics_df.loc[
+            '75%'] + 3 * (data_statistics_df.loc['75%'] - data_statistics_df.loc['25%'])
+        data_statistics_df.loc['mild_outlier_lower'] = data_statistics_df.loc[
+            '25%'] - 1.5 * (data_statistics_df.loc['75%'] - data_statistics_df.loc['25%'])
+        data_statistics_df.loc['extreme_outlier_lower'] = data_statistics_df.loc[
+            '25%'] - 3 * (data_statistics_df.loc['75%'] - data_statistics_df.loc['25%'])
+        data_statistics_df.loc['mild_outlier_upper_bound'] = (
+            data_statistics_df.loc['mild_outlier_upper'] - data_statistics_df.loc['mean']
+        ) / data_statistics_df.loc['mean']
+        data_statistics_df.loc['extreme_outlier_upper_bound'] = (
+            data_statistics_df.loc['extreme_outlier_upper'] - data_statistics_df.loc['mean']
+        ) / data_statistics_df.loc['mean']
+        data_statistics_df.loc['mild_outlier_lower_bound'] = (
+            data_statistics_df.loc['mild_outlier_lower'] - data_statistics_df.loc['mean']
+        ) / data_statistics_df.loc['mean']
+        data_statistics_df.loc['extreme_outlier_lower_bound'] = (
+            data_statistics_df.loc['extreme_outlier_lower'] - data_statistics_df.loc['mean']
+        ) / data_statistics_df.loc['mean']
+    except Exception as e:
+        logger.error('DataAnalyzer: interquartile_range failed, msg: {}'.format(str(e)))
+    return data_statistics_df
+
+
+def correlation(raw_data_df):
+    """Get the correlations.
+
+    Args:
+        raw_data_df (DataFrame): raw data
+
+    Returns:
+        DataFrame: correlations
+    """
+    data_corr_df = pd.DataFrame()
+    if not isinstance(raw_data_df, pd.DataFrame):
+        logger.error('DataAnalyzer: the type of raw data is not pd.DataFrame')
+        return data_corr_df
+    if len(raw_data_df) == 0:
+        logger.warning('DataAnalyzer: empty data.')
+        return data_corr_df
+    try:
+        data_corr_df = raw_data_df.corr()
+        statistics_error = []
+        for column in list(raw_data_df.columns):
+            if column not in list(data_corr_df.columns) and not raw_data_df[column].isnull().all():
+                statistics_error.append(column)
+        if statistics_error:
+            logger.warning(
+                'DataAnalyzer: [{}] is missing in correlation results.'.format(
+                    ','.join(str(x) for x in statistics_error)
+                )
+            )
+    except Exception as e:
+        logger.error('DataAnalyzer: correlation failed, msg: {}'.format(str(e)))
+    return data_corr_df
+
+
+def creat_boxplot(raw_data_df, columns, output_dir):
+    """Plot the boxplot for selected columns.
+
+    Args:
+        raw_data_df (DataFrame): raw data
+        columns (list): selected metrics to plot the boxplot
+        output_dir (str): the directory of output file
+    """
+    if not isinstance(raw_data_df, pd.DataFrame):
+        logger.error('DataAnalyzer: the type of raw data is not pd.DataFrame')
+        return
+    if len(raw_data_df) == 0:
+        logger.error('DataAnalyzer: empty data for boxplot.')
+        return
+    if not isinstance(columns, list):
+        logger.error('DataAnalyzer: the type of columns should be list.')
+        return
+    try:
+        data_columns = raw_data_df.columns
+        for column in columns:
+            if column not in data_columns or raw_data_df[column].dtype is not np.dtype('float'):
+                logger.warning('DataAnalyzer: invalid column {} for boxplot.'.format(column))
+                columns.remove(column)
+        n = len(columns)
+        for i in range(n):
+            sns.set(style='whitegrid')
+            plt.subplot(n, 1, i + 1)
+            sns.boxplot(x=columns[i], data=raw_data_df, orient='h')
+        plt.subplots_adjust(hspace=1)
+        plt.savefig(output_dir + '/boxplot.png')
+        plt.show()
+    except Exception as e:
+        logger.error('DataAnalyzer: creat_boxplot failed, msg: {}'.format(str(e)))
+
+
+def generate_baseline(raw_data_df, output_dir):
+    """Export baseline to json file.
+
+    Args:
+        raw_data_df (DataFrame): raw data
+        output_dir (str): the directory of output file
+    """
+    try:
+        if not isinstance(raw_data_df, pd.DataFrame):
+            logger.error('DataAnalyzer: the type of raw data is not pd.DataFrame')
+            return
+        if len(raw_data_df) == 0:
+            logger.error('DataAnalyzer: empty data.')
+            return
+        mean_df = raw_data_df.mean()
+        mean_df.to_json(output_dir + '/baseline.json')
+    except Exception as e:
+        logger.error('DataAnalyzer: generate baseline failed, msg: {}'.format(str(e)))
--- a/tests/analylzer/test_data_analysis.py
+++ b/tests/analylzer/test_data_analysis.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for data analysis module."""
+
+from pathlib import Path
+import json
+import unittest
+
+import numpy as np
+import pandas as pd
+
+import superbench.analyzer.data_analysis as data_analysis
+
+
+class TestDataAnalysis(unittest.TestCase):
+    """Test for DataAnalysis class."""
+    def setUp(self):
+        """Method called to prepare the test fixture."""
+        self.output_dir = str(Path(__file__).parent.resolve())
+        self.fig = self.output_dir + '/boxplot.png'
+        self.baseline = self.output_dir + '/baseline.json'
+
+    def tearDown(self):
+        """Method called after the test method has been called and the result recorded."""
+        p = Path(self.fig)
+        if p.is_file():
+            p.unlink()
+        p = Path(self.baseline)
+        if p.is_file():
+            p.unlink()
+
+    def test_data_analysis(self):
+        """Test for data analysis."""
+        # Test - statistic
+        co1 = np.random.rand(100)
+        co2 = np.random.rand(100)
+        co3 = np.random.rand(100)
+        raw_data_df = pd.DataFrame({'a': co1, 'b': co2, 'c': co3})
+        data_statistics_df = data_analysis.statistic(raw_data_df)
+        assert (len(data_statistics_df) == 12)
+        assert (len(data_statistics_df.columns) == 3)
+        raw_data_df['d'] = ['a' for i in range(100)]
+        data_statistics_df = data_analysis.statistic(raw_data_df)
+        assert (len(data_statistics_df.columns) == 3)
+        # Test - inter_quartile_range
+        data_statistics_df = data_analysis.interquartile_range(raw_data_df)
+        assert (len(data_statistics_df) == 20)
+        assert (len(data_statistics_df.columns) == 3)
+        # Test - correlation
+        data_corr_df = data_analysis.correlation(raw_data_df)
+        assert (len(data_corr_df) == 3)
+        # Test - creat_boxplot
+        data_analysis.creat_boxplot(raw_data_df, list(raw_data_df.columns), self.output_dir)
+        fig = Path(self.fig)
+        assert (fig.is_file())
+        fig.unlink()
+        # Test - generate baseline
+        data_analysis.generate_baseline(raw_data_df, self.output_dir)
+        baseline_path = Path(self.baseline)
+        with baseline_path.open() as load_f:
+            baseline = json.load(load_f)
+        baseline_path.unlink()
+        assert (len(baseline) == 3)
+        # Test for invalid input
+        raw_data_dict = {}
+        assert (len(data_analysis.statistic(raw_data_dict)) == 0)
+        assert (len(data_analysis.interquartile_range(raw_data_dict)) == 0)
+        assert (len(data_analysis.correlation(raw_data_dict)) == 0)