Unverified Commit bcf6ea37 authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Bug - Fix bugs in data diagnosis (#273)

**Description**
Fix bugs in data diagnosis.

**Major Revision**
- fix package import issue of file_handler
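- handle monitor metrics that have no benchmark name prefix (no '/' in the metric name): log a warning and skip them when grouping metrics by benchmark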
- fix typo in output_path
parent ce1481dd
...@@ -5,12 +5,13 @@ ...@@ -5,12 +5,13 @@
import re import re
from typing import Callable from typing import Callable
from pathlib import Path
import pandas as pd import pandas as pd
from superbench.common.utils import logger from superbench.common.utils import logger
from superbench.analyzer.diagnosis_rule_op import RuleOp, DiagnosisRuleType from superbench.analyzer.diagnosis_rule_op import RuleOp, DiagnosisRuleType
import superbench.analyzer.file_handler as file_handler from superbench.analyzer import file_handler
class DataDiagnosis(): class DataDiagnosis():
...@@ -31,10 +32,15 @@ class DataDiagnosis(): ...@@ -31,10 +32,15 @@ class DataDiagnosis():
""" """
benchmarks_metrics = {} benchmarks_metrics = {}
for metric in metrics_list: for metric in metrics_list:
benchmark = metric.split('/')[0] if '/' not in metric:
if benchmark not in benchmarks_metrics: logger.warning(
benchmarks_metrics[benchmark] = set() 'DataDiagnosis: get_metrics_by_benchmarks - {} does not have benchmark_name'.format(metric)
benchmarks_metrics[benchmark].add(metric) )
else:
benchmark = metric.split('/')[0]
if benchmark not in benchmarks_metrics:
benchmarks_metrics[benchmark] = set()
benchmarks_metrics[benchmark].add(metric)
return benchmarks_metrics return benchmarks_metrics
def _check_rules(self, rule, name): def _check_rules(self, rule, name):
...@@ -133,6 +139,7 @@ class DataDiagnosis(): ...@@ -133,6 +139,7 @@ class DataDiagnosis():
if re.search(metric_regex, metric): if re.search(metric_regex, metric):
self._sb_rules[rule]['metrics'][metric] = self._get_baseline_of_metric(baseline, metric) self._sb_rules[rule]['metrics'][metric] = self._get_baseline_of_metric(baseline, metric)
self._enable_metrics.append(metric) self._enable_metrics.append(metric)
self._enable_metrics.sort()
except Exception as e: except Exception as e:
logger.error('DataDiagnosis: get criteria failed - {}'.format(str(e))) logger.error('DataDiagnosis: get criteria failed - {}'.format(str(e)))
return False return False
...@@ -171,8 +178,8 @@ class DataDiagnosis(): ...@@ -171,8 +178,8 @@ class DataDiagnosis():
issue_label = True issue_label = True
if issue_label: if issue_label:
# Add category information # Add category information
general_cat_str = ','.join(categories) general_cat_str = ','.join(sorted(list(categories)))
details_cat_str = ','.join(details) details_cat_str = ','.join(sorted((details)))
details_row = [general_cat_str, details_cat_str] details_row = [general_cat_str, details_cat_str]
return details_row, summary_data_row return details_row, summary_data_row
...@@ -236,15 +243,15 @@ class DataDiagnosis(): ...@@ -236,15 +243,15 @@ class DataDiagnosis():
try: try:
self._raw_data_df = file_handler.read_raw_data(raw_data_file) self._raw_data_df = file_handler.read_raw_data(raw_data_file)
self._metrics = self._get_metrics_by_benchmarks(list(self._raw_data_df.columns)) self._metrics = self._get_metrics_by_benchmarks(list(self._raw_data_df.columns))
logger.info('DataDiagnosis: Begin to processe {} nodes'.format(len(self._raw_data_df))) logger.info('DataDiagnosis: Begin to process {} nodes'.format(len(self._raw_data_df)))
data_not_accept_df, label_df = self.run_diagnosis_rules(rule_file, baseline_file) data_not_accept_df, label_df = self.run_diagnosis_rules(rule_file, baseline_file)
logger.info('DataDiagnosis: Processed finished') logger.info('DataDiagnosis: Processed finished')
outpout_path = '' output_path = ''
if output_format == 'excel': if output_format == 'excel':
output_path = output_dir + '/diagnosis_summary.xlsx' output_path = str(Path(output_dir) / 'diagnosis_summary.xlsx')
file_handler.output_excel(self._raw_data_df, data_not_accept_df, outpout_path, self._sb_rules) file_handler.output_excel(self._raw_data_df, data_not_accept_df, output_path, self._sb_rules)
elif output_format == 'json': elif output_format == 'json':
output_path = output_dir + '/diagnosis_summary.jsonl' output_path = str(Path(output_dir) / 'diagnosis_summary.jsonl')
file_handler.output_json_data_not_accept(data_not_accept_df, output_path) file_handler.output_json_data_not_accept(data_not_accept_df, output_path)
else: else:
logger.error('DataDiagnosis: output failed - unsupported output format') logger.error('DataDiagnosis: output failed - unsupported output format')
......
...@@ -18,9 +18,10 @@ class TestDataDiagnosis(unittest.TestCase): ...@@ -18,9 +18,10 @@ class TestDataDiagnosis(unittest.TestCase):
"""Test for DataDiagnosis class.""" """Test for DataDiagnosis class."""
def setUp(self): def setUp(self):
"""Method called to prepare the test fixture.""" """Method called to prepare the test fixture."""
self.output_excel_file = str(Path(__file__).parent.resolve()) + '/diagnosis_summary.xlsx' self.parent_path = Path(__file__).parent
self.test_rule_file_fake = str(Path(__file__).parent.resolve()) + '/test_rules_fake.yaml' self.output_excel_file = str(self.parent_path / 'diagnosis_summary.xlsx')
self.output_json_file = str(Path(__file__).parent.resolve()) + '/diagnosis_summary.jsonl' self.test_rule_file_fake = str(self.parent_path / 'test_rules_fake.yaml')
self.output_json_file = str(self.parent_path / 'diagnosis_summary.jsonl')
def tearDown(self): def tearDown(self):
"""Method called after the test method has been called and the result recorded.""" """Method called after the test method has been called and the result recorded."""
...@@ -33,21 +34,31 @@ class TestDataDiagnosis(unittest.TestCase): ...@@ -33,21 +34,31 @@ class TestDataDiagnosis(unittest.TestCase):
"""Test for rule-based data diagnosis.""" """Test for rule-based data diagnosis."""
# Test - read_raw_data and get_metrics_from_raw_data # Test - read_raw_data and get_metrics_from_raw_data
# Positive case # Positive case
test_raw_data = str(Path(__file__).parent.resolve()) + '/test_results.jsonl' test_raw_data = str(self.parent_path / 'test_results.jsonl')
test_rule_file = str(Path(__file__).parent.resolve()) + '/test_rules.yaml' test_rule_file = str(self.parent_path / 'test_rules.yaml')
test_baseline_file = str(Path(__file__).parent.resolve()) + '/test_baseline.json' test_baseline_file = str(self.parent_path / 'test_baseline.json')
diag1 = DataDiagnosis() diag1 = DataDiagnosis()
diag1._raw_data_df = file_handler.read_raw_data(test_raw_data) diag1._raw_data_df = file_handler.read_raw_data(test_raw_data)
diag1._metrics = diag1._get_metrics_by_benchmarks(list(diag1._raw_data_df)) diag1._metrics = diag1._get_metrics_by_benchmarks(list(diag1._raw_data_df))
assert (len(diag1._raw_data_df) == 3) assert (len(diag1._raw_data_df) == 3)
# Negative case # Negative case
test_raw_data_fake = str(Path(__file__).parent.resolve()) + '/test_results_fake.jsonl' test_raw_data_fake = str(self.parent_path / 'test_results_fake.jsonl')
test_rule_file_fake = str(Path(__file__).parent.resolve()) + '/test_rules_fake.yaml' test_rule_file_fake = str(self.parent_path / 'test_rules_fake.yaml')
diag2 = DataDiagnosis() diag2 = DataDiagnosis()
diag2._raw_data_df = file_handler.read_raw_data(test_raw_data_fake) diag2._raw_data_df = file_handler.read_raw_data(test_raw_data_fake)
diag2._metrics = diag2._get_metrics_by_benchmarks(list(diag2._raw_data_df)) diag2._metrics = diag2._get_metrics_by_benchmarks(list(diag2._raw_data_df))
assert (len(diag2._raw_data_df) == 0) assert (len(diag2._raw_data_df) == 0)
assert (len(diag2._metrics) == 0) assert (len(diag2._metrics) == 0)
metric_list = [
'gpu_temperature', 'gpu_power_limit', 'gemm-flops/FP64',
'bert_models/pytorch-bert-base/steptime_train_float32'
]
self.assertDictEqual(
diag2._get_metrics_by_benchmarks(metric_list), {
'gemm-flops': {'gemm-flops/FP64'},
'bert_models': {'bert_models/pytorch-bert-base/steptime_train_float32'}
}
)
# Test - read rules # Test - read rules
rules = file_handler.read_rules(test_rule_file_fake) rules = file_handler.read_rules(test_rule_file_fake)
assert (not rules) assert (not rules)
...@@ -176,3 +187,27 @@ class TestDataDiagnosis(unittest.TestCase): ...@@ -176,3 +187,27 @@ class TestDataDiagnosis(unittest.TestCase):
assert ('Category' in line) assert ('Category' in line)
assert ('Defective Details' in line) assert ('Defective Details' in line)
assert ('Index' in line) assert ('Index' in line)
def test_data_diagnosis_run(self):
    """Test for the run process of rule-based data diagnosis.

    Runs ``DataDiagnosis().run`` once per supported output format ('excel' and
    'json'), writing into the test directory, and compares each generated
    summary file with the expected result stored under ``../data``.
    """
    test_raw_data = str(self.parent_path / 'test_results.jsonl')
    test_rule_file = str(self.parent_path / 'test_rules.yaml')
    test_baseline_file = str(self.parent_path / 'test_baseline.json')
    output_dir = str(self.parent_path)

    # Test - output in excel
    DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, output_dir, 'excel')
    data_sheet_name = 'Not Accept'
    # Open workbooks through context managers so the file handles are released
    # even when parsing or the comparison below fails.
    with pd.ExcelFile(self.output_excel_file, engine='openpyxl') as excel_file:
        data_not_accept_read_from_excel = excel_file.parse(data_sheet_name)
    expect_excel_path = str(self.parent_path / '../data/diagnosis_summary.xlsx')
    with pd.ExcelFile(expect_excel_path, engine='openpyxl') as expect_result_file:
        expect_result = expect_result_file.parse(data_sheet_name)
    # pd.testing is the public home of assert_frame_equal; pd.util.testing is
    # deprecated and no longer available in recent pandas releases.
    pd.testing.assert_frame_equal(data_not_accept_read_from_excel, expect_result)

    # Test - output in json
    DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, output_dir, 'json')
    self.assertTrue(Path(self.output_json_file).is_file())
    with Path(self.output_json_file).open() as f:
        data_not_accept_read_from_json = f.read()
    expect_json_path = self.parent_path / '../data/diagnosis_summary.jsonl'
    with Path(expect_json_path).open() as f:
        expect_result = f.read()
    # Compare the raw text so ordering and formatting of the JSON lines are checked too.
    self.assertEqual(data_not_accept_read_from_json, expect_result)
{"Category": "KernelLaunch", "Defective Details": "kernel-launch/event_overhead:0(B/L: 0.0060 VAL: 0.1000 VAR: 1577.85% Rule:lambda x:x>0.05)", "kernel-launch/event_overhead:0": 15.7785234899, "kernel-launch/event_overhead:1": -0.0016778523, "kernel-launch/event_overhead:2": -0.0654362416, "kernel-launch/event_overhead:3": -0.0771812081, "kernel-launch/event_overhead:4": -0.0067114094, "kernel-launch/event_overhead:5": -0.0117449664, "kernel-launch/event_overhead:6": -0.0402684564, "kernel-launch/event_overhead:7": -0.0100671141, "kernel-launch/return_code": 0.0, "kernel-launch/wall_overhead:0": 0.0, "kernel-launch/wall_overhead:1": 0.0, "kernel-launch/wall_overhead:2": 0.0194931774, "kernel-launch/wall_overhead:3": 0.022417154, "kernel-launch/wall_overhead:4": 0.0360623782, "kernel-launch/wall_overhead:5": -0.0194931774, "kernel-launch/wall_overhead:6": 0.0185185185, "kernel-launch/wall_overhead:7": 0.0438596491, "mem-bw/D2H_Mem_BW:0": 0.0, "mem-bw/D2H_Mem_BW:1": 0.012345679, "mem-bw/D2H_Mem_BW:2": 0.0082304527, "mem-bw/D2H_Mem_BW:3": 0.012345679, "mem-bw/D2H_Mem_BW:4": 0.0, "mem-bw/D2H_Mem_BW:5": 0.0, "mem-bw/D2H_Mem_BW:6": -0.0164609053, "mem-bw/D2H_Mem_BW:7": 0.012345679, "mem-bw/H2D_Mem_BW:0": 0.0, "mem-bw/H2D_Mem_BW:1": 0.0078125, "mem-bw/H2D_Mem_BW:2": 0.015625, "mem-bw/H2D_Mem_BW:3": 0.01953125, "mem-bw/H2D_Mem_BW:4": 0.0234375, "mem-bw/H2D_Mem_BW:5": 0.0078125, "mem-bw/H2D_Mem_BW:6": -0.01171875, "mem-bw/H2D_Mem_BW:7": 0.01953125, "mem-bw/return_code": 0.0, "Index": "sb-validation-01"}
{"Category": "FailedTest,Mem", "Defective Details": "mem-bw/D2H_Mem_BW:0_miss,mem-bw/D2H_Mem_BW:1_miss,mem-bw/D2H_Mem_BW:2_miss,mem-bw/D2H_Mem_BW:3_miss,mem-bw/D2H_Mem_BW:4_miss,mem-bw/D2H_Mem_BW:5_miss,mem-bw/D2H_Mem_BW:6_miss,mem-bw/D2H_Mem_BW:7_miss,mem-bw/H2D_Mem_BW:0_miss,mem-bw/H2D_Mem_BW:1_miss,mem-bw/H2D_Mem_BW:2_miss,mem-bw/H2D_Mem_BW:3_miss,mem-bw/H2D_Mem_BW:4_miss,mem-bw/H2D_Mem_BW:5_miss,mem-bw/H2D_Mem_BW:6_miss,mem-bw/H2D_Mem_BW:7_miss,mem-bw/return_code(VAL: 1.0000 Rule:lambda x:x>0)", "kernel-launch/event_overhead:0": 0.0, "kernel-launch/event_overhead:1": -0.0016778523, "kernel-launch/event_overhead:2": -0.0654362416, "kernel-launch/event_overhead:3": -0.0771812081, "kernel-launch/event_overhead:4": -0.0067114094, "kernel-launch/event_overhead:5": -0.0117449664, "kernel-launch/event_overhead:6": -0.0402684564, "kernel-launch/event_overhead:7": -0.0100671141, "kernel-launch/return_code": 0.0, "kernel-launch/wall_overhead:0": 0.0, "kernel-launch/wall_overhead:1": 0.0, "kernel-launch/wall_overhead:2": 0.0194931774, "kernel-launch/wall_overhead:3": 0.022417154, "kernel-launch/wall_overhead:4": 0.0360623782, "kernel-launch/wall_overhead:5": -0.0194931774, "kernel-launch/wall_overhead:6": 0.0185185185, "kernel-launch/wall_overhead:7": 0.0438596491, "mem-bw/D2H_Mem_BW:0": null, "mem-bw/D2H_Mem_BW:1": null, "mem-bw/D2H_Mem_BW:2": null, "mem-bw/D2H_Mem_BW:3": null, "mem-bw/D2H_Mem_BW:4": null, "mem-bw/D2H_Mem_BW:5": null, "mem-bw/D2H_Mem_BW:6": null, "mem-bw/D2H_Mem_BW:7": null, "mem-bw/H2D_Mem_BW:0": null, "mem-bw/H2D_Mem_BW:1": null, "mem-bw/H2D_Mem_BW:2": null, "mem-bw/H2D_Mem_BW:3": null, "mem-bw/H2D_Mem_BW:4": null, "mem-bw/H2D_Mem_BW:5": null, "mem-bw/H2D_Mem_BW:6": null, "mem-bw/H2D_Mem_BW:7": null, "mem-bw/return_code": 1.0, "Index": "sb-validation-03"}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment