Unverified commit bcf6ea37, authored by Yuting Jiang, committed by GitHub

Bug - Fix bugs in data diagnosis (#273)

**Description**
Fix bugs in data diagnosis.

**Major Revision**
- fix the package import of file_handler
- handle monitor metrics that have no benchmark prefix in their names
- fix the outpout_path typo in run()
parent ce1481dd
@@ -5,12 +5,13 @@
 
 import re
 from typing import Callable
+from pathlib import Path
 
 import pandas as pd
 
 from superbench.common.utils import logger
 from superbench.analyzer.diagnosis_rule_op import RuleOp, DiagnosisRuleType
-import superbench.analyzer.file_handler as file_handler
+from superbench.analyzer import file_handler
 
 
 class DataDiagnosis():
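The commit message only says the file_handler import was broken; the fix switches the aliased submodule import to the `from package import module` form used elsewhere, which also resolves through `sys.modules` and is more robust when the package is only partially initialized. The call sites stay the same; a minimal usage sketch (the file paths here are illustrative, not part of the commit):

```python
from superbench.analyzer import file_handler

# read_raw_data returns a DataFrame; read_rules returns a falsy value on
# failure (both behaviors are exercised by the tests later in this diff).
raw_data_df = file_handler.read_raw_data('outputs/results-summary.jsonl')
rules = file_handler.read_rules('rules.yaml')
```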
@@ -31,10 +32,15 @@ def _get_metrics_by_benchmarks(self, metrics_list):
         """
         benchmarks_metrics = {}
         for metric in metrics_list:
-            benchmark = metric.split('/')[0]
-            if benchmark not in benchmarks_metrics:
-                benchmarks_metrics[benchmark] = set()
-            benchmarks_metrics[benchmark].add(metric)
+            if '/' not in metric:
+                logger.warning(
+                    'DataDiagnosis: get_metrics_by_benchmarks - {} does not have benchmark_name'.format(metric)
+                )
+            else:
+                benchmark = metric.split('/')[0]
+                if benchmark not in benchmarks_metrics:
+                    benchmarks_metrics[benchmark] = set()
+                benchmarks_metrics[benchmark].add(metric)
         return benchmarks_metrics
 
     def _check_rules(self, rule, name):
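The behavioral change is easiest to see on the metric names used in the new test below: monitor metrics such as `gpu_temperature` carry no `benchmark/` prefix, so the old `metric.split('/')[0]` grouped each one under a bucket named after the metric itself; the fix warns and skips them instead. A standalone sketch of the fixed grouping logic (not the class itself):

```python
metric_list = [
    'gpu_temperature',  # monitor metric: no 'benchmark/' prefix, now skipped
    'gemm-flops/FP64',
    'bert_models/pytorch-bert-base/steptime_train_float32',
]
benchmarks_metrics = {}
for metric in metric_list:
    if '/' not in metric:
        print('no benchmark_name for {}'.format(metric))  # the real code calls logger.warning
    else:
        benchmarks_metrics.setdefault(metric.split('/')[0], set()).add(metric)
print(benchmarks_metrics)
# {'gemm-flops': {'gemm-flops/FP64'},
#  'bert_models': {'bert_models/pytorch-bert-base/steptime_train_float32'}}
```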
@@ -133,6 +139,7 @@ def _get_criteria(self, rule_file, baseline_file):
                         if re.search(metric_regex, metric):
                             self._sb_rules[rule]['metrics'][metric] = self._get_baseline_of_metric(baseline, metric)
                             self._enable_metrics.append(metric)
+            self._enable_metrics.sort()
         except Exception as e:
             logger.error('DataDiagnosis: get criteria failed - {}'.format(str(e)))
             return False
@@ -171,8 +178,8 @@ def _run_diagnosis_rules_for_single_node(self, node):
                     issue_label = True
         if issue_label:
             # Add category information
-            general_cat_str = ','.join(categories)
-            details_cat_str = ','.join(details)
+            general_cat_str = ','.join(sorted(categories))
+            details_cat_str = ','.join(sorted(details))
             details_row = [general_cat_str, details_cat_str]
             return details_row, summary_data_row
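This change and the `_enable_metrics.sort()` above address the same reproducibility issue: `categories` is a set and the metric list is accumulated while scanning the rules, so the joined strings could differ from run to run. Sorting pins the output; for example, with the category set that appears in the expected JSONL at the end of this diff:

```python
# Deterministic joining of an unordered category set.
categories = {'Mem', 'FailedTest'}
general_cat_str = ','.join(sorted(categories))
print(general_cat_str)  # always 'FailedTest,Mem', regardless of set iteration order
```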
@@ -236,15 +243,15 @@ def run(self, raw_data_file, rule_file, baseline_file, output_dir, output_format
         try:
             self._raw_data_df = file_handler.read_raw_data(raw_data_file)
             self._metrics = self._get_metrics_by_benchmarks(list(self._raw_data_df.columns))
-            logger.info('DataDiagnosis: Begin to processe {} nodes'.format(len(self._raw_data_df)))
+            logger.info('DataDiagnosis: Begin to process {} nodes'.format(len(self._raw_data_df)))
             data_not_accept_df, label_df = self.run_diagnosis_rules(rule_file, baseline_file)
             logger.info('DataDiagnosis: Process finished')
-            outpout_path = ''
+            output_path = ''
             if output_format == 'excel':
-                output_path = output_dir + '/diagnosis_summary.xlsx'
-                file_handler.output_excel(self._raw_data_df, data_not_accept_df, outpout_path, self._sb_rules)
+                output_path = str(Path(output_dir) / 'diagnosis_summary.xlsx')
+                file_handler.output_excel(self._raw_data_df, data_not_accept_df, output_path, self._sb_rules)
             elif output_format == 'json':
-                output_path = output_dir + '/diagnosis_summary.jsonl'
+                output_path = str(Path(output_dir) / 'diagnosis_summary.jsonl')
                 file_handler.output_json_data_not_accept(data_not_accept_df, output_path)
             else:
                 logger.error('DataDiagnosis: output failed - unsupported output format')
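Besides unmasking the `outpout_path` typo (the Excel branch wrote to an empty path before), the switch to `pathlib` is more than style: `Path` joining normalizes a trailing separator in `output_dir`, which plain concatenation would not. A quick illustration (POSIX paths, illustrative directory name):

```python
from pathlib import Path

output_dir = 'outputs/'  # illustrative; note the trailing slash
print(output_dir + '/diagnosis_summary.xlsx')            # outputs//diagnosis_summary.xlsx
print(str(Path(output_dir) / 'diagnosis_summary.xlsx'))  # outputs/diagnosis_summary.xlsx
```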
@@ -18,9 +18,10 @@ class TestDataDiagnosis(unittest.TestCase):
     """Test for DataDiagnosis class."""
     def setUp(self):
         """Method called to prepare the test fixture."""
-        self.output_excel_file = str(Path(__file__).parent.resolve()) + '/diagnosis_summary.xlsx'
-        self.test_rule_file_fake = str(Path(__file__).parent.resolve()) + '/test_rules_fake.yaml'
-        self.output_json_file = str(Path(__file__).parent.resolve()) + '/diagnosis_summary.jsonl'
+        self.parent_path = Path(__file__).parent
+        self.output_excel_file = str(self.parent_path / 'diagnosis_summary.xlsx')
+        self.test_rule_file_fake = str(self.parent_path / 'test_rules_fake.yaml')
+        self.output_json_file = str(self.parent_path / 'diagnosis_summary.jsonl')
 
     def tearDown(self):
         """Method called after the test method has been called and the result recorded."""
@@ -33,21 +34,31 @@ def test_data_diagnosis(self):
         """Test for rule-based data diagnosis."""
         # Test - read_raw_data and get_metrics_from_raw_data
         # Positive case
-        test_raw_data = str(Path(__file__).parent.resolve()) + '/test_results.jsonl'
-        test_rule_file = str(Path(__file__).parent.resolve()) + '/test_rules.yaml'
-        test_baseline_file = str(Path(__file__).parent.resolve()) + '/test_baseline.json'
+        test_raw_data = str(self.parent_path / 'test_results.jsonl')
+        test_rule_file = str(self.parent_path / 'test_rules.yaml')
+        test_baseline_file = str(self.parent_path / 'test_baseline.json')
         diag1 = DataDiagnosis()
         diag1._raw_data_df = file_handler.read_raw_data(test_raw_data)
         diag1._metrics = diag1._get_metrics_by_benchmarks(list(diag1._raw_data_df))
         assert (len(diag1._raw_data_df) == 3)
         # Negative case
-        test_raw_data_fake = str(Path(__file__).parent.resolve()) + '/test_results_fake.jsonl'
-        test_rule_file_fake = str(Path(__file__).parent.resolve()) + '/test_rules_fake.yaml'
+        test_raw_data_fake = str(self.parent_path / 'test_results_fake.jsonl')
+        test_rule_file_fake = str(self.parent_path / 'test_rules_fake.yaml')
         diag2 = DataDiagnosis()
         diag2._raw_data_df = file_handler.read_raw_data(test_raw_data_fake)
         diag2._metrics = diag2._get_metrics_by_benchmarks(list(diag2._raw_data_df))
         assert (len(diag2._raw_data_df) == 0)
         assert (len(diag2._metrics) == 0)
+        metric_list = [
+            'gpu_temperature', 'gpu_power_limit', 'gemm-flops/FP64',
+            'bert_models/pytorch-bert-base/steptime_train_float32'
+        ]
+        self.assertDictEqual(
+            diag2._get_metrics_by_benchmarks(metric_list), {
+                'gemm-flops': {'gemm-flops/FP64'},
+                'bert_models': {'bert_models/pytorch-bert-base/steptime_train_float32'}
+            }
+        )
         # Test - read rules
         rules = file_handler.read_rules(test_rule_file_fake)
         assert (not rules)
@@ -176,3 +187,27 @@ def test_data_diagnosis(self):
             assert ('Category' in line)
             assert ('Defective Details' in line)
             assert ('Index' in line)
+
+    def test_data_diagnosis_run(self):
+        """Test for the run process of rule-based data diagnosis."""
+        test_raw_data = str(self.parent_path / 'test_results.jsonl')
+        test_rule_file = str(self.parent_path / 'test_rules.yaml')
+        test_baseline_file = str(self.parent_path / 'test_baseline.json')
+        # Test - output in excel
+        DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'excel')
+        excel_file = pd.ExcelFile(self.output_excel_file, engine='openpyxl')
+        data_sheet_name = 'Not Accept'
+        data_not_accept_read_from_excel = excel_file.parse(data_sheet_name)
+        expect_result_file = pd.ExcelFile(str(self.parent_path / '../data/diagnosis_summary.xlsx'), engine='openpyxl')
+        expect_result = expect_result_file.parse(data_sheet_name)
+        pd.util.testing.assert_frame_equal(data_not_accept_read_from_excel, expect_result)
+        # Test - output in json
+        DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'json')
+        assert (Path(self.output_json_file).is_file())
+        with Path(self.output_json_file).open() as f:
+            data_not_accept_read_from_json = f.read()
+        expect_result_file = self.parent_path / '../data/diagnosis_summary.jsonl'
+        with Path(expect_result_file).open() as f:
+            expect_result = f.read()
+        assert (data_not_accept_read_from_json == expect_result)
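One version caveat on the new test, not part of the commit itself: `pd.util.testing` has been deprecated since pandas 1.0 and was removed in 2.0, so on newer pandas the same assertion comes from the public testing module:

```python
from pandas.testing import assert_frame_equal

assert_frame_equal(data_not_accept_read_from_excel, expect_result)
```

The two JSONL records below are the expected diagnosis-summary contents added under the test data directory (the file the test reads as `../data/diagnosis_summary.jsonl`).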
{"Category": "KernelLaunch", "Defective Details": "kernel-launch/event_overhead:0(B/L: 0.0060 VAL: 0.1000 VAR: 1577.85% Rule:lambda x:x>0.05)", "kernel-launch/event_overhead:0": 15.7785234899, "kernel-launch/event_overhead:1": -0.0016778523, "kernel-launch/event_overhead:2": -0.0654362416, "kernel-launch/event_overhead:3": -0.0771812081, "kernel-launch/event_overhead:4": -0.0067114094, "kernel-launch/event_overhead:5": -0.0117449664, "kernel-launch/event_overhead:6": -0.0402684564, "kernel-launch/event_overhead:7": -0.0100671141, "kernel-launch/return_code": 0.0, "kernel-launch/wall_overhead:0": 0.0, "kernel-launch/wall_overhead:1": 0.0, "kernel-launch/wall_overhead:2": 0.0194931774, "kernel-launch/wall_overhead:3": 0.022417154, "kernel-launch/wall_overhead:4": 0.0360623782, "kernel-launch/wall_overhead:5": -0.0194931774, "kernel-launch/wall_overhead:6": 0.0185185185, "kernel-launch/wall_overhead:7": 0.0438596491, "mem-bw/D2H_Mem_BW:0": 0.0, "mem-bw/D2H_Mem_BW:1": 0.012345679, "mem-bw/D2H_Mem_BW:2": 0.0082304527, "mem-bw/D2H_Mem_BW:3": 0.012345679, "mem-bw/D2H_Mem_BW:4": 0.0, "mem-bw/D2H_Mem_BW:5": 0.0, "mem-bw/D2H_Mem_BW:6": -0.0164609053, "mem-bw/D2H_Mem_BW:7": 0.012345679, "mem-bw/H2D_Mem_BW:0": 0.0, "mem-bw/H2D_Mem_BW:1": 0.0078125, "mem-bw/H2D_Mem_BW:2": 0.015625, "mem-bw/H2D_Mem_BW:3": 0.01953125, "mem-bw/H2D_Mem_BW:4": 0.0234375, "mem-bw/H2D_Mem_BW:5": 0.0078125, "mem-bw/H2D_Mem_BW:6": -0.01171875, "mem-bw/H2D_Mem_BW:7": 0.01953125, "mem-bw/return_code": 0.0, "Index": "sb-validation-01"}
{"Category": "FailedTest,Mem", "Defective Details": "mem-bw/D2H_Mem_BW:0_miss,mem-bw/D2H_Mem_BW:1_miss,mem-bw/D2H_Mem_BW:2_miss,mem-bw/D2H_Mem_BW:3_miss,mem-bw/D2H_Mem_BW:4_miss,mem-bw/D2H_Mem_BW:5_miss,mem-bw/D2H_Mem_BW:6_miss,mem-bw/D2H_Mem_BW:7_miss,mem-bw/H2D_Mem_BW:0_miss,mem-bw/H2D_Mem_BW:1_miss,mem-bw/H2D_Mem_BW:2_miss,mem-bw/H2D_Mem_BW:3_miss,mem-bw/H2D_Mem_BW:4_miss,mem-bw/H2D_Mem_BW:5_miss,mem-bw/H2D_Mem_BW:6_miss,mem-bw/H2D_Mem_BW:7_miss,mem-bw/return_code(VAL: 1.0000 Rule:lambda x:x>0)", "kernel-launch/event_overhead:0": 0.0, "kernel-launch/event_overhead:1": -0.0016778523, "kernel-launch/event_overhead:2": -0.0654362416, "kernel-launch/event_overhead:3": -0.0771812081, "kernel-launch/event_overhead:4": -0.0067114094, "kernel-launch/event_overhead:5": -0.0117449664, "kernel-launch/event_overhead:6": -0.0402684564, "kernel-launch/event_overhead:7": -0.0100671141, "kernel-launch/return_code": 0.0, "kernel-launch/wall_overhead:0": 0.0, "kernel-launch/wall_overhead:1": 0.0, "kernel-launch/wall_overhead:2": 0.0194931774, "kernel-launch/wall_overhead:3": 0.022417154, "kernel-launch/wall_overhead:4": 0.0360623782, "kernel-launch/wall_overhead:5": -0.0194931774, "kernel-launch/wall_overhead:6": 0.0185185185, "kernel-launch/wall_overhead:7": 0.0438596491, "mem-bw/D2H_Mem_BW:0": null, "mem-bw/D2H_Mem_BW:1": null, "mem-bw/D2H_Mem_BW:2": null, "mem-bw/D2H_Mem_BW:3": null, "mem-bw/D2H_Mem_BW:4": null, "mem-bw/D2H_Mem_BW:5": null, "mem-bw/D2H_Mem_BW:6": null, "mem-bw/D2H_Mem_BW:7": null, "mem-bw/H2D_Mem_BW:0": null, "mem-bw/H2D_Mem_BW:1": null, "mem-bw/H2D_Mem_BW:2": null, "mem-bw/H2D_Mem_BW:3": null, "mem-bw/H2D_Mem_BW:4": null, "mem-bw/H2D_Mem_BW:5": null, "mem-bw/H2D_Mem_BW:6": null, "mem-bw/H2D_Mem_BW:7": null, "mem-bw/return_code": 1.0, "Index": "sb-validation-03"}