Unverified Commit 10a79c4e authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Analyzer - Add support for both jsonl and json format in data diagnosis (#388)

**Description**
Add support for both jsonl and json format in data diagnosis.

**Major Revision**
- Add support for both jsonl and json format in data diagnosis


**Minor Revision**
- change related doc
- add jsonl support in cli
parent 626ac0a4
......@@ -193,7 +193,7 @@ sb result diagnosis --baseline-file
| `--decimal-place-value` | 2 | Number of valid decimal places to show in output. Default: 2. |
| `--output-all` | N/A | Output diagnosis results for all nodes. |
| `--output-dir` | `None` | Path to output directory, outputs/{datetime} will be used if not specified. |
| `--output-file-format` | `excel` | Format of output file, 'excel', 'json', 'md' or 'html'. Default: excel. |
| `--output-file-format` | `excel` | Format of output file, 'excel', 'json', 'jsonl', 'md' or 'html'. Default: excel. |
#### Global arguments
......@@ -208,11 +208,16 @@ Run data diagnosis and output the results in excel format:
sb result diagnosis --data-file outputs/results-summary.jsonl --rule-file rule.yaml --baseline-file baseline.json --output-file-format excel
```
Run data diagnosis and output the results in jsonl format:
Run data diagnosis and output the results in json format:
```bash title="SB CLI"
sb result diagnosis --data-file outputs/results-summary.jsonl --rule-file rule.yaml --baseline-file baseline.json --output-file-format json
```
Run data diagnosis and output the results in jsonl format:
```bash title="SB CLI"
sb result diagnosis --data-file outputs/results-summary.jsonl --rule-file rule.yaml --baseline-file baseline.json --output-file-format jsonl
```
Run data diagnosis and output the results in markdown format with 2 valid decimal places:
```bash title="SB CLI"
sb result diagnosis --data-file outputs/results-summary.jsonl --rule-file rule.yaml --baseline-file baseline.json --output-file-format md --decimal-place-value 2
......
......@@ -171,12 +171,18 @@ The function used for this rule.
## Output
We support different output formats for filtering the defective machines including jsonl, excel, etc. The output includes all defective machines' information including index, failure category, failure details, and detailed metrics.
We support different output formats for filtering the defective machines including json, jsonl, excel, md and html.
The output includes all defective machines' information including index, failure category, failure details, and detailed metrics by default.
- index: the name of defective machines.
- Category: categories defined in the rule.
- Category (diagnosis/category in json format): categories defined in the rule.
- Defective Details: all violated metrics including metric data and related rule.
- Defective Details (diagnosis/issue_details in json format): all violated metrics including metric data and related rule.
- ${metric}: the data of the metrics defined in the rule file. If the rule is `variance`, the form of the data is variance in percentage; if the rule is `value`, the form of the data is raw data.
If you specify '--output-all' in the command, the output includes all machines' information and an extra field to indicate if the machine is defective.
- Accept (diagnosis/accept in json format): False if the machine is defective, otherwise True.
......@@ -232,23 +232,23 @@ def output_all_nodes_results(self, raw_data_df, data_not_accept_df):
data_not_accept_df (DataFrame): defective nodes's detailed information
Returns:
DataFrame: all nodes' detailed information inluding ['Accept','Number_Of_Issues','Category','Issue_Details']
DataFrame: all nodes' detailed information inluding ['Accept','Number Of Issues',
'Category','Defective Details']
"""
append_columns = ['Accept', 'Number_Of_Issues', 'Category', 'Issue_Details']
append_columns = ['Accept', 'Number Of Issues', 'Category', 'Defective Details']
all_data_df = (raw_data_df).astype('float64')
if data_not_accept_df.shape[0] == 0:
all_data_df['Accept'] = [True for i in range(len(all_data_df))]
all_data_df['Number_Of_Issues'] = [0 for i in range(len(all_data_df))]
all_data_df['Number Of Issues'] = [0 for i in range(len(all_data_df))]
all_data_df['Category'] = [None for i in range(len(all_data_df))]
all_data_df['Issue_Details'] = [None for i in range(len(all_data_df))]
all_data_df['Defective Details'] = [None for i in range(len(all_data_df))]
elif data_not_accept_df.shape[0] > 0:
data_not_accept_df['Accept'] = [False for i in range(len(data_not_accept_df))]
data_not_accept_df['Number_Of_Issues'] = data_not_accept_df['Defective Details'].map(
data_not_accept_df['Number Of Issues'] = data_not_accept_df['Defective Details'].map(
lambda x: len(x.split(','))
)
data_not_accept_df = data_not_accept_df.rename(columns={'Defective Details': 'Issue_Details'})
for index in range(len(append_columns)):
if append_columns[index] not in data_not_accept_df:
logger.warning(
......@@ -262,8 +262,8 @@ def output_all_nodes_results(self, raw_data_df, data_not_accept_df):
data_not_accept_df[[append_columns[index]]], left_index=True, right_index=True, how='left'
)
all_data_df['Accept'] = all_data_df['Accept'].replace(np.nan, True)
all_data_df['Number_Of_Issues'] = all_data_df['Number_Of_Issues'].replace(np.nan, 0)
all_data_df['Number_Of_Issues'] = all_data_df['Number_Of_Issues'].astype(int)
all_data_df['Number Of Issues'] = all_data_df['Number Of Issues'].replace(np.nan, 0)
all_data_df['Number Of Issues'] = all_data_df['Number Of Issues'].astype(int)
all_data_df = all_data_df.replace(np.nan, '')
......@@ -326,9 +326,9 @@ def output_diagnosis_in_json(self, data_not_accept_df, output_path):
data_not_accept_df['Index'] = data_not_accept_df.index
data_not_accept_df = data_not_accept_df.rename(
columns={
'Issue_Details': 'diagnosis/issue_details',
'Defective Details': 'diagnosis/issue_details',
'Category': 'diagnosis/category',
'Number_Of_Issues': 'diagnosis/issue_num',
'Number Of Issues': 'diagnosis/issue_num',
'Accept': 'diagnosis/accept'
}
)
......@@ -394,31 +394,25 @@ def run(
# read baseline
baseline = file_handler.read_baseline(baseline_file)
logger.info('DataDiagnosis: Begin to process {} nodes'.format(len(self._raw_data_df)))
data_not_accept_df, label_df = self.run_diagnosis_rules(rules, baseline)
output_df, label_df = self.run_diagnosis_rules(rules, baseline)
logger.info('DataDiagnosis: Processed finished')
output_path = ''
output_path = str(Path(output_dir) / f'diagnosis_summary.{output_format}')
# generate all nodes' info
if output_all:
output_path = str(Path(output_dir) / 'diagnosis_summary.json')
data_not_accept_df = self.output_all_nodes_results(self._raw_data_df, data_not_accept_df)
output_df = self.output_all_nodes_results(self._raw_data_df, output_df)
# output according format
if output_format == 'excel':
output_path = str(Path(output_dir) / 'diagnosis_summary.xlsx')
self.output_diagnosis_in_excel(self._raw_data_df, data_not_accept_df, output_path, self._sb_rules)
self.output_diagnosis_in_excel(self._raw_data_df, output_df, output_path, self._sb_rules)
elif output_format == 'json':
if output_all:
output_path = str(Path(output_dir) / 'diagnosis_summary.json')
self.output_diagnosis_in_json(data_not_accept_df, output_path)
else:
output_path = str(Path(output_dir) / 'diagnosis_summary.jsonl')
self.output_diagnosis_in_jsonl(data_not_accept_df, output_path)
self.output_diagnosis_in_json(output_df, output_path)
elif output_format == 'jsonl':
self.output_diagnosis_in_jsonl(output_df, output_path)
elif output_format == 'md' or output_format == 'html':
lines = self.generate_md_lines(data_not_accept_df, self._sb_rules, round)
lines = self.generate_md_lines(output_df, self._sb_rules, round)
if output_format == 'md':
output_path = str(Path(output_dir) / 'diagnosis_summary.md')
file_handler.output_lines_in_md(lines, output_path)
else:
output_path = str(Path(output_dir) / 'diagnosis_summary.html')
file_handler.output_lines_in_html(lines, output_path)
else:
logger.error('DataDiagnosis: output failed - unsupported output format')
......
......@@ -120,6 +120,13 @@
--baseline-file baseline.json
--output-file-format excel
- name: run data diagnosis and output the results in jsonl format
text: >
{cli_name} result diagnosis
--data-file outputs/results-summary.jsonl
--rule-file rule.yaml
--baseline-file baseline.json
--output-file-format jsonl
- name: run data diagnosis and output the results in json format
text: >
{cli_name} result diagnosis
--data-file outputs/results-summary.jsonl
......
......@@ -35,7 +35,7 @@ def diagnosis_command_handler(
# Create output directory
sb_output_dir = create_sb_output_dir(output_dir)
# Check arguments
supported_output_format = ['excel', 'json', 'md', 'html']
supported_output_format = ['excel', 'json', 'md', 'html', 'jsonl']
if output_file_format not in supported_output_format:
raise CLIError('Output format must be in {}.'.format(str(supported_output_format)))
check_argument_file('raw_data_file', raw_data_file)
......
......@@ -21,7 +21,8 @@ def setUp(self):
self.parent_path = Path(__file__).parent
self.output_excel_file = str(self.parent_path / 'diagnosis_summary.xlsx')
self.test_rule_file_fake = str(self.parent_path / 'test_rules_fake.yaml')
self.output_json_file = str(self.parent_path / 'diagnosis_summary.jsonl')
self.output_json_file = str(self.parent_path / 'diagnosis_summary.json')
self.output_jsonl_file = str(self.parent_path / 'diagnosis_summary.jsonl')
self.output_md_file = str(self.parent_path / 'diagnosis_summary.md')
self.output_html_file = str(self.parent_path / 'diagnosis_summary.html')
self.output_all_json_file = str(self.parent_path / 'diagnosis_summary.json')
......@@ -29,8 +30,8 @@ def setUp(self):
def tearDown(self):
"""Method called after the test method has been called and the result recorded."""
for file in [
self.output_excel_file, self.output_json_file, self.test_rule_file_fake, self.output_md_file,
self.output_html_file, self.output_all_json_file
self.output_excel_file, self.output_json_file, self.output_jsonl_file, self.test_rule_file_fake,
self.output_md_file, self.output_html_file, self.output_all_json_file
]:
p = Path(file)
if p.is_file():
......@@ -212,7 +213,7 @@ def test_data_diagnosis(self):
assert (data_df.loc['sb-validation-02']['Accept'])
assert (not data_df.loc['sb-validation-03']['Accept'])
assert ('Category' in data_df)
assert ('Issue_Details' in data_df)
assert ('Defective Details' in data_df)
# case 1: 3 accept, 0 not accept
data_df_all_accept = diag1.output_all_nodes_results(diag1._raw_data_df, pd.DataFrame())
assert (len(data_df_all_accept) == 3)
......@@ -248,10 +249,19 @@ def test_data_diagnosis_run(self):
assert (Path(self.output_json_file).is_file())
with Path(self.output_json_file).open() as f:
data_not_accept_read_from_json = f.read()
expect_result_file = self.parent_path / '../data/diagnosis_summary.jsonl'
expect_result_file = self.parent_path / '../data/diagnosis_summary_json.json'
with Path(expect_result_file).open() as f:
expect_result = f.read()
assert (data_not_accept_read_from_json == expect_result)
# Test - output in jsonl
DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'jsonl')
assert (Path(self.output_jsonl_file).is_file())
with Path(self.output_jsonl_file).open() as f:
data_not_accept_read_from_jsonl = f.read()
expect_result_file = self.parent_path / '../data/diagnosis_summary.jsonl'
with Path(expect_result_file).open() as f:
expect_result = f.read()
assert (data_not_accept_read_from_jsonl == expect_result)
# Test - output in md
DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'md', round=2)
assert (Path(self.output_md_file).is_file())
......
[
{
"diagnosis/category": "KernelLaunch",
"diagnosis/issue_details": "kernel-launch/event_overhead:0(B/L: 0.0060 VAL: 0.1000 VAR: 1577.85% Rule:lambda x:x>0.05)",
"kernel-launch/event_overhead:0": 15.7785234899,
"kernel-launch/event_overhead:1": -0.0016778523,
"kernel-launch/event_overhead:2": -0.0654362416,
"kernel-launch/event_overhead:3": -0.0771812081,
"kernel-launch/event_overhead:4": -0.0067114094,
"kernel-launch/event_overhead:5": -0.0117449664,
"kernel-launch/event_overhead:6": -0.0402684564,
"kernel-launch/event_overhead:7": -0.0100671141,
"kernel-launch/return_code": 0.0,
"kernel-launch/wall_overhead:0": 0.0,
"kernel-launch/wall_overhead:1": 0.0,
"kernel-launch/wall_overhead:2": 0.0194931774,
"kernel-launch/wall_overhead:3": 0.022417154,
"kernel-launch/wall_overhead:4": 0.0360623782,
"kernel-launch/wall_overhead:5": -0.0194931774,
"kernel-launch/wall_overhead:6": 0.0185185185,
"kernel-launch/wall_overhead:7": 0.0438596491,
"mem-bw/D2H_Mem_BW:0": 0.0,
"mem-bw/D2H_Mem_BW:1": 0.012345679,
"mem-bw/D2H_Mem_BW:2": 0.0082304527,
"mem-bw/D2H_Mem_BW:3": 0.012345679,
"mem-bw/D2H_Mem_BW:4": 0.0,
"mem-bw/D2H_Mem_BW:5": 0.0,
"mem-bw/D2H_Mem_BW:6": -0.0164609053,
"mem-bw/D2H_Mem_BW:7": 0.012345679,
"mem-bw/H2D_Mem_BW:0": 0.0,
"mem-bw/H2D_Mem_BW:1": 0.0078125,
"mem-bw/H2D_Mem_BW:2": 0.015625,
"mem-bw/H2D_Mem_BW:3": 0.01953125,
"mem-bw/H2D_Mem_BW:4": 0.0234375,
"mem-bw/H2D_Mem_BW:5": 0.0078125,
"mem-bw/H2D_Mem_BW:6": -0.01171875,
"mem-bw/H2D_Mem_BW:7": 0.01953125,
"mem-bw/return_code": 0.0,
"Index": "sb-validation-01"
},
{
"diagnosis/category": "FailedTest",
"diagnosis/issue_details": "mem-bw/D2H_Mem_BW:0_miss,mem-bw/D2H_Mem_BW:1_miss,mem-bw/D2H_Mem_BW:2_miss,mem-bw/D2H_Mem_BW:3_miss,mem-bw/D2H_Mem_BW:4_miss,mem-bw/D2H_Mem_BW:5_miss,mem-bw/D2H_Mem_BW:6_miss,mem-bw/D2H_Mem_BW:7_miss,mem-bw/H2D_Mem_BW:0_miss,mem-bw/H2D_Mem_BW:1_miss,mem-bw/H2D_Mem_BW:2_miss,mem-bw/H2D_Mem_BW:3_miss,mem-bw/H2D_Mem_BW:4_miss,mem-bw/H2D_Mem_BW:5_miss,mem-bw/H2D_Mem_BW:6_miss,mem-bw/H2D_Mem_BW:7_miss,mem-bw/return_code(VAL: 1.0000 Rule:lambda x:x>0)",
"kernel-launch/event_overhead:0": 0.0,
"kernel-launch/event_overhead:1": -0.0016778523,
"kernel-launch/event_overhead:2": -0.0654362416,
"kernel-launch/event_overhead:3": -0.0771812081,
"kernel-launch/event_overhead:4": -0.0067114094,
"kernel-launch/event_overhead:5": -0.0117449664,
"kernel-launch/event_overhead:6": -0.0402684564,
"kernel-launch/event_overhead:7": -0.0100671141,
"kernel-launch/return_code": 0.0,
"kernel-launch/wall_overhead:0": 0.0,
"kernel-launch/wall_overhead:1": 0.0,
"kernel-launch/wall_overhead:2": 0.0194931774,
"kernel-launch/wall_overhead:3": 0.022417154,
"kernel-launch/wall_overhead:4": 0.0360623782,
"kernel-launch/wall_overhead:5": -0.0194931774,
"kernel-launch/wall_overhead:6": 0.0185185185,
"kernel-launch/wall_overhead:7": 0.0438596491,
"mem-bw/D2H_Mem_BW:0": null,
"mem-bw/D2H_Mem_BW:1": null,
"mem-bw/D2H_Mem_BW:2": null,
"mem-bw/D2H_Mem_BW:3": null,
"mem-bw/D2H_Mem_BW:4": null,
"mem-bw/D2H_Mem_BW:5": null,
"mem-bw/D2H_Mem_BW:6": null,
"mem-bw/D2H_Mem_BW:7": null,
"mem-bw/H2D_Mem_BW:0": null,
"mem-bw/H2D_Mem_BW:1": null,
"mem-bw/H2D_Mem_BW:2": null,
"mem-bw/H2D_Mem_BW:3": null,
"mem-bw/H2D_Mem_BW:4": null,
"mem-bw/H2D_Mem_BW:5": null,
"mem-bw/H2D_Mem_BW:6": null,
"mem-bw/H2D_Mem_BW:7": null,
"mem-bw/return_code": 1.0,
"Index": "sb-validation-03"
}
]
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment