"examples/vision/vscode:/vscode.git/clone" did not exist on "1b098fd74872aaac08c18d478c5f2fe1439e140e"
Unverified Commit 733860d7 authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Analyzer - Add support to store values of metrics in data diagnosis (#392)

**Description**
Add support to store values of metrics in data diagnosis.

Take the following rules as an example:
```
    nccl_store_rule:
      categories: NCCL_DIS
      store: True
      metrics:
        - nccl-bw:allreduce-run0/allreduce_1073741824_busbw
        - nccl-bw:allreduce-run1/allreduce_1073741824_busbw
        - nccl-bw:allreduce-run2/allreduce_1073741824_busbw
        - nccl-bw:allreduce-run3/allreduce_1073741824_busbw
        - nccl-bw:allreduce-run4/allreduce_1073741824_busbw
    nccl_rule:
      function: multi_rules
      criteria: 'lambda label:True if min(label["nccl_store_rule"].values())/max(label["nccl_store_rule"].values())<0.95 else False'
      categories: NCCL_DIS
```
**nccl_store_rule** stores the values of its metrics in a dict and saves them into `label["nccl_store_rule"]`; **nccl_rule** can then use those metric values through `label["nccl_store_rule"].values()` in its criteria.
parent 10a79c4e
...@@ -34,18 +34,19 @@ def _check_and_format_rules(self, rule, name): ...@@ -34,18 +34,19 @@ def _check_and_format_rules(self, rule, name):
""" """
# check if rule is supported # check if rule is supported
super()._check_and_format_rules(rule, name) super()._check_and_format_rules(rule, name)
if 'function' not in rule: if 'store' not in rule:
logger.log_and_raise(exception=Exception, msg='{} lack of function'.format(name)) if 'function' not in rule:
if not isinstance(DiagnosisRuleType(rule['function']), DiagnosisRuleType): logger.log_and_raise(exception=Exception, msg='{} lack of function'.format(name))
logger.log_and_raise(exception=Exception, msg='{} invalid function name'.format(name)) if not isinstance(DiagnosisRuleType(rule['function']), DiagnosisRuleType):
# check rule format logger.log_and_raise(exception=Exception, msg='{} invalid function name'.format(name))
if 'criteria' not in rule: # check rule format
logger.log_and_raise(exception=Exception, msg='{} lack of criteria'.format(name)) if 'criteria' not in rule:
if not isinstance(eval(rule['criteria']), Callable): logger.log_and_raise(exception=Exception, msg='{} lack of criteria'.format(name))
logger.log_and_raise(exception=Exception, msg='invalid criteria format') if not isinstance(eval(rule['criteria']), Callable):
if rule['function'] != 'multi_rules': logger.log_and_raise(exception=Exception, msg='invalid criteria format')
if 'metrics' not in rule: if rule['function'] != 'multi_rules':
logger.log_and_raise(exception=Exception, msg='{} lack of metrics'.format(name)) if 'metrics' not in rule:
logger.log_and_raise(exception=Exception, msg='{} lack of metrics'.format(name))
if 'store' in rule and not isinstance(rule['store'], bool): if 'store' in rule and not isinstance(rule['store'], bool):
logger.log_and_raise(exception=Exception, msg='{} store must be bool type'.format(name)) logger.log_and_raise(exception=Exception, msg='{} store must be bool type'.format(name))
return rule return rule
...@@ -117,10 +118,12 @@ def _parse_rules_and_baseline(self, rules, baseline): ...@@ -117,10 +118,12 @@ def _parse_rules_and_baseline(self, rules, baseline):
benchmark_rules[rule] = self._check_and_format_rules(benchmark_rules[rule], rule) benchmark_rules[rule] = self._check_and_format_rules(benchmark_rules[rule], rule)
self._sb_rules[rule] = {} self._sb_rules[rule] = {}
self._sb_rules[rule]['name'] = rule self._sb_rules[rule]['name'] = rule
self._sb_rules[rule]['function'] = benchmark_rules[rule]['function'] if 'function' in benchmark_rules[rule]:
self._sb_rules[rule]['function'] = benchmark_rules[rule]['function']
self._sb_rules[rule]['store'] = True if 'store' in benchmark_rules[ self._sb_rules[rule]['store'] = True if 'store' in benchmark_rules[
rule] and benchmark_rules[rule]['store'] is True else False rule] and benchmark_rules[rule]['store'] is True else False
self._sb_rules[rule]['criteria'] = benchmark_rules[rule]['criteria'] if 'criteria' in benchmark_rules[rule]:
self._sb_rules[rule]['criteria'] = benchmark_rules[rule]['criteria']
self._sb_rules[rule]['categories'] = benchmark_rules[rule]['categories'] self._sb_rules[rule]['categories'] = benchmark_rules[rule]['categories']
self._sb_rules[rule]['metrics'] = {} self._sb_rules[rule]['metrics'] = {}
self.__get_metrics_and_baseline(rule, benchmark_rules, baseline) self.__get_metrics_and_baseline(rule, benchmark_rules, baseline)
...@@ -151,16 +154,22 @@ def _run_diagnosis_rules_for_single_node(self, node): ...@@ -151,16 +154,22 @@ def _run_diagnosis_rules_for_single_node(self, node):
issue_label = False issue_label = False
details = [] details = []
categories = set() categories = set()
violation = {} store_values = {}
summary_data_row = pd.Series(index=self._enable_metrics, name=node, dtype=float) summary_data_row = pd.Series(index=self._enable_metrics, name=node, dtype=float)
# Check each rule # Check each rule
for rule in self._sb_rules: for rule in self._sb_rules:
# if no criteria and store is True in a rule, store the value of metrics in the rule
if self._sb_rules[rule]['store'] and 'criteria' not in self._sb_rules[rule]:
store_values[rule] = {}
for metric in self._sb_rules[rule]['metrics']:
store_values[rule][metric] = data_row[metric]
continue
# Get rule op function and run the rule # Get rule op function and run the rule
function_name = self._sb_rules[rule]['function'] function_name = self._sb_rules[rule]['function']
rule_op = RuleOp.get_rule_func(DiagnosisRuleType(function_name)) rule_op = RuleOp.get_rule_func(DiagnosisRuleType(function_name))
violated_num = 0 violated_num = 0
if rule_op == RuleOp.multi_rules: if rule_op == RuleOp.multi_rules:
violated_num = rule_op(self._sb_rules[rule], details, categories, violation) violated_num = rule_op(self._sb_rules[rule], details, categories, store_values)
elif rule_op == RuleOp.failure_check: elif rule_op == RuleOp.failure_check:
violated_num = rule_op( violated_num = rule_op(
data_row, self._sb_rules[rule], summary_data_row, details, categories, self._raw_rules[rule] data_row, self._sb_rules[rule], summary_data_row, details, categories, self._raw_rules[rule]
...@@ -169,7 +178,7 @@ def _run_diagnosis_rules_for_single_node(self, node): ...@@ -169,7 +178,7 @@ def _run_diagnosis_rules_for_single_node(self, node):
violated_num = rule_op(data_row, self._sb_rules[rule], summary_data_row, details, categories) violated_num = rule_op(data_row, self._sb_rules[rule], summary_data_row, details, categories)
# label the node as defective one # label the node as defective one
if self._sb_rules[rule]['store']: if self._sb_rules[rule]['store']:
violation[rule] = violated_num store_values[rule] = violated_num
elif violated_num: elif violated_num:
issue_label = True issue_label = True
if issue_label: if issue_label:
......
...@@ -188,7 +188,7 @@ def value(data_row, rule, summary_data_row, details, categories): ...@@ -188,7 +188,7 @@ def value(data_row, rule, summary_data_row, details, categories):
return violated_metric_num return violated_metric_num
@staticmethod @staticmethod
def multi_rules(rule, details, categories, violation): def multi_rules(rule, details, categories, store_values):
"""Rule op function of multi_rules. """Rule op function of multi_rules.
The criteria in this rule will use the combined results of multiple previous rules and their metrics The criteria in this rule will use the combined results of multiple previous rules and their metrics
...@@ -198,11 +198,12 @@ def multi_rules(rule, details, categories, violation): ...@@ -198,11 +198,12 @@ def multi_rules(rule, details, categories, violation):
rule (dict): rule including function, criteria, metrics with their baseline values and categories rule (dict): rule including function, criteria, metrics with their baseline values and categories
details (list): details about violated rules and related data details (list): details about violated rules and related data
categories (set): categories of violated rules categories (set): categories of violated rules
violation (dict): the number of the metrics that violate the rules store_values (dict): including the number of the metrics that violate the rule, and the values of
the metrics for the rules with 'store' True
Returns: Returns:
number: 0 if the rule is passed, otherwise 1 number: 0 if the rule is passed, otherwise 1
""" """
violated = eval(rule['criteria'])(violation) violated = eval(rule['criteria'])(store_values)
if not isinstance(violated, bool): if not isinstance(violated, bool):
logger.log_and_raise(exception=Exception, msg='invalid upper criteria format') logger.log_and_raise(exception=Exception, msg='invalid upper criteria format')
if violated: if violated:
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
from pathlib import Path from pathlib import Path
import pandas as pd import pandas as pd
import numpy as np
from superbench.analyzer import DataDiagnosis from superbench.analyzer import DataDiagnosis
import superbench.analyzer.file_handler as file_handler import superbench.analyzer.file_handler as file_handler
...@@ -369,6 +370,50 @@ def test_mutli_rules(self): ...@@ -369,6 +370,50 @@ def test_mutli_rules(self):
'rule3:lambda label:True if label["rule1"]+label["rule2"]>=2 else False' 'rule3:lambda label:True if label["rule1"]+label["rule2"]>=2 else False'
) )
# Test multi-rule using values of metrics in criteria lambda expression
diag1 = DataDiagnosis()
# test _run_diagnosis_rules_for_single_node
rules = {
'superbench': {
'rules': {
'rule1': {
'categories':
'NCCL_DIS',
'store':
True,
'metrics': [
'nccl-bw:allreduce-run0/allreduce_1073741824_busbw',
'nccl-bw:allreduce-run1/allreduce_1073741824_busbw',
'nccl-bw:allreduce-run2/allreduce_1073741824_busbw'
]
},
'rule2': {
'categories': 'NCCL_DIS',
'criteria': 'lambda label:True if min(label["rule1"].values())' + '/' +
'max(label["rule1"].values())<0.95 else False',
'function': 'multi_rules'
}
}
}
}
baseline = {}
data = {
'nccl-bw:allreduce-run0/allreduce_1073741824_busbw': [10, 22, 10],
'nccl-bw:allreduce-run1/allreduce_1073741824_busbw': [23, 23, np.nan],
'nccl-bw:allreduce-run2/allreduce_1073741824_busbw': [22, 22, np.nan]
}
diag1._raw_data_df = pd.DataFrame(data, index=['sb-validation-04', 'sb-validation-05', 'sb-validation-06'])
diag1._benchmark_metrics_dict = diag1._get_metrics_by_benchmarks(list(diag1._raw_data_df.columns))
diag1._parse_rules_and_baseline(rules, baseline)
(details_row, summary_data_row) = diag1._run_diagnosis_rules_for_single_node('sb-validation-04')
assert (details_row)
assert ('NCCL_DIS' in details_row[0])
(details_row, summary_data_row) = diag1._run_diagnosis_rules_for_single_node('sb-validation-05')
assert (not details_row)
(details_row, summary_data_row) = diag1._run_diagnosis_rules_for_single_node('sb-validation-06')
assert (not details_row)
def test_failure_check(self): def test_failure_check(self):
"""Test failure test check feature.""" """Test failure test check feature."""
diag1 = DataDiagnosis() diag1 = DataDiagnosis()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment