# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""A module for data diagnosis rule ops."""

from typing import Dict, Callable

import pandas as pd

from superbench.benchmarks.context import Enum
from superbench.common.utils import logger


class DiagnosisRuleType(Enum):
    """The Enum class representing different rule ops.

    Each member is the key under which the matching rule function is
    registered in RuleOp.functions.
    """

    # Rule handled by RuleOp.variance: relative deviation from a baseline.
    VARIANCE = 'variance'
    # Rule handled by RuleOp.value: raw metric value checked against criteria.
    VALUE = 'value'
    # Rule handled by RuleOp.multi_rules: criteria over results of earlier rules.
    MULTI_RULES = 'multi_rules'


class RuleOp:
    """RuleOp class to maintain all rule functions."""

    functions: Dict[DiagnosisRuleType, Callable] = dict()

    @classmethod
    def add_rule_func(cls, rule_type):
        """Add rule fuction.

        Args:
            rule_type (DiagnosisRuleType): The type of rule function.

        Return:
            decorator (Callable): return the decorator to add the rule function.
        """
        def decorator(func):
            cls.functions[rule_type] = func
            return func

        return decorator

    @classmethod
    def get_rule_func(cls, rule_type):
        """Get rule fuction by rule_type.

        Args:
            rule_type (DiagnosisRuleType): The type of rule function.

        Return:
            func (Callable): rule function, None means invalid rule type.
        """
        if rule_type in cls.functions:
            return cls.functions[rule_type]

        return None

    @staticmethod
    def check_criterion_with_a_value(rule):
        """Check if the criterion is valid with a numeric variable and return bool type.

        Args:
            rule (dict): rule including function, criteria, metrics with their baseline values and categories
        """
        # parse criteria and check if valid
        if not isinstance(eval(rule['criteria'])(0), bool):
            logger.log_and_raise(exception=Exception, msg='invalid criteria format')

    @staticmethod
    def miss_test(metric, rule, data_row, details, categories):
        """Check if the metric in the rule missed test and if so add details and categories.

        Args:
            metric (str): the name of the metric
            data_row (pd.Series): raw data of the metrics
            rule (dict): rule including function, criteria, metrics with their baseline values and categories
            details (list): details about violated rules and related data
            categories (set): categories of violated rules

        Returns:
            bool: if the metric in the rule missed test, return True, otherwise return False
        """
        # metric not in raw_data or the value is none, miss test
        if metric not in data_row or pd.isna(data_row[metric]):
            RuleOp.add_categories_and_details(metric + '_miss', rule['categories'], details, categories)
            return True
        return False

    @staticmethod
    def add_categories_and_details(detail, category, details, categories):
        """Add details and categories.

        Args:
            detail (str): violated rule and related data
            category (str): category of violated rule
            details (list): list of details about violated rules and related data
            categories (set): set of categories of violated rules
        """
        details.append(detail)
        categories.add(category)

    @staticmethod
    def variance(data_row, rule, summary_data_row, details, categories):
        """Rule op function of variance.

        Each metric in the rule will calculate the variance (val - baseline / baseline),
        and use criteria in the rule to determine whether metric's variance meet the criteria,
108
        if any metric meet the criteria, the rule is not passed.
109
110
111
112
113

        Args:
            data_row (pd.Series): raw data of the metrics
            rule (dict): rule including function, criteria, metrics with their baseline values and categories
            summary_data_row (pd.Series): results of the metrics processed after the function
114
            details (list): details about violated rules and related data
115
116
117
            categories (set): categories of violated rules

        Returns:
118
            number: the number of the metrics that violate the rule if the rule is not passed, otherwise 0
119
        """
120
121
        violated_metric_num = 0
        RuleOp.check_criterion_with_a_value(rule)
122
123
124
        # every metric should pass the rule
        for metric in rule['metrics']:
            # metric not in raw_data or the value is none, miss test
125
126
            if RuleOp.miss_test(metric, rule, data_row, details, categories):
                violated_metric_num += 1
127
            else:
128
                violate_metric = False
129
130
131
132
133
134
135
136
137
138
                # check if metric pass the rule
                val = data_row[metric]
                baseline = rule['metrics'][metric]
                if baseline == 0:
                    logger.log_and_raise(exception=Exception, msg='invalid baseline 0 in variance rule')
                var = (val - baseline) / baseline
                summary_data_row[metric] = var
                violate_metric = eval(rule['criteria'])(var)
                # add issued details and categories
                if violate_metric:
139
                    violated_metric_num += 1
140
141
142
                    info = '(B/L: {:.4f} VAL: {:.4f} VAR: {:.2f}% Rule:{})'.format(
                        baseline, val, var * 100, rule['criteria']
                    )
143
144
                    RuleOp.add_categories_and_details(metric + info, rule['categories'], details, categories)
        return violated_metric_num

    @staticmethod
    def value(data_row, rule, summary_data_row, details, categories):
        """Rule op function of value.

        Each metric in the rule will use criteria in the rule
        to determine whether metric's value meet the criteria,
152
        if any metric meet the criteria, the rule is not passed.
153
154
155
156
157

        Args:
            data_row (pd.Series): raw data of the metrics
            rule (dict): rule including function, criteria, metrics with their baseline values and categories
            summary_data_row (pd.Series): results of the metrics processed after the function
158
            details (list): details about violated rules and related data
159
160
161
            categories (set): categories of violated rules

        Returns:
162
            number: the number of the metrics that violate the rule if the rule is not passed, otherwise 0
163
        """
164
        violated_metric_num = 0
165
        # parse criteria and check if valid
166
        RuleOp.check_criterion_with_a_value(rule)
167
168
169
        # every metric should pass the rule
        for metric in rule['metrics']:
            # metric not in raw_data or the value is none, miss test
170
171
            if RuleOp.miss_test(metric, rule, data_row, details, categories):
                violated_metric_num += 1
172
            else:
173
                violate_metric = False
174
175
176
177
178
179
                # check if metric pass the rule
                val = data_row[metric]
                summary_data_row[metric] = val
                violate_metric = eval(rule['criteria'])(val)
                # add issued details and categories
                if violate_metric:
180
                    violated_metric_num += 1
181
                    info = '(VAL: {:.4f} Rule:{})'.format(val, rule['criteria'])
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
                    RuleOp.add_categories_and_details(metric + info, rule['categories'], details, categories)
        return violated_metric_num

    @staticmethod
    def multi_rules(rule, details, categories, violation):
        """Rule op function of multi_rules.

        The criteria in this rule will use the combined results of multiple previous rules and their metrics
        which has been stored in advance to determine whether this rule is passed.

        Args:
            rule (dict): rule including function, criteria, metrics with their baseline values and categories
            details (list): details about violated rules and related data
            categories (set): categories of violated rules
            violation (dict): the number of the metrics that violate the rules
        Returns:
            number: 0 if the rule is passed, otherwise 1
        """
        violated = eval(rule['criteria'])(violation)
        if not isinstance(violated, bool):
            logger.log_and_raise(exception=Exception, msg='invalid upper criteria format')
        if violated:
            info = '{}:{}'.format(rule['name'], rule['criteria'])
            RuleOp.add_categories_and_details(info, rule['categories'], details, categories)
        return 1 if violated else 0


# Register the built-in rule functions under their rule types so that
# RuleOp.get_rule_func can return them.
RuleOp.add_rule_func(DiagnosisRuleType.VARIANCE)(RuleOp.variance)
RuleOp.add_rule_func(DiagnosisRuleType.VALUE)(RuleOp.value)
RuleOp.add_rule_func(DiagnosisRuleType.MULTI_RULES)(RuleOp.multi_rules)