Unverified commit 63e9b2d1, authored by Yifan Xiong, committed by GitHub
Browse files

Release - SuperBench v0.6.0 (#409)



**Description**

Cherry-pick bug fixes from v0.6.0 to main.

**Major Revisions**

* Enable latency test in ib traffic validation distributed benchmark (#396)
* Enhance parameter parsing to allow spaces in value (#397)
* Update apt packages in dockerfile (#398)
* Upgrade colorlog for NO_COLOR support (#404)
* Analyzer - Update error handling to support exit code of sb result diagnosis (#403)
* Analyzer - Make baseline file optional in data diagnosis and fix bugs (#399)
* Enhance timeout cleanup to avoid possible hanging (#405)
* Auto generate ibstat file by pssh (#402)
* Analyzer - Format int type and unify empty value to N/A in diagnosis output file (#406)
* Docs - Upgrade version and release note (#407)
* Docs - Fix issues in document (#408)
Co-authored-by: Yang Wang <yangwang1@microsoft.com>
Co-authored-by: Yuting Jiang <yutingjiang@microsoft.com>
parent 733860d7
......@@ -14,7 +14,7 @@
def diagnosis_command_handler(
raw_data_file,
rule_file,
baseline_file,
baseline_file=None,
output_dir=None,
output_file_format='excel',
output_all=False,
......@@ -40,7 +40,8 @@ def diagnosis_command_handler(
raise CLIError('Output format must be in {}.'.format(str(supported_output_format)))
check_argument_file('raw_data_file', raw_data_file)
check_argument_file('rule_file', rule_file)
check_argument_file('baseline_file', baseline_file)
if baseline_file:
check_argument_file('baseline_file', baseline_file)
# Run data diagnosis
DataDiagnosis().run(
raw_data_file, rule_file, baseline_file, sb_output_dir, output_file_format, output_all, decimal_place_value
......
......@@ -8,6 +8,13 @@
import sys
import colorlog
# workaround to get rid of isatty from
# colorama StreamWrapper in WSL2
try:
from colorama import deinit
deinit()
except Exception:
pass
class LoggerAdapter(logging.LoggerAdapter):
......
......@@ -6,6 +6,7 @@
import re
import os
from pathlib import Path
from time import sleep
import networkx as nx
......@@ -35,37 +36,34 @@ def search(self, pattern, string, flags=0):
return self.matched
def gen_ibstat_file(ibstat_file):
"""Generate ibstat file for each node with specified path.
def gen_ibstat_file(host_list, ibstat_file):
"""Generate ibstat file in each node with specified path.
Args:
host_list (list): list of VM read from hostfile.
ibstat_file (str): path of ibstat output.
"""
from mpi4py import MPI
if not MPI.Is_initialized():
MPI.Init()
comm = MPI.COMM_WORLD
name = MPI.Get_processor_name()
# The command to fetch ibstat info
cmd = r"ibstat | grep -Po 'System image GUID: \K\S+$'"
output = os.popen(cmd)
ibstat = 'VM_hostname ' + name + '\n' + str(output.read())
# Fetch all ibstate from each node
ibstats = comm.allgather(ibstat)
ibstate_file_path = Path(ibstat_file)
# Filter the duplicate info
ibstat_infos = set(ibstats)
with ibstate_file_path.open(mode='w') as f:
for ibstat_info in ibstat_infos:
f.write(ibstat_info)
MPI.Finalize()
try:
# Only exec on rank0
if os.environ.get('OMPI_COMM_WORLD_NODE_RANK') == '0' and os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK') == '0':
pssh_cmd = "pssh -i -t 5 -p 512 -x '-o StrictHostKeyChecking=no' -H '{}' ".format(' '.join(host_list))
cmd = "'cat /sys/class/infiniband/*/sys_image_guid | tr -d :'" \
r"| sed -e 's/^.*\[SUCCESS\]/VM_hostname/g;s/^.*\[FAILURE\]/VM_hostname/g' | cut -d ' ' -f 1,2"
output = os.popen(pssh_cmd + cmd).read()
# Generate ibstat file
ibstate_file_path = Path(ibstat_file)
with ibstate_file_path.open(mode='w') as f:
f.write(output)
scp_cmd = "pscp -t 5 -p 512 -H '{0}' {1} {1}".format(' '.join(host_list), ibstat_file)
# Distribute ibstat file for others
errorn = os.system(scp_cmd)
if errorn != 0:
logger.error('Failed to distribute ibstate file')
else:
# Wait for rank0 done
sleep(5)
except BaseException as e:
logger.error('Failed to generate ibstate file, message: {}.'.format(str(e)))
def gen_topo_aware_config(host_list, ibstat_file, ibnetdiscover_file, min_dist, max_dist): # noqa: C901
......@@ -91,7 +89,9 @@ def gen_topo_aware_config(host_list, ibstat_file, ibnetdiscover_file, min_dist,
if not ibstat_file:
ibstat_file = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'ib_traffic_topo_aware_ibstat.txt')
gen_ibstat_file(ibstat_file)
gen_ibstat_file(host_list, ibstat_file)
# sync all the rank
sleep(5)
if not Path(ibstat_file).exists():
logger.error('ibstat file does not exist.')
......@@ -125,8 +125,8 @@ def gen_topo_aware_config(host_list, ibstat_file, ibnetdiscover_file, min_dist,
r = quick_regexp()
if r.search(r'^(VM_hostname)\s+(.+)', line):
vmhost = r.groups[1]
elif r.search(r'^(0x)(.+)', line):
sysimgguid = r.groups[1]
elif r.search(r'^(?!0{16})([a-f0-9]{16})$', line):
sysimgguid = r.groups[0]
sysimgguid_to_vmhost[sysimgguid] = vmhost
except BaseException as e:
logger.error('Failed to read ibstate file, message: {}.'.format(str(e)))
......
......@@ -3,7 +3,7 @@
# Server:
# - Product: HPE Apollo 6500
version: v0.5
version: v0.6
superbench:
enable: null
var:
......
......@@ -4,7 +4,7 @@
# - Product: G482-Z53
# - Link: https://www.gigabyte.cn/FileUpload/Global/MicroSite/553/G482-Z53.html
version: v0.5
version: v0.6
superbench:
enable: null
var:
......
version: v0.5
version: v0.6
superbench:
enable: null
monitor:
......
version: v0.5
version: v0.6
superbench:
enable: null
monitor:
......
version: v0.5
version: v0.6
superbench:
enable: null
monitor:
......
......@@ -3,7 +3,7 @@
# Azure NDm A100 v4
# reference: https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series
version: v0.5
version: v0.6
superbench:
enable: null
monitor:
......
# SuperBench Config
version: v0.5
version: v0.6
superbench:
enable: null
monitor:
......
# SuperBench Config
version: v0.5
version: v0.6
superbench:
enable: null
monitor:
......
- name: Runtime Environment Cleanup
hosts: all
gather_facts: false
tasks:
- name: Killing sb exec processes
shell: |
pgrep -ax sb | grep 'sb exec' | awk '{print $1}' | xargs kill -9 ||:
become: yes
......@@ -193,6 +193,10 @@ def check_env(self): # pragma: no cover
)
)
def cleanup(self): # pragma: no cover
"""Cleanup remaining processes on all nodes."""
self._ansible_client.run(self._ansible_client.get_playbook_config('cleanup.yaml'))
def fetch_results(self): # pragma: no cover
"""Fetch benchmark results on all nodes."""
try:
......@@ -410,7 +414,7 @@ def _run_proc(self, benchmark_name, mode, vars):
if isinstance(timeout, int):
# we do not expect timeout in ansible unless subprocess hangs
ansible_runner_config['timeout'] = timeout + 300
ansible_runner_config['timeout'] = timeout + 60
rc = self._ansible_client.run(ansible_runner_config, sudo=(not self._docker_config.skip))
return rc
......@@ -423,16 +427,20 @@ def run(self):
continue
benchmark_config = self._sb_benchmarks[benchmark_name]
for mode in benchmark_config.modes:
ansible_rc = 0
if mode.name == 'local':
Parallel(n_jobs=mode.proc_num if mode.parallel else 1)(
rc_list = Parallel(n_jobs=mode.proc_num if mode.parallel else 1)(
delayed(self._run_proc)(benchmark_name, mode, {
'proc_rank': proc_rank
}) for proc_rank in range(mode.proc_num)
)
ansible_rc = sum(rc_list)
elif mode.name == 'torch.distributed' or mode.name == 'mpi':
self._run_proc(benchmark_name, mode, {'proc_rank': 0})
ansible_rc = self._run_proc(benchmark_name, mode, {'proc_rank': 0})
else:
logger.warning('Unknown mode %s.', mode.name)
if ansible_rc != 0:
self.cleanup()
self.fetch_results()
self.__create_results_summary()
......@@ -53,9 +53,8 @@ def test_data_diagnosis(self):
test_raw_data_fake = str(self.parent_path / 'test_results_fake.jsonl')
test_rule_file_fake = str(self.parent_path / 'test_rules_fake.yaml')
diag2 = DataDiagnosis()
diag2._raw_data_df = file_handler.read_raw_data(test_raw_data_fake)
diag2._benchmark_metrics_dict = diag2._get_metrics_by_benchmarks(list(diag2._raw_data_df))
assert (len(diag2._raw_data_df) == 0)
self.assertRaises(FileNotFoundError, file_handler.read_raw_data, test_raw_data_fake)
diag2._benchmark_metrics_dict = diag2._get_metrics_by_benchmarks([])
assert (len(diag2._benchmark_metrics_dict) == 0)
metric_list = [
'gpu_temperature', 'gpu_power_limit', 'gemm-flops/FP64',
......@@ -68,8 +67,7 @@ def test_data_diagnosis(self):
}
)
# Test - read rules
rules = file_handler.read_rules(test_rule_file_fake)
assert (not rules)
self.assertRaises(FileNotFoundError, file_handler.read_rules, test_rule_file_fake)
rules = file_handler.read_rules(test_rule_file)
assert (rules)
# Test - _check_and_format_rules
......@@ -131,12 +129,12 @@ def test_data_diagnosis(self):
baseline = file_handler.read_baseline(test_baseline_file)
assert (diag1._get_baseline_of_metric(baseline, 'kernel-launch/event_overhead:0') == 0.00596)
assert (diag1._get_baseline_of_metric(baseline, 'kernel-launch/return_code') == 0)
assert (diag1._get_baseline_of_metric(baseline, 'mem-bw/H2D:0') == -1)
assert (diag1._get_baseline_of_metric(baseline, 'mem-bw/H2D:0') is None)
# Test - _parse_rules_and_baseline
# Negative case
fake_rules = file_handler.read_rules(test_rule_file_fake)
fake_rules = []
baseline = file_handler.read_baseline(test_baseline_file)
assert (diag2._parse_rules_and_baseline(fake_rules, baseline) is False)
self.assertRaises(Exception, diag2._parse_rules_and_baseline, fake_rules, baseline)
diag2 = DataDiagnosis()
diag2._raw_data_df = file_handler.read_raw_data(test_raw_data)
diag2._benchmark_metrics_dict = diag2._get_metrics_by_benchmarks(list(diag2._raw_data_df))
......@@ -146,7 +144,7 @@ def test_data_diagnosis(self):
rules['superbench']['rules']['fake'] = false_rules[0]
with open(test_rule_file_fake, 'w') as f:
yaml.dump(rules, f)
assert (diag1._parse_rules_and_baseline(fake_rules, baseline) is False)
self.assertRaises(Exception, diag1._parse_rules_and_baseline, fake_rules, baseline)
# Positive case
rules = file_handler.read_rules(test_rule_file)
assert (diag1._parse_rules_and_baseline(rules, baseline))
......@@ -198,7 +196,7 @@ def test_data_diagnosis(self):
json.loads(line)
assert ('Category' in line)
assert ('Defective Details' in line)
assert ('Index' in line)
assert ('index' in line)
# Test - generate_md_lines
lines = diag1.generate_md_lines(data_not_accept_df, diag1._sb_rules, 2)
assert (lines)
......@@ -293,6 +291,38 @@ def test_data_diagnosis_run(self):
expect_result = f.read()
assert (data_not_accept_read_from_json == expect_result)
def test_data_diagnosis_run_without_baseline(self):
"""Test for the run process of rule-based data diagnosis."""
test_raw_data = str(self.parent_path / 'test_results.jsonl')
test_rule_file = str(self.parent_path / 'test_rules_without_baseline.yaml')
test_baseline_file = None
# Test - output in excel
DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'excel')
assert (Path(self.output_excel_file).is_file())
# Test - output in json
DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'json')
assert (Path(self.output_json_file).is_file())
# Test - output in jsonl
DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'jsonl')
assert (Path(self.output_jsonl_file).is_file())
# Test - output in md
DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'md', round=2)
assert (Path(self.output_md_file).is_file())
# Test - output in html
DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'html', round=2)
assert (Path(self.output_html_file).is_file())
# Test - output all nodes results
DataDiagnosis().run(
test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'json', output_all=True
)
assert (Path(self.output_all_json_file).is_file())
def test_mutli_rules(self):
"""Test multi rules check feature."""
diag1 = DataDiagnosis()
......
......@@ -36,16 +36,13 @@ def test_file_handler(self):
# Test - read_raw_data
raw_data_df = file_handler.read_raw_data(test_raw_data)
assert (not raw_data_df.empty)
raw_data_df = file_handler.read_raw_data(test_raw_data_fake)
assert (raw_data_df.empty)
self.assertRaises(FileNotFoundError, file_handler.read_raw_data, test_raw_data_fake)
# Test - read rules
rules = file_handler.read_rules(test_rule_file_fake)
assert (not rules)
self.assertRaises(FileNotFoundError, file_handler.read_rules, test_rule_file_fake)
rules = file_handler.read_rules(test_rule_file)
assert (rules)
# Test - read baseline
baseline = file_handler.read_baseline(test_aseline_file_fake)
assert (not baseline)
self.assertRaises(FileNotFoundError, file_handler.read_baseline, test_aseline_file_fake)
baseline = file_handler.read_baseline(test_baseline_file)
assert (baseline)
# Test - generate_md_table
......
......@@ -83,8 +83,7 @@ def test_result_summary(self):
# Test - _parse_rules
# Negative case
rs2 = ResultSummary()
fake_rules = file_handler.read_rules(self.test_rule_file_fake)
assert (rs2._parse_rules(fake_rules) is False)
self.assertRaises(Exception, file_handler.read_rules, self.test_rule_file_fake)
rs2._raw_data_df = file_handler.read_raw_data(self.test_raw_data)
rs2._benchmark_metrics_dict = rs2._get_metrics_by_benchmarks(list(rs2._raw_data_df))
p = Path(self.test_rule_file)
......@@ -93,7 +92,7 @@ def test_result_summary(self):
rules['superbench']['rules']['fake'] = false_rules[0]
with open(self.test_rule_file_fake, 'w') as f:
yaml.dump(rules, f)
assert (rs1._parse_rules(fake_rules) is False)
assert (rs1._parse_rules([]) is False)
# Positive case
rules = file_handler.read_rules(self.test_rule_file)
assert (rs1._parse_rules(rules))
......
......@@ -28,11 +28,11 @@ def test_rule_base(self):
assert (len(rulebase1._raw_data_df) == 3)
# Negative case
test_rule_file_fake = str(self.parent_path / 'test_rules_fake.yaml')
test_raw_data_fake = str(self.parent_path / 'test_results_fake.jsonl')
rulebase2 = RuleBase()
rulebase2._raw_data_df = file_handler.read_raw_data(test_raw_data_fake)
rulebase2._benchmark_metrics_dict = rulebase2._get_metrics_by_benchmarks(list(rulebase2._raw_data_df))
assert (len(rulebase2._raw_data_df) == 0)
self.assertRaises(FileNotFoundError, file_handler.read_raw_data, test_raw_data_fake)
rulebase2._benchmark_metrics_dict = rulebase2._get_metrics_by_benchmarks([])
assert (len(rulebase2._benchmark_metrics_dict) == 0)
metric_list = [
'gpu_temperature', 'gpu_power_limit', 'gemm-flops/FP64',
......@@ -46,10 +46,8 @@ def test_rule_base(self):
)
# Test - _preprocess
rules = rulebase1._preprocess(test_raw_data_fake, test_rule_file)
assert (not rules)
rules = rulebase1._preprocess(test_raw_data, test_rule_file_fake)
assert (not rules)
self.assertRaises(Exception, rulebase1._preprocess, test_raw_data_fake, test_rule_file)
self.assertRaises(Exception, rulebase1._preprocess, test_raw_data, test_rule_file_fake)
rules = rulebase1._preprocess(test_raw_data, test_rule_file)
assert (rules)
......
......@@ -65,6 +65,29 @@ def test_rule_op(self):
self.assertRaises(Exception, RuleOp.variance, data_row, rule, summary_data_row, details, categories)
self.assertRaises(Exception, RuleOp.value, data_row, rule, summary_data_row, details, categories)
# Negative case, if baseline is 0 or None in 'variance' function, raise error
false_rule_and_baselines = [
{
'categories': 'KernelLaunch',
'criteria': 'lambda x:x>0.5',
'function': 'variance',
'metrics': {
'kernel-launch/event_overhead:0': 0,
}
},
{
'categories': 'KernelLaunch',
'criteria': 'lambda x:x>0.5',
'function': 'variance',
'metrics': {
'kernel-launch/event_overhead:1': None,
}
},
]
for rule in false_rule_and_baselines:
self.assertRaises(ValueError, RuleOp.variance, data_row, rule, summary_data_row, details, categories)
# Positive case
true_baselines = [
{
......@@ -132,7 +155,7 @@ def test_multi_rules_op(self):
]
label = {}
for rule in false_baselines:
self.assertRaises(Exception, RuleOp.multi_rules, rule, details, categories, label)
self.assertRaises(KeyError, RuleOp.multi_rules, false_baselines[0], details, categories, label)
true_baselines = [
{
......
# SuperBench rules
version: v0.6
superbench:
rules:
rule0:
function: value
criteria: lambda x:x>0
categories: KernelLaunch
metrics:
- kernel-launch/event_overhead:\d+
- kernel-launch/wall_overhead:\d+
rule1:
categories: Mem
store: True
metrics:
- mem-bw/H2D_Mem_BW:\d+
- mem-bw/D2H_Mem_BW:\d+
rule2:
function: multi_rules
criteria: 'lambda label: bool(min(label["rule1"].values())/max(label["rule1"].values())<0.95)'
categories: Mem
......@@ -50,7 +50,7 @@ def test_cublas_functions():
context = BenchmarkRegistry.create_benchmark_context(
'cublas-function',
platform=Platform.CUDA,
parameters='--num_warmup 10 --num_steps 10 --num_in_step 100 --config_json_str ' + custom_config_str
parameters=f"--num_warmup 10 --num_steps 10 --num_in_step 100 --config_json_str '{custom_config_str}'"
)
assert (BenchmarkRegistry.is_benchmark_context_valid(context))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment