Unverified commit 63e9b2d1, authored by Yifan Xiong, committed by GitHub
Browse files

Release - SuperBench v0.6.0 (#409)



**Description**

Cherry-pick bug fixes from v0.6.0 to main.

**Major Revisions**

* Enable latency test in ib traffic validation distributed benchmark (#396)
* Enhance parameter parsing to allow spaces in value (#397)
* Update apt packages in dockerfile (#398)
* Upgrade colorlog for NO_COLOR support (#404)
* Analyzer - Update error handling to support exit code of sb result diagnosis (#403)
* Analyzer - Make baseline file optional in data diagnosis and fix bugs (#399)
* Enhance timeout cleanup to avoid possible hanging (#405)
* Auto generate ibstat file by pssh (#402)
* Analyzer - Format int type and unify empty value to N/A in diagnosis output file (#406)
* Docs - Upgrade version and release note (#407)
* Docs - Fix issues in document (#408)
Co-authored-by: Yang Wang <yangwang1@microsoft.com>
Co-authored-by: Yuting Jiang <yutingjiang@microsoft.com>
parent 733860d7
......@@ -14,7 +14,7 @@
def diagnosis_command_handler(
raw_data_file,
rule_file,
baseline_file,
baseline_file=None,
output_dir=None,
output_file_format='excel',
output_all=False,
......@@ -40,7 +40,8 @@ def diagnosis_command_handler(
raise CLIError('Output format must be in {}.'.format(str(supported_output_format)))
check_argument_file('raw_data_file', raw_data_file)
check_argument_file('rule_file', rule_file)
check_argument_file('baseline_file', baseline_file)
if baseline_file:
check_argument_file('baseline_file', baseline_file)
# Run data diagnosis
DataDiagnosis().run(
raw_data_file, rule_file, baseline_file, sb_output_dir, output_file_format, output_all, decimal_place_value
......
......@@ -8,6 +8,13 @@
import sys
import colorlog
# workaround to get rid of isatty from
# colorama StreamWrapper in WSL2
try:
from colorama import deinit
deinit()
except Exception:
pass
class LoggerAdapter(logging.LoggerAdapter):
......
......@@ -6,6 +6,7 @@
import re
import os
from pathlib import Path
from time import sleep
import networkx as nx
......@@ -35,37 +36,34 @@ def search(self, pattern, string, flags=0):
return self.matched
def gen_ibstat_file(ibstat_file):
"""Generate ibstat file for each node with specified path.
def gen_ibstat_file(host_list, ibstat_file):
"""Generate ibstat file in each node with specified path.
Args:
host_list (list): list of VM read from hostfile.
ibstat_file (str): path of ibstat output.
"""
from mpi4py import MPI
if not MPI.Is_initialized():
MPI.Init()
comm = MPI.COMM_WORLD
name = MPI.Get_processor_name()
# The command to fetch ibstat info
cmd = r"ibstat | grep -Po 'System image GUID: \K\S+$'"
output = os.popen(cmd)
ibstat = 'VM_hostname ' + name + '\n' + str(output.read())
# Fetch all ibstate from each node
ibstats = comm.allgather(ibstat)
ibstate_file_path = Path(ibstat_file)
# Filter the duplicate info
ibstat_infos = set(ibstats)
with ibstate_file_path.open(mode='w') as f:
for ibstat_info in ibstat_infos:
f.write(ibstat_info)
MPI.Finalize()
try:
# Only exec on rank0
if os.environ.get('OMPI_COMM_WORLD_NODE_RANK') == '0' and os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK') == '0':
pssh_cmd = "pssh -i -t 5 -p 512 -x '-o StrictHostKeyChecking=no' -H '{}' ".format(' '.join(host_list))
cmd = "'cat /sys/class/infiniband/*/sys_image_guid | tr -d :'" \
r"| sed -e 's/^.*\[SUCCESS\]/VM_hostname/g;s/^.*\[FAILURE\]/VM_hostname/g' | cut -d ' ' -f 1,2"
output = os.popen(pssh_cmd + cmd).read()
# Generate ibstat file
ibstate_file_path = Path(ibstat_file)
with ibstate_file_path.open(mode='w') as f:
f.write(output)
scp_cmd = "pscp -t 5 -p 512 -H '{0}' {1} {1}".format(' '.join(host_list), ibstat_file)
# Distribute ibstat file for others
errorn = os.system(scp_cmd)
if errorn != 0:
logger.error('Failed to distribute ibstate file')
else:
# Wait for rank0 done
sleep(5)
except BaseException as e:
logger.error('Failed to generate ibstate file, message: {}.'.format(str(e)))
def gen_topo_aware_config(host_list, ibstat_file, ibnetdiscover_file, min_dist, max_dist): # noqa: C901
......@@ -91,7 +89,9 @@ def gen_topo_aware_config(host_list, ibstat_file, ibnetdiscover_file, min_dist,
if not ibstat_file:
ibstat_file = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'ib_traffic_topo_aware_ibstat.txt')
gen_ibstat_file(ibstat_file)
gen_ibstat_file(host_list, ibstat_file)
# sync all the rank
sleep(5)
if not Path(ibstat_file).exists():
logger.error('ibstat file does not exist.')
......@@ -125,8 +125,8 @@ def gen_topo_aware_config(host_list, ibstat_file, ibnetdiscover_file, min_dist,
r = quick_regexp()
if r.search(r'^(VM_hostname)\s+(.+)', line):
vmhost = r.groups[1]
elif r.search(r'^(0x)(.+)', line):
sysimgguid = r.groups[1]
elif r.search(r'^(?!0{16})([a-f0-9]{16})$', line):
sysimgguid = r.groups[0]
sysimgguid_to_vmhost[sysimgguid] = vmhost
except BaseException as e:
logger.error('Failed to read ibstate file, message: {}.'.format(str(e)))
......
......@@ -3,7 +3,7 @@
# Server:
# - Product: HPE Apollo 6500
version: v0.5
version: v0.6
superbench:
enable: null
var:
......
......@@ -4,7 +4,7 @@
# - Product: G482-Z53
# - Link: https://www.gigabyte.cn/FileUpload/Global/MicroSite/553/G482-Z53.html
version: v0.5
version: v0.6
superbench:
enable: null
var:
......
version: v0.5
version: v0.6
superbench:
enable: null
monitor:
......
version: v0.5
version: v0.6
superbench:
enable: null
monitor:
......
version: v0.5
version: v0.6
superbench:
enable: null
monitor:
......
......@@ -3,7 +3,7 @@
# Azure NDm A100 v4
# reference: https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series
version: v0.5
version: v0.6
superbench:
enable: null
monitor:
......
# SuperBench Config
version: v0.5
version: v0.6
superbench:
enable: null
monitor:
......
# SuperBench Config
version: v0.5
version: v0.6
superbench:
enable: null
monitor:
......
- name: Runtime Environment Cleanup
hosts: all
gather_facts: false
tasks:
- name: Killing sb exec processes
shell: |
pgrep -ax sb | grep 'sb exec' | awk '{print $1}' | xargs kill -9 ||:
become: yes
......@@ -193,6 +193,10 @@ def check_env(self): # pragma: no cover
)
)
def cleanup(self): # pragma: no cover
"""Cleanup remaining processes on all nodes."""
self._ansible_client.run(self._ansible_client.get_playbook_config('cleanup.yaml'))
def fetch_results(self): # pragma: no cover
"""Fetch benchmark results on all nodes."""
try:
......@@ -410,7 +414,7 @@ def _run_proc(self, benchmark_name, mode, vars):
if isinstance(timeout, int):
# we do not expect timeout in ansible unless subprocess hangs
ansible_runner_config['timeout'] = timeout + 300
ansible_runner_config['timeout'] = timeout + 60
rc = self._ansible_client.run(ansible_runner_config, sudo=(not self._docker_config.skip))
return rc
......@@ -423,16 +427,20 @@ def run(self):
continue
benchmark_config = self._sb_benchmarks[benchmark_name]
for mode in benchmark_config.modes:
ansible_rc = 0
if mode.name == 'local':
Parallel(n_jobs=mode.proc_num if mode.parallel else 1)(
rc_list = Parallel(n_jobs=mode.proc_num if mode.parallel else 1)(
delayed(self._run_proc)(benchmark_name, mode, {
'proc_rank': proc_rank
}) for proc_rank in range(mode.proc_num)
)
ansible_rc = sum(rc_list)
elif mode.name == 'torch.distributed' or mode.name == 'mpi':
self._run_proc(benchmark_name, mode, {'proc_rank': 0})
ansible_rc = self._run_proc(benchmark_name, mode, {'proc_rank': 0})
else:
logger.warning('Unknown mode %s.', mode.name)
if ansible_rc != 0:
self.cleanup()
self.fetch_results()
self.__create_results_summary()
......@@ -53,9 +53,8 @@ def test_data_diagnosis(self):
test_raw_data_fake = str(self.parent_path / 'test_results_fake.jsonl')
test_rule_file_fake = str(self.parent_path / 'test_rules_fake.yaml')
diag2 = DataDiagnosis()
diag2._raw_data_df = file_handler.read_raw_data(test_raw_data_fake)
diag2._benchmark_metrics_dict = diag2._get_metrics_by_benchmarks(list(diag2._raw_data_df))
assert (len(diag2._raw_data_df) == 0)
self.assertRaises(FileNotFoundError, file_handler.read_raw_data, test_raw_data_fake)
diag2._benchmark_metrics_dict = diag2._get_metrics_by_benchmarks([])
assert (len(diag2._benchmark_metrics_dict) == 0)
metric_list = [
'gpu_temperature', 'gpu_power_limit', 'gemm-flops/FP64',
......@@ -68,8 +67,7 @@ def test_data_diagnosis(self):
}
)
# Test - read rules
rules = file_handler.read_rules(test_rule_file_fake)
assert (not rules)
self.assertRaises(FileNotFoundError, file_handler.read_rules, test_rule_file_fake)
rules = file_handler.read_rules(test_rule_file)
assert (rules)
# Test - _check_and_format_rules
......@@ -131,12 +129,12 @@ def test_data_diagnosis(self):
baseline = file_handler.read_baseline(test_baseline_file)
assert (diag1._get_baseline_of_metric(baseline, 'kernel-launch/event_overhead:0') == 0.00596)
assert (diag1._get_baseline_of_metric(baseline, 'kernel-launch/return_code') == 0)
assert (diag1._get_baseline_of_metric(baseline, 'mem-bw/H2D:0') == -1)
assert (diag1._get_baseline_of_metric(baseline, 'mem-bw/H2D:0') is None)
# Test - _parse_rules_and_baseline
# Negative case
fake_rules = file_handler.read_rules(test_rule_file_fake)
fake_rules = []
baseline = file_handler.read_baseline(test_baseline_file)
assert (diag2._parse_rules_and_baseline(fake_rules, baseline) is False)
self.assertRaises(Exception, diag2._parse_rules_and_baseline, fake_rules, baseline)
diag2 = DataDiagnosis()
diag2._raw_data_df = file_handler.read_raw_data(test_raw_data)
diag2._benchmark_metrics_dict = diag2._get_metrics_by_benchmarks(list(diag2._raw_data_df))
......@@ -146,7 +144,7 @@ def test_data_diagnosis(self):
rules['superbench']['rules']['fake'] = false_rules[0]
with open(test_rule_file_fake, 'w') as f:
yaml.dump(rules, f)
assert (diag1._parse_rules_and_baseline(fake_rules, baseline) is False)
self.assertRaises(Exception, diag1._parse_rules_and_baseline, fake_rules, baseline)
# Positive case
rules = file_handler.read_rules(test_rule_file)
assert (diag1._parse_rules_and_baseline(rules, baseline))
......@@ -198,7 +196,7 @@ def test_data_diagnosis(self):
json.loads(line)
assert ('Category' in line)
assert ('Defective Details' in line)
assert ('Index' in line)
assert ('index' in line)
# Test - generate_md_lines
lines = diag1.generate_md_lines(data_not_accept_df, diag1._sb_rules, 2)
assert (lines)
......@@ -293,6 +291,38 @@ def test_data_diagnosis_run(self):
expect_result = f.read()
assert (data_not_accept_read_from_json == expect_result)
def test_data_diagnosis_run_without_baseline(self):
"""Test for the run process of rule-based data diagnosis."""
test_raw_data = str(self.parent_path / 'test_results.jsonl')
test_rule_file = str(self.parent_path / 'test_rules_without_baseline.yaml')
test_baseline_file = None
# Test - output in excel
DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'excel')
assert (Path(self.output_excel_file).is_file())
# Test - output in json
DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'json')
assert (Path(self.output_json_file).is_file())
# Test - output in jsonl
DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'jsonl')
assert (Path(self.output_jsonl_file).is_file())
# Test - output in md
DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'md', round=2)
assert (Path(self.output_md_file).is_file())
# Test - output in html
DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'html', round=2)
assert (Path(self.output_html_file).is_file())
# Test - output all nodes results
DataDiagnosis().run(
test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'json', output_all=True
)
assert (Path(self.output_all_json_file).is_file())
def test_mutli_rules(self):
"""Test multi rules check feature."""
diag1 = DataDiagnosis()
......
......@@ -36,16 +36,13 @@ def test_file_handler(self):
# Test - read_raw_data
raw_data_df = file_handler.read_raw_data(test_raw_data)
assert (not raw_data_df.empty)
raw_data_df = file_handler.read_raw_data(test_raw_data_fake)
assert (raw_data_df.empty)
self.assertRaises(FileNotFoundError, file_handler.read_raw_data, test_raw_data_fake)
# Test - read rules
rules = file_handler.read_rules(test_rule_file_fake)
assert (not rules)
self.assertRaises(FileNotFoundError, file_handler.read_rules, test_rule_file_fake)
rules = file_handler.read_rules(test_rule_file)
assert (rules)
# Test - read baseline
baseline = file_handler.read_baseline(test_aseline_file_fake)
assert (not baseline)
self.assertRaises(FileNotFoundError, file_handler.read_baseline, test_aseline_file_fake)
baseline = file_handler.read_baseline(test_baseline_file)
assert (baseline)
# Test - generate_md_table
......
......@@ -83,8 +83,7 @@ def test_result_summary(self):
# Test - _parse_rules
# Negative case
rs2 = ResultSummary()
fake_rules = file_handler.read_rules(self.test_rule_file_fake)
assert (rs2._parse_rules(fake_rules) is False)
self.assertRaises(Exception, file_handler.read_rules, self.test_rule_file_fake)
rs2._raw_data_df = file_handler.read_raw_data(self.test_raw_data)
rs2._benchmark_metrics_dict = rs2._get_metrics_by_benchmarks(list(rs2._raw_data_df))
p = Path(self.test_rule_file)
......@@ -93,7 +92,7 @@ def test_result_summary(self):
rules['superbench']['rules']['fake'] = false_rules[0]
with open(self.test_rule_file_fake, 'w') as f:
yaml.dump(rules, f)
assert (rs1._parse_rules(fake_rules) is False)
assert (rs1._parse_rules([]) is False)
# Positive case
rules = file_handler.read_rules(self.test_rule_file)
assert (rs1._parse_rules(rules))
......
......@@ -28,11 +28,11 @@ def test_rule_base(self):
assert (len(rulebase1._raw_data_df) == 3)
# Negative case
test_rule_file_fake = str(self.parent_path / 'test_rules_fake.yaml')
test_raw_data_fake = str(self.parent_path / 'test_results_fake.jsonl')
rulebase2 = RuleBase()
rulebase2._raw_data_df = file_handler.read_raw_data(test_raw_data_fake)
rulebase2._benchmark_metrics_dict = rulebase2._get_metrics_by_benchmarks(list(rulebase2._raw_data_df))
assert (len(rulebase2._raw_data_df) == 0)
self.assertRaises(FileNotFoundError, file_handler.read_raw_data, test_raw_data_fake)
rulebase2._benchmark_metrics_dict = rulebase2._get_metrics_by_benchmarks([])
assert (len(rulebase2._benchmark_metrics_dict) == 0)
metric_list = [
'gpu_temperature', 'gpu_power_limit', 'gemm-flops/FP64',
......@@ -46,10 +46,8 @@ def test_rule_base(self):
)
# Test - _preprocess
rules = rulebase1._preprocess(test_raw_data_fake, test_rule_file)
assert (not rules)
rules = rulebase1._preprocess(test_raw_data, test_rule_file_fake)
assert (not rules)
self.assertRaises(Exception, rulebase1._preprocess, test_raw_data_fake, test_rule_file)
self.assertRaises(Exception, rulebase1._preprocess, test_raw_data, test_rule_file_fake)
rules = rulebase1._preprocess(test_raw_data, test_rule_file)
assert (rules)
......
......@@ -65,6 +65,29 @@ def test_rule_op(self):
self.assertRaises(Exception, RuleOp.variance, data_row, rule, summary_data_row, details, categories)
self.assertRaises(Exception, RuleOp.value, data_row, rule, summary_data_row, details, categories)
# Negative case, if baseline is 0 or None in 'variance' function, raise error
false_rule_and_baselines = [
{
'categories': 'KernelLaunch',
'criteria': 'lambda x:x>0.5',
'function': 'variance',
'metrics': {
'kernel-launch/event_overhead:0': 0,
}
},
{
'categories': 'KernelLaunch',
'criteria': 'lambda x:x>0.5',
'function': 'variance',
'metrics': {
'kernel-launch/event_overhead:1': None,
}
},
]
for rule in false_rule_and_baselines:
self.assertRaises(ValueError, RuleOp.variance, data_row, rule, summary_data_row, details, categories)
# Positive case
true_baselines = [
{
......@@ -132,7 +155,7 @@ def test_multi_rules_op(self):
]
label = {}
for rule in false_baselines:
self.assertRaises(Exception, RuleOp.multi_rules, rule, details, categories, label)
self.assertRaises(KeyError, RuleOp.multi_rules, false_baselines[0], details, categories, label)
true_baselines = [
{
......
# SuperBench rules
version: v0.6
superbench:
rules:
rule0:
function: value
criteria: lambda x:x>0
categories: KernelLaunch
metrics:
- kernel-launch/event_overhead:\d+
- kernel-launch/wall_overhead:\d+
rule1:
categories: Mem
store: True
metrics:
- mem-bw/H2D_Mem_BW:\d+
- mem-bw/D2H_Mem_BW:\d+
rule2:
function: multi_rules
criteria: 'lambda label: bool(min(label["rule1"].values())/max(label["rule1"].values())<0.95)'
categories: Mem
......@@ -50,7 +50,7 @@ def test_cublas_functions():
context = BenchmarkRegistry.create_benchmark_context(
'cublas-function',
platform=Platform.CUDA,
parameters='--num_warmup 10 --num_steps 10 --num_in_step 100 --config_json_str ' + custom_config_str
parameters=f"--num_warmup 10 --num_steps 10 --num_in_step 100 --config_json_str '{custom_config_str}'"
)
assert (BenchmarkRegistry.is_benchmark_context_valid(context))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment