Unverified Commit ff563b66 authored by Yifan Xiong's avatar Yifan Xiong Committed by GitHub
Browse files

Release - SuperBench v0.4.0 (#278)



__Description__

Cherry-pick bug fixes from v0.4.0 to main.

__Major Revisions__

* Bug - Fix issues for Ansible and benchmarks (#267)
* Tests - Refine test cases for microbenchmark (#268)
* Bug - Build openmpi with ucx support in rocm dockerfiles (#269)
* Benchmarks: Fix Bug - Fix fio build issue (#272)
* Docs - Unify metric and add doc for cublas and cudnn functions (#271)
* Monitor: Revision - Add 'monitor/' prefix to monitor metrics in result summary (#274)
* Bug - Fix bug of detecting if gpu_index is none (#275)
* Bug - Fix bugs in data diagnosis (#273)
* Bug - Fix issue that the root mpi rank may not be the first in the hostfile (#270)
* Benchmarks: Configuration - Update inference and network benchmarks in configs (#276)
* Docs - Upgrade version and release note (#277)
Co-authored-by: default avatarYuting Jiang <v-yutjiang@microsoft.com>
parent 682ed06a
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Unittest TestCase helpers."""
import os
import shutil
import tempfile
from pathlib import Path
class BenchmarkTestCase(object):
    """Base class for benchmark test case.

    Provides a per-class temp directory plus helpers to mock environment
    variables and create placeholder files for tests.

    Examples:
        Inherit from both BenchmarkTestCase and unittest.TestCase.
        ```
        class FooBenchmarkTestCase(BenchmarkTestCase, unittest.TestCase):
            def setUp(self):
                super().setUp()
                ...
        ```
    """
    def setUp(self):
        """Hook method for setting up the test fixture before exercising it."""
        pass

    def tearDown(self):
        """Hook method for deconstructing the test fixture after testing it."""
        pass

    @classmethod
    def setUpClass(cls):
        """Hook method for setting up class fixture before running tests in the class.

        Will create a temp directory and mock envs for all tests.
        Run once for the whole class.
        """
        # Backs createMockFiles and the default SB_MICRO_PATH mock.
        cls._tmp_dir = tempfile.mkdtemp(prefix='sbtest')
        # Maps mocked env var names to their original values (None if unset before).
        cls._curr_mock_envs = {}

    @classmethod
    def tearDownClass(cls):
        """Hook method for deconstructing the class fixture after running all tests in the class.

        Will restore original envs and cleanup temp directory.
        Run once for the whole class.
        """
        cls.cleanupMockEnvs(cls)
        shutil.rmtree(cls._tmp_dir)

    def createMockEnvs(self, envs=None):
        """Create mock envs for tests.

        Original values are recorded so cleanupMockEnvs can restore them.

        Args:
            envs (dict, optional): Environment variables to be mocked.
                Defaults to None and will mock SB_MICRO_PATH to temp directory.
        """
        if not envs:
            envs = {'SB_MICRO_PATH': self._tmp_dir}
        for name in envs:
            self._curr_mock_envs[name] = os.environ.get(name, None)
            os.environ[name] = envs[name]

    def cleanupMockEnvs(self):
        """Cleanup mock envs and restore original envs.

        Safe to call multiple times: the bookkeeping dict is cleared after
        restoring, and variables that were already removed are ignored.
        """
        for name in self._curr_mock_envs:
            if self._curr_mock_envs[name] is None:
                # Variable did not exist before mocking, so remove it again.
                # pop() instead of del: no KeyError if a test already unset it.
                os.environ.pop(name, None)
            else:
                os.environ[name] = self._curr_mock_envs[name]
        # Reset bookkeeping so repeated cleanup calls are no-ops.
        self._curr_mock_envs.clear()

    def createMockFiles(self, files, mode=0o755):
        """Create mock files for tests.

        Args:
            files (List[str]): List of file names, relative path will be created under temp directory.
            mode (int, optional): Octal integer for file mode. Defaults to 0o755.
        """
        for filename in files:
            filepath = Path(self._tmp_dir) / filename
            filepath.parent.mkdir(parents=True, exist_ok=True)
            filepath.touch(mode=mode, exist_ok=True)
......@@ -38,16 +38,17 @@ def setUp(self):
'host_password': 'pass',
})
)
_, self.test_mpi_host_file = tempfile.mkstemp()
def tearDown(self):
    """Hook method for deconstructing the test fixture after testing it."""
    # Remove the temporary hostfiles created for this test case.
    for tmp_file in (self.host_file, self.test_mpi_host_file):
        Path(tmp_file).unlink()
def test_init_config(self):
"""Test initial config of client."""
self.assertDictEqual(
self.ansible_client._config, {
'private_data_dir': None,
'host_pattern': 'all',
'cmdline': f'--forks 5 --inventory {self.host_file} --user user --ask-pass --ask-become-pass',
'passwords': {
......@@ -62,6 +63,63 @@ def test_update_mpi_config(self):
self.assertDictEqual(
self.ansible_client.update_mpi_config(self.ansible_client._config), {
**self.ansible_client._config,
'host_pattern': '10.0.0.10',
}
)
def test_update_mpi_config_for_different_inventory(self):
    """Test update_mpi_config of client for different inventory."""
    def _client_for(inventory):
        # Rewrite the shared temp hostfile and build a fresh client on top of it.
        with open(self.test_mpi_host_file, 'w') as hostfile:
            hostfile.write(inventory)
        return AnsibleClient(
            OmegaConf.create(
                {
                    'host_file': self.test_mpi_host_file,
                    'host_username': 'user',
                    'host_password': 'pass',
                }
            )
        )

    # (inventory content, expected mpi host_pattern) for:
    # out-of-order hosts, localhost only, and no hosts at all.
    cases = (
        ('all:\n hosts:\n 10.0.0.12:\n 10.0.0.11:\n 10.0.0.10:\n 10.0.0.13:\n 10.0.0.14:\n', '10.0.0.10'),
        ('all:\n hosts:\n localhost:\n', 'localhost'),
        ('all:\n hosts:\n', 'all[0]'),
    )
    for inventory, expected_pattern in cases:
        client = _client_for(inventory)
        self.assertDictEqual(
            client.update_mpi_config(client._config), {
                **client._config,
                'host_pattern': expected_pattern,
            }
        )
......@@ -71,7 +129,6 @@ def test_get_shell_config(self):
cmd = 'ls -la'
self.assertDictEqual(
self.ansible_client.get_shell_config(cmd), {
'private_data_dir': None,
'host_pattern': 'all',
'cmdline': f'--forks 5 --inventory {self.host_file} --user user --ask-pass --ask-become-pass',
'passwords': {
......@@ -87,7 +144,6 @@ def test_get_playbook_config(self):
"""Test get_playbook_config of client."""
self.assertDictEqual(
self.ansible_client.get_playbook_config('play', {'foo': 'bar'}), {
'private_data_dir': None,
'host_pattern': 'all',
'cmdline': f'--forks 5 --inventory {self.host_file} --user user --ask-pass --ask-become-pass',
'passwords': {
......
......@@ -244,37 +244,37 @@ def test_merge_monitor_metrics(self):
"""Test __merge_monitor_metrics."""
path = Path('tests/data/monitor/')
expected = {
'gpu_temperature:0': 50,
'gpu_temperature:1': 27,
'gpu_temperature:2': 24,
'gpu_temperature:3': 26,
'gpu_temperature:4': 25,
'gpu_temperature:5': 25,
'gpu_temperature:6': 23,
'gpu_temperature:7': 26,
'gpu_power_limit:0': 250,
'gpu_power_limit:1': 200,
'gpu_power_limit:2': 250,
'gpu_power_limit:3': 250,
'gpu_power_limit:4': 250,
'gpu_power_limit:5': 250,
'gpu_power_limit:6': 250,
'gpu_power_limit:7': 250,
'gpu_corrected_ecc:0': 12,
'gpu_corrected_ecc:1': 0,
'gpu_corrected_ecc:2': 0,
'gpu_corrected_ecc:3': 0,
'gpu_corrected_ecc:4': 0,
'gpu_corrected_ecc:5': 0,
'gpu_corrected_ecc:6': 0,
'gpu_corrected_ecc:7': 0,
'gpu_uncorrected_ecc:0': 0,
'gpu_uncorrected_ecc:1': 0,
'gpu_uncorrected_ecc:2': 0,
'gpu_uncorrected_ecc:3': 0,
'gpu_uncorrected_ecc:4': 0,
'gpu_uncorrected_ecc:5': 0,
'gpu_uncorrected_ecc:6': 0,
'gpu_uncorrected_ecc:7': 0
'monitor/gpu_temperature:0': 50,
'monitor/gpu_temperature:1': 27,
'monitor/gpu_temperature:2': 24,
'monitor/gpu_temperature:3': 26,
'monitor/gpu_temperature:4': 25,
'monitor/gpu_temperature:5': 25,
'monitor/gpu_temperature:6': 23,
'monitor/gpu_temperature:7': 26,
'monitor/gpu_power_limit:0': 250,
'monitor/gpu_power_limit:1': 200,
'monitor/gpu_power_limit:2': 250,
'monitor/gpu_power_limit:3': 250,
'monitor/gpu_power_limit:4': 250,
'monitor/gpu_power_limit:5': 250,
'monitor/gpu_power_limit:6': 250,
'monitor/gpu_power_limit:7': 250,
'monitor/gpu_corrected_ecc:0': 12,
'monitor/gpu_corrected_ecc:1': 0,
'monitor/gpu_corrected_ecc:2': 0,
'monitor/gpu_corrected_ecc:3': 0,
'monitor/gpu_corrected_ecc:4': 0,
'monitor/gpu_corrected_ecc:5': 0,
'monitor/gpu_corrected_ecc:6': 0,
'monitor/gpu_corrected_ecc:7': 0,
'monitor/gpu_uncorrected_ecc:0': 0,
'monitor/gpu_uncorrected_ecc:1': 0,
'monitor/gpu_uncorrected_ecc:2': 0,
'monitor/gpu_uncorrected_ecc:3': 0,
'monitor/gpu_uncorrected_ecc:4': 0,
'monitor/gpu_uncorrected_ecc:5': 0,
'monitor/gpu_uncorrected_ecc:6': 0,
'monitor/gpu_uncorrected_ecc:7': 0
}
self.assertEqual(self.runner._SuperBenchRunner__merge_monitor_metrics(path), expected)
......@@ -63,7 +63,7 @@ endif
# Build FIO from commit d83ac9 (fio-3.28 tag).
fio:
ifneq (,$(wildcard fio/Makefile))
cd ./fio && ./configure --prefix=$(SB_MICRO_PATH) && make -j && make install
cd ./fio && ./configure --prefix=$(SB_MICRO_PATH) --disable-native && make -j && make install
endif
# Build rccl-tests from commit dc1ad48 of develop branch (default branch).
......
---
slug: release-sb-v0.4
title: Releasing SuperBench v0.4
author: Peng Cheng
author_title: SuperBench Team
author_url: https://github.com/cp5555
author_image_url: https://github.com/cp5555.png
tags: [superbench, announcement, release]
---
We are very happy to announce that **SuperBench 0.4.0 version** is officially released today!
You can install and try superbench by following [Getting Started Tutorial](https://microsoft.github.io/superbenchmark/docs/getting-started/installation).
## SuperBench 0.4.0 Release Notes
### SuperBench Framework
#### Monitor
- Add monitor framework for NVIDIA GPU, CPU, memory and disk.
#### Data Diagnosis and Analysis
- Support baseline-based data diagnosis.
- Support basic analysis feature (boxplot figure, outlier detection, etc.).
### Single-node Validation
#### Micro Benchmarks
- CPU Memory Validation (tool: Intel Memory Latency Checker).
- GPU Copy Bandwidth (tool: built by MSRA).
- Add ORT Model on AMD GPU platform.
- Add inference backend TensorRT.
- Add inference backend ORT.
### Multi-node Validation
#### Micro Benchmarks
- IB Networking validation.
- TCP validation (tool: TCPing).
- GPCNet Validation (tool: GPCNet).
### Other Improvement
1. Enhancement
- Add pipeline for AMD docker.
- Integrate system config info script with SuperBench.
- Support FP32 mode without TF32.
- Refine unit test for microbenchmark.
- Unify metric names for all benchmarks.
2. Document
- Add benchmark list.
- Add monitor document.
- Add data diagnosis document.
......@@ -101,7 +101,7 @@ module.exports = {
announcementBar: {
id: 'supportus',
content:
'📢 <a href="https://microsoft.github.io/superbenchmark/blog/release-sb-v0.3">v0.3.0</a> has been released! ' +
'📢 <a href="https://microsoft.github.io/superbenchmark/blog/release-sb-v0.4">v0.4.0</a> has been released! ' +
'⭐️ If you like SuperBench, give it a star on <a target="_blank" rel="noopener noreferrer" href="https://github.com/microsoft/superbenchmark">GitHub</a>! ⭐️',
},
algolia: {
......
{
"name": "superbench-website",
"version": "0.3.0",
"version": "0.4.0",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
......
{
"name": "superbench-website",
"version": "0.3.0",
"version": "0.4.0",
"private": true,
"scripts": {
"docusaurus": "docusaurus",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment