Benchmarks: micro benchmarks - add python code for DirectXGPUMemBw (#547)

**Description** Add Python code for DirectXGPUMemBw.

Benchmarks: micro benchmarks - add python code for DirectXGPUMemBw (#547)
**Description** Add Python code for DirectXGPUMemBw.
af4cfd5b · Yuting Jiang · GitHub · f1d608ae · af4cfd5b · af4cfd5b
Unverified Commit af4cfd5b authored Jul 05, 2023 by Yuting Jiang Committed by GitHub Jul 05, 2023
8 changed files
--- a/.github/workflows/build-win.yml
+++ b/.github/workflows/build-win.yml
@@ -23,6 +23,7 @@ jobs:
      run: |
        docker system prune -a -f
        docker volume prune -a -f
+      shell: pwsh
    - name: Build Docker image
      working-directory: .
      shell: pwsh

--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -31,6 +31,7 @@ from superbench.benchmarks.micro_benchmarks.rocm_memory_bw_performance import Ro
 from superbench.benchmarks.micro_benchmarks.sharding_matmul import ShardingMatmul
 from superbench.benchmarks.micro_benchmarks.tcp_connectivity import TCPConnectivityBenchmark
 from superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance import TensorRTInferenceBenchmark
+from superbench.benchmarks.micro_benchmarks.directx_mem_bw_performance import DirectXGPUMemBw
 from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops

 __all__ = [
@@ -62,5 +63,6 @@ __all__ = [
    'ShardingMatmul',
    'TCPConnectivityBenchmark',
    'TensorRTInferenceBenchmark',
+    'DirectXGPUMemBw',
    'DirectXGPUCoreFlops',
 ]
--- a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the DirectXGPUMemBw performance benchmarks."""
+
+import os
+
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+
+class DirectXGPUMemBw(MicroBenchmarkWithInvoke):
+    """The DirectXGPUMemBw benchmark class."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+        self._bin_name = 'DirectXGPUMemRwBw.exe'
+        self._modes = ['read', 'write', 'readwrite']
+
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+        self._parser.add_argument(
+            '--num_warm_up',
+            type=int,
+            default=0,
+            required=False,
+            help='Number of warm up rounds.',
+        )
+        self._parser.add_argument(
+            '--num_loop',
+            type=int,
+            default=100,
+            required=False,
+            help='Number of loop times to measure the performance.',
+        )
+        self._parser.add_argument(
+            '--size',
+            type=int,
+            default=None,
+            required=False,
+            help='Size of data for GPU copy.',
+        )
+        self._parser.add_argument(
+            '--minbytes',
+            type=int,
+            default=4096,
+            required=False,
+            help='Lower data size bound to test.',
+        )
+        self._parser.add_argument(
+            '--maxbytes',
+            type=int,
+            default=1024 * 1024 * 1024,
+            required=False,
+            help='Upper data size bound to test.',
+        )
+        self._parser.add_argument(
+            '--check_data',
+            action='store_true',
+            required=False,
+            help='Whether check data correctness.',
+        )
+        self._parser.add_argument(
+            '--mode',
+            type=str,
+            nargs='+',
+            default=list(),
+            help='Memory operation mode. E.g. {}.'.format(' '.join(self._modes)),
+        )
+
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking."""
+        if not super()._preprocess():
+            return False
+
+        self._args.mode = [m.lower() for m in self._args.mode]
+        for mode in self._args.mode:
+            if mode not in self._modes:
+                logger.warning(
+                    'Unsupported mode - benchmark: {}, mode: {}, expected: {}.'.format(self._name, mode, self._modes)
+                )
+                self._args.mode.remove(mode)
+
+        if len(self._args.mode) == 0:
+            logger.error('No valid operation modes are provided.')
+            return False
+
+        for mode in self._args.mode:
+            command = os.path.join(self._args.bin_dir, self._bin_name)
+            command += (' --num_warm_up ' + str(self._args.num_warm_up))
+            command += (' --num_loop ' + str(self._args.num_loop))
+            if self._args.size is not None:
+                command += (' --size ' + str(self._args.size))
+            else:
+                command += (' --minbytes ' + str(self._args.minbytes))
+                command += (' --maxbytes ' + str(self._args.maxbytes))
+            if self._args.check_data:
+                command += (' --check_data')
+            command += (' --' + mode)
+            self._commands.append(command)
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to process raw results and save the summarized results.
+
+        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        mode = self._args.mode[cmd_idx]
+        self._result.add_raw_data('raw_output_' + mode, raw_output, self._args.log_raw_data)
+
+        valid = True
+
+        content = raw_output.splitlines()
+        try:
+            for line in content:
+                if 'GPUMemBw:' in line:
+                    size = int(line.split()[-3])
+                    bw = float(line.split()[-2])
+                    self._result.add_result(f'{mode}_{size}_bw', bw)
+                if 'error' in line.lower():
+                    valid = False
+        except BaseException:
+            valid = False
+        finally:
+            if not valid:
+                logger.error(
+                    'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
+                        self._curr_run_index, self._name, raw_output
+                    )
+                )
+                return False
+        return True
+
+
+BenchmarkRegistry.register_benchmark('directx-gpu-mem-bw', DirectXGPUMemBw, platform=Platform.DIRECTX)
--- a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h
+++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h
@@ -68,7 +68,7 @@ class BenchmarkOptions : public Options {
        min_size = get_cmd_line_argument_int("--minbytes", 4 * 1024);
        max_size =
            get_cmd_line_argument_ulonglong("--maxbytes", static_cast<unsigned long long>(1LL * 1024 * 1024 * 1024));
-        check_data = get_cmd_line_argument_bool("--check");
+        check_data = get_cmd_line_argument_bool("--check_data");
        if (get_cmd_line_argument_bool("--read")) {
            mem_type = Memtype::Read;
        }

--- a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj
+++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj
@@ -19,12 +19,14 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <TargetName>DirectXGPUMemRwBw</TargetName>
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v143</PlatformToolset>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <TargetName>DirectXGPUMemRwBw</TargetName>
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v143</PlatformToolset>

--- a/superbench/benchmarks/micro_benchmarks/micro_base.py
+++ b/superbench/benchmarks/micro_benchmarks/micro_base.py
@@ -180,7 +180,7 @@ class MicroBenchmarkWithInvoke(MicroBenchmark):
                )
            )

-            output = run_command(self._commands[cmd_idx], flush_output=self._args.log_flushing)
+            output = run_command(self._commands[cmd_idx], flush_output=self._args.log_flushing, cwd=self._args.bin_dir)
            if output.returncode != 0:
                self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
                logger.error(

--- a/superbench/common/utils/process.py
+++ b/superbench/common/utils/process.py
@@ -10,13 +10,14 @@ import shlex
 from superbench.common.utils import stdout_logger


-def run_command(command, quiet=False, flush_output=False):
+def run_command(command, quiet=False, flush_output=False, cwd=None):
    """Run command in string format, return the result with stdout and stderr.

    Args:
        command (str): command to run.
        quiet (bool): no stdout display of the command if quiet is True.
        flush_output (bool): enable real-time output flush or not when running the command.
+        cwd (str): working directory to run the command.

    Return:
        result (subprocess.CompletedProcess): The return value from subprocess.run().
@@ -26,7 +27,11 @@ def run_command(command, quiet=False, flush_output=False):
        try:
            args = shlex.split(command)
            process = subprocess.Popen(
-                args, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True
+                args,
+                cwd=os.getcwd() if cwd is None else cwd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                universal_newlines=True
            )
            output = ''
            for line in process.stdout:
@@ -43,7 +48,13 @@ def run_command(command, quiet=False, flush_output=False):
            return subprocess.CompletedProcess(args=args, returncode=-1, stdout=str(e))
    else:
        result = subprocess.run(
-            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
+            command,
+            cwd=os.getcwd() if cwd is None else cwd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            shell=True,
+            check=False,
+            universal_newlines=True
        )
        if not quiet:
            stdout_logger.log(result.stdout)

--- a/tests/benchmarks/micro_benchmarks/test_directx_mem_bw_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_directx_mem_bw_performance.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for DirectXGPUMemBw benchmark."""
+
+import numbers
+
+from tests.helper import decorator
+from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
+
+
+@decorator.directx_test
+def test_directx_gpu_mem_bw():
+    """Test DirectXGPUMemBw benchmark."""
+    # Test for default configuration
+    context = BenchmarkRegistry.create_benchmark_context(
+        'directx-gpu-mem-bw',
+        platform=Platform.DIRECTX,
+        parameters=r'--num_warm_up 0 --num_loop 100 --size 1073741824 --mode read write'
+    )
+
+    assert (BenchmarkRegistry.is_benchmark_context_valid(context))
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+
+    # Check basic information.
+    assert (benchmark)
+    assert (benchmark.name == 'directx-gpu-mem-bw')
+    assert (benchmark.type == BenchmarkType.MICRO)
+
+    # Check parameters specified in BenchmarkContext.
+    assert (benchmark._args.num_warm_up == 0)
+    assert (benchmark._args.num_loop == 100)
+    assert (benchmark._args.size == 1073741824)
+    assert (sorted(benchmark._args.mode) == ['read', 'write'])
+
+    # Check results and metrics.
+    assert (benchmark.run_count == 1)
+    assert (benchmark.return_code == ReturnCode.SUCCESS)
+    assert ('raw_output_read' in benchmark.raw_data)
+    assert ('raw_output_write' in benchmark.raw_data)
+    assert (len(benchmark.raw_data['raw_output_read']) == 1)
+    assert (len(benchmark.raw_data['raw_output_write']) == 1)
+    assert (isinstance(benchmark.raw_data['raw_output_read'][0], str))
+    assert (isinstance(benchmark.raw_data['raw_output_write'][0], str))
+
+    assert ('read_1073741824_bw' in benchmark.result)
+    assert ('write_1073741824_bw' in benchmark.result)
+    assert (len(benchmark.result['read_1073741824_bw']) == 1)
+    assert (len(benchmark.result['write_1073741824_bw']) == 1)
+    assert (isinstance(benchmark.result['read_1073741824_bw'][0], numbers.Number))
+    assert (isinstance(benchmark.result['write_1073741824_bw'][0], numbers.Number))