Docs - Add benchmark metrics for cpu-memory-bw-latency (#264)

**Description** Add benchmark metrics for cpu-memory-bw-latency.

Docs - Add benchmark metrics for cpu-memory-bw-latency (#264)
**Description** Add benchmark metrics for cpu-memory-bw-latency.
10012a0a · Ziyue Yang · GitHub · b6781968 · 10012a0a · 10012a0a
Unverified Commit 10012a0a authored Dec 13, 2021 by Ziyue Yang Committed by GitHub Dec 13, 2021
3 changed files
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -108,6 +108,25 @@ Inference performance of the torchvision models using ONNXRuntime. Currently the

 ## Communication Benchmarks

+### `cpu-memory-bw-latency`
+
+#### Introduction
+
+Measure the memory copy bandwidth and latency across different CPU NUMA nodes,
+performed by [Intel MLC Tool](https://www.intel.com/content/www/us/en/developer/articles/tool/intelr-memory-latency-checker.html).
+
+#### Metrics
+
+| Name                                                                    | Unit             | Description                                                         |
+|-------------------------------------------------------------------------|------------------|---------------------------------------------------------------------|
+| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_bw  | bandwidth (MB/s) | Former NUMA to latter NUMA memory bandwidth.                        |
+| cpu-memory-bw-latency/mem\_latency\_matrix\_numa\_[0-9]+\_[0-9]+\_lat   | time (ns)        | Former NUMA to latter NUMA memory latency.                          |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_all\_reads\_bw               | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, full read.                      |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_3\_1\_reads-writes\_bw       | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 3 : 1.           |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_2\_1\_reads-writes\_bw       | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 2 : 1.           |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_1\_1\_reads-writes\_bw       | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 1 : 1.           |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_stream-triad\_like\_bw       | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, with stream-triad like pattern. |
+
 ### `mem-bw`

 #### Introduction

--- a/superbench/benchmarks/micro_benchmarks/cpu_memory_bw_latency_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/cpu_memory_bw_latency_performance.py
@@ -90,13 +90,13 @@ def _process_raw_result(self, cmd_idx, raw_output):
            return False
        mlc_test = mlc_test.split(';')[0]
        if 'max_bandwidth' in mlc_test:
-            measure = 'BW'
+            measure = 'bw'
            out_table = self._parse_max_bw(raw_output)
        elif 'bandwidth_matrix' in mlc_test:
-            measure = 'BW'
+            measure = 'bw'
            out_table = self._parse_bw_latency(raw_output)
        elif 'latency_matrix' in mlc_test:
-            measure = 'Latency'
+            measure = 'lat'
            out_table = self._parse_bw_latency(raw_output)
        else:
            logger.error('Invalid option {} to run the {} command'.format(mlc_test, self._commands[cmd_idx]))
@@ -112,9 +112,9 @@ def _process_raw_result(self, cmd_idx, raw_output):
        for key in out_table.keys():
            for index in range(len(out_table[key])):
                if 'max_bandwidth' in mlc_test:
-                    metric = 'Mem_{}_{}_{}'.format(mlc_test, key, measure)
+                    metric = 'mem_{}_{}_{}'.format(mlc_test, key, measure).lower()
                else:
-                    metric = 'Mem_{}_{}_{}_{}'.format(mlc_test, key, str(index), measure)
+                    metric = 'mem_{}_{}_{}_{}'.format(mlc_test, key, str(index), measure).lower()
                self._result.add_result(metric, float(out_table[key][index]))
        return True


--- a/tests/benchmarks/micro_benchmarks/test_cpu_memory_bw_latency_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_cpu_memory_bw_latency_performance.py
@@ -94,10 +94,10 @@ def test_cpu_mem_bw_latency_benchmark_result_parsing(self):
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert ('raw_output_0' in benchmark.raw_data)
        assert ([test_raw_output] == benchmark.raw_data['raw_output_0'])
-        assert ([82542.2] == benchmark.result['Mem_bandwidth_matrix_numa_0_0_BW'])
-        assert ([76679.9] == benchmark.result['Mem_bandwidth_matrix_numa_0_1_BW'])
-        assert ([76536.0] == benchmark.result['Mem_bandwidth_matrix_numa_1_0_BW'])
-        assert ([82986.5] == benchmark.result['Mem_bandwidth_matrix_numa_1_1_BW'])
+        assert ([82542.2] == benchmark.result['mem_bandwidth_matrix_numa_0_0_bw'])
+        assert ([76679.9] == benchmark.result['mem_bandwidth_matrix_numa_0_1_bw'])
+        assert ([76536.0] == benchmark.result['mem_bandwidth_matrix_numa_1_0_bw'])
+        assert ([82986.5] == benchmark.result['mem_bandwidth_matrix_numa_1_1_bw'])

        # Positive case - valid latency matrix output.
        test_raw_output = """
@@ -118,10 +118,10 @@ def test_cpu_mem_bw_latency_benchmark_result_parsing(self):
        assert ('raw_output_1' in benchmark.raw_data)
        assert ([test_raw_output] == benchmark.raw_data['raw_output_1'])

-        assert ([87.0] == benchmark.result['Mem_latency_matrix_numa_0_0_Latency'])
-        assert ([101.0] == benchmark.result['Mem_latency_matrix_numa_0_1_Latency'])
-        assert ([101.9] == benchmark.result['Mem_latency_matrix_numa_1_0_Latency'])
-        assert ([86.9] == benchmark.result['Mem_latency_matrix_numa_1_1_Latency'])
+        assert ([87.0] == benchmark.result['mem_latency_matrix_numa_0_0_lat'])
+        assert ([101.0] == benchmark.result['mem_latency_matrix_numa_0_1_lat'])
+        assert ([101.9] == benchmark.result['mem_latency_matrix_numa_1_0_lat'])
+        assert ([86.9] == benchmark.result['mem_latency_matrix_numa_1_1_lat'])

        # Positive case - valid max bandwidth output.
        test_raw_output = """
@@ -148,11 +148,11 @@ def test_cpu_mem_bw_latency_benchmark_result_parsing(self):
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert ('raw_output_2' in benchmark.raw_data)
        assert ([test_raw_output] == benchmark.raw_data['raw_output_2'])
-        assert ([165400.60] == benchmark.result['Mem_max_bandwidth_ALL_Reads_BW'])
-        assert ([154975.19] == benchmark.result['Mem_max_bandwidth_3_1_Reads-Writes_BW'])
-        assert ([158433.32] == benchmark.result['Mem_max_bandwidth_2_1_Reads-Writes_BW'])
-        assert ([157352.05] == benchmark.result['Mem_max_bandwidth_1_1_Reads-Writes_BW'])
-        assert ([157878.32] == benchmark.result['Mem_max_bandwidth_Stream-triad_like_BW'])
+        assert ([165400.60] == benchmark.result['mem_max_bandwidth_all_reads_bw'])
+        assert ([154975.19] == benchmark.result['mem_max_bandwidth_3_1_reads-writes_bw'])
+        assert ([158433.32] == benchmark.result['mem_max_bandwidth_2_1_reads-writes_bw'])
+        assert ([157352.05] == benchmark.result['mem_max_bandwidth_1_1_reads-writes_bw'])
+        assert ([157878.32] == benchmark.result['mem_max_bandwidth_stream-triad_like_bw'])

        # Negative case - invalid raw output.
        assert (benchmark._process_raw_result(0, 'Invalid raw output') is False)