Unverified Commit 9c35e80a authored by Hongtao Zhang's avatar Hongtao Zhang Committed by GitHub
Browse files

Benchmarks: micro benchmarks - add general CPU bandwidth and latency benchmark (#662)



**Description**
Add micro benchmark to measure general CPU bandwidth and latency without 'mlc'.

Test output:
```
{
"cpu-memory-bw-latency/return_code": 0,
"cpu-memory-bw-latency/mem_bandwidth_matrix_numa_0_1_bw": 5388.75021,
"cpu-memory-bw-latency/mem_bandwidth_matrix_numa_0_1_lat": 0.185571786,
"cpu-memory-bw-latency/mem_bandwidth_matrix_numa_1_0_bw": 4634.82028,
"cpu-memory-bw-latency/mem_bandwidth_matrix_numa_1_0_lat": 0.215758096
}
```

---------
Co-authored-by: default avatarhongtaozhang <hongtaozhang@microsoft.com>
parent a8a7bed2
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
cmake_minimum_required(VERSION 3.18)
project(cpu_copy LANGUAGES CXX)
# Probe for a CUDA toolkit first; if absent, fall back to the ROCm path below.
find_package(CUDAToolkit QUIET)
# Cuda environment
if(CUDAToolkit_FOUND)
message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
include(../cuda_common.cmake)
# Plain C++ build of the copy benchmark; libnuma provides NUMA-aware
# allocation (numa_alloc_onnode) and affinity (numa_run_on_node).
add_executable(cpu_copy cpu_copy.cpp)
set_property(TARGET cpu_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
target_link_libraries(cpu_copy numa)
else()
# ROCm environment
include(../rocm_common.cmake)
find_package(hip QUIET)
if(hip_FOUND)
message(STATUS "Found ROCm: " ${HIP_VERSION})
# Convert cuda code to hip code in cpp
execute_process(COMMAND hipify-perl -print-stats -o cpu_copy.cpp cpu_copy.cu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
# link hip device lib
add_executable(cpu_copy cpu_copy.cpp)
include(CheckSymbolExists)
# hipDeviceMallocUncached only exists in newer ROCm releases; gate the
# HIP_UNCACHED_MEMORY compile definition on its availability.
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
if(${HIP_UNCACHED_MEMORY})
target_compile_definitions(cpu_copy PRIVATE HIP_UNCACHED_MEMORY)
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
target_link_libraries(cpu_copy numa hip::device)
else()
message(FATAL_ERROR "No CUDA or ROCm environment found.")
endif()
endif()
install(TARGETS cpu_copy RUNTIME DESTINATION bin)
#include <chrono>
#include <cinttypes> // for SCNu64
#include <cstdio>    // for sscanf
#include <cstring>   // for memcpy
#include <getopt.h>
#include <iomanip> // for setting precision
#include <iostream>
#include <numa.h>
#include <numeric>
#include <vector>
// Command-line options accepted by this program; filled in by ParseOpts().
struct Opts {
// Data buffer size in bytes for copy benchmark.
uint64_t size = 0;
// Number of warm up rounds (unmeasured copies) to run.
uint64_t num_warm_up = 0;
// Number of measured copy loops to run.
uint64_t num_loops = 0;
// Whether check data after copy.
bool check_data = false;
};
/**
 * @brief Print the usage instructions for this program.
 *
 * Writes the expected command line, including the required numeric
 * arguments and the optional --check_data flag, to standard output.
 */
void PrintUsage() {
    const char *usage_text = "Usage: cpu_copy --size <size> --num_warm_up <num_warm_up> "
                             "--num_loops <num_loops> [--check_data]";
    std::cout << usage_text << std::endl;
}
/**
 * @brief Checks if the system has memory available for a specific NUMA node.
 *
 * Empty NUMA nodes in Grace CPU are reserved for multi-instance GPUs (MIG).
 *
 * @param node The identifier of the NUMA node to check.
 * @return true if the specified NUMA node has memory available, false otherwise.
 */
bool HasMemForNumaNode(int node) {
    // numa_node_size64() reports the node's memory size in bytes and returns
    // -1 on error. It is a C API and never throws, so the original try/catch
    // was dead code. Use long long to match the API's return type exactly
    // (the original assigned to long, which truncates on ILP32 platforms).
    long long mem_size = numa_node_size64(node, nullptr);
    if (mem_size < 0) {
        std::cerr << "Failed to get memory size for NUMA node " << node << std::endl;
        return false;
    }
    return mem_size > 0;
}
/**
 * @brief Checks if the system has CPUs available for a specific NUMA node.
 *
 * Determines whether any CPU is assigned to the given NUMA node so that CPU
 * affinity can be set to it. Memory-only (CPU-less) NUMA nodes — e.g. the
 * GPU-backed nodes on Grace systems — yield false.
 *
 * @param node The identifier of the NUMA node to check.
 * @return true if the specified NUMA node has CPUs available, false otherwise.
 */
bool HasCPUsForNumaNode(int node) {
    struct bitmask *cpu_mask = numa_allocate_cpumask();
    const int rc = numa_node_to_cpus(node, cpu_mask);
    bool has_cpus = false;
    if (rc == 0) {
        // A zero population count means the node is memory-only.
        has_cpus = numa_bitmask_weight(cpu_mask) > 0;
    } else {
        std::cerr << "Failed to get CPU mask for NUMA node " << node << ". ERROR: " << strerror(errno) << std::endl;
    }
    numa_bitmask_free(cpu_mask);
    return has_cpus;
}
/**
 * @brief Parses command-line options for the CPU copy performance benchmark.
 *
 * Processes the command-line arguments and fills in *opts. The --size,
 * --num_warm_up and --num_loops options are mandatory; --check_data is
 * optional. On any failure the usage text is printed.
 *
 * @param argc The number of command-line arguments.
 * @param argv The array of command-line arguments.
 * @param opts Output parameter; parsed options are stored here.
 * @return 0 on success, -1 on failure.
 */
int ParseOpts(int argc, char **argv, Opts *opts) {
    enum class OptIdx { kSize, kNumWarmUp, kNumLoops, kEnableCheckData };
    // NOTE: getopt_long() requires the option array to be terminated by an
    // all-zero element; the original array lacked it (undefined behavior).
    const struct option options[] = {{"size", required_argument, nullptr, static_cast<int>(OptIdx::kSize)},
                                     {"num_warm_up", required_argument, nullptr, static_cast<int>(OptIdx::kNumWarmUp)},
                                     {"num_loops", required_argument, nullptr, static_cast<int>(OptIdx::kNumLoops)},
                                     {"check_data", no_argument, nullptr, static_cast<int>(OptIdx::kEnableCheckData)},
                                     {nullptr, 0, nullptr, 0}};
    bool size_specified = false;
    bool num_warm_up_specified = false;
    bool num_loops_specified = false;
    bool parse_err = false;
    while (!parse_err) {
        int opt_idx = 0;
        int getopt_ret = getopt_long(argc, argv, "", options, &opt_idx);
        if (getopt_ret == -1) {
            // End of the option list: all three numeric options are required.
            parse_err = !size_specified || !num_warm_up_specified || !num_loops_specified;
            break;
        }
        if (getopt_ret == '?') {
            // Unknown option or missing argument; getopt already printed a message.
            parse_err = true;
            break;
        }
        // Dispatch on the value getopt_long returned (the option's val field),
        // rather than the longindex out-parameter.
        // Use SCNu64 so parsing uint64_t is portable ("%lu" assumes LP64).
        switch (getopt_ret) {
        case static_cast<int>(OptIdx::kSize):
            if (1 != sscanf(optarg, "%" SCNu64, &(opts->size))) {
                std::cerr << "Invalid size: " << optarg << std::endl;
                parse_err = true;
            } else {
                size_specified = true;
            }
            break;
        case static_cast<int>(OptIdx::kNumWarmUp):
            if (1 != sscanf(optarg, "%" SCNu64, &(opts->num_warm_up))) {
                std::cerr << "Invalid num_warm_up: " << optarg << std::endl;
                parse_err = true;
            } else {
                num_warm_up_specified = true;
            }
            break;
        case static_cast<int>(OptIdx::kNumLoops):
            if (1 != sscanf(optarg, "%" SCNu64, &(opts->num_loops))) {
                std::cerr << "Invalid num_loops: " << optarg << std::endl;
                parse_err = true;
            } else {
                num_loops_specified = true;
            }
            break;
        case static_cast<int>(OptIdx::kEnableCheckData):
            opts->check_data = true;
            break;
        default:
            parse_err = true;
        }
    }
    if (parse_err) {
        PrintUsage();
        return -1;
    }
    return 0;
}
/**
 * @brief Benchmark the memory copy performance between two NUMA nodes.
 *
 * Allocates one buffer on each node, times a single memcpy from the source
 * buffer to the destination buffer, and optionally verifies the copied data.
 *
 * @param src_node The source NUMA node from which memory will be copied.
 * @param dst_node The destination NUMA node to which memory will be copied.
 * @param opts Benchmark options (buffer size, data-check flag).
 * @return Copy time in nanoseconds; 0 on allocation/affinity failure;
 *         -1 if the data integrity check fails.
 */
double BenchmarkNUMACopy(int src_node, int dst_node, Opts &opts) {
    // Pin execution to a NUMA node that actually has CPUs associated; prefer
    // the source node and fall back to the destination node.
    int affinity_node = HasCPUsForNumaNode(src_node) ? src_node : dst_node;
    int ret = numa_run_on_node(affinity_node);
    if (ret != 0) {
        // Report the node we actually tried to bind to (the original printed
        // src_node even when it fell back to dst_node).
        std::cerr << "Failed to set CPU affinity to NUMA node " << affinity_node << std::endl;
        return 0;
    }
    // Allocate memory on the source and destination NUMA nodes.
    char *src = (char *)numa_alloc_onnode(opts.size, src_node);
    if (!src) {
        std::cerr << "Memory allocation failed on node" << src_node << std::endl;
        return 0;
    }
    char *dst = (char *)numa_alloc_onnode(opts.size, dst_node);
    if (!dst) {
        std::cerr << "Memory allocation failed on node" << dst_node << std::endl;
        numa_free(src, opts.size); // don't leak the source buffer
        return 0;
    }
    // Touch the source buffer so its pages are committed before timing.
    memset(src, 1, opts.size);
    // Measure the time taken for memcpy between nodes.
    auto start = std::chrono::high_resolution_clock::now();
    memcpy(dst, src, opts.size);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    double total_time_ns = diff.count() * 1e9; // Convert seconds to nanoseconds
    // Verify the copy BEFORE releasing the buffers. The original code called
    // memcmp() after numa_free() — a use-after-free.
    bool data_ok = true;
    if (opts.check_data) {
        data_ok = (memcmp(src, dst, opts.size) == 0);
        if (!data_ok) {
            std::cerr << "Data integrity check failed!" << std::endl;
        }
    }
    numa_free(src, opts.size);
    numa_free(dst, opts.size);
    return data_ok ? total_time_ns : -1;
}
/**
 * @brief Runs the CPU copy benchmark between a pair of NUMA nodes.
 *
 * Executes opts.num_warm_up unmeasured copies, then opts.num_loops measured
 * copies, and returns the average time per copy.
 *
 * @param src_node The source NUMA node from which data will be copied.
 * @param dst_node The destination NUMA node to which data will be copied.
 * @param opts Benchmark options (buffer size, loop counts, data checking).
 * @return Average copy time in nanoseconds, or 0 if num_loops is 0.
 */
double RunCPUCopyBenchmark(int src_node, int dst_node, Opts &opts) {
    if (opts.num_loops == 0) {
        // Guard against division by zero (the original returned NaN here).
        return 0;
    }
    // Warm up rounds; results are discarded.
    for (uint64_t i = 0; i < opts.num_warm_up; i++) {
        BenchmarkNUMACopy(src_node, dst_node, opts);
    }
    // Measured rounds. The counters are uint64_t to match Opts (the original
    // compared a signed int against a uint64_t bound).
    double time_used_ns = 0;
    for (uint64_t i = 0; i < opts.num_loops; i++) {
        time_used_ns += BenchmarkNUMACopy(src_node, dst_node, opts);
    }
    return time_used_ns / opts.num_loops;
}
/**
 * @brief Entry point: runs the CPU copy benchmark over all NUMA node pairs.
 *
 * Parses options, verifies NUMA availability, then for every ordered pair of
 * distinct NUMA nodes with memory (and CPUs on at least one side) prints the
 * measured bandwidth (MB/s) and latency (ns per byte).
 *
 * @return 0 on success; ParseOpts' error code or 1 on environment errors.
 */
int main(int argc, char **argv) {
    Opts opts;
    int ret = ParseOpts(argc, argv, &opts);
    if (0 != ret) {
        return ret;
    }
    // The libnuma APIs are unusable if the system reports no NUMA support.
    if (-1 == numa_available()) {
        std::cerr << "NUMA is not available on this system!" << std::endl;
        return 1;
    }
    int num_of_numa_nodes = numa_num_configured_nodes();
    if (num_of_numa_nodes < 2) {
        std::cerr << "System has less than 2 NUMA nodes. Benchmark is not applicable." << std::endl;
        return 1;
    }
    // Run the benchmark for every ordered pair of distinct NUMA nodes.
    for (int src_node = 0; src_node < num_of_numa_nodes; src_node++) {
        if (!HasMemForNumaNode(src_node)) {
            // Skip nodes with no memory available.
            continue;
        }
        for (int dst_node = 0; dst_node < num_of_numa_nodes; dst_node++) {
            if (src_node == dst_node) {
                // Skip copies within the same NUMA node.
                continue;
            }
            if (!HasMemForNumaNode(dst_node)) {
                continue;
            }
            if (!HasCPUsForNumaNode(src_node) && !HasCPUsForNumaNode(dst_node)) {
                // At least one side must have CPUs to run the copy loop on.
                continue;
            }
            double time_used_ns = RunCPUCopyBenchmark(src_node, dst_node, opts);
            if (time_used_ns <= 0) {
                // Benchmark failed (allocation/affinity error or data-check
                // failure). The original divided by this value and printed
                // inf/NaN metrics; skip the pair instead.
                std::cerr << "Benchmark failed for NUMA nodes " << src_node << " -> " << dst_node << std::endl;
                continue;
            }
            double bw = opts.size / (time_used_ns / 1e9) / 1e6; // MB/s
            double latency = time_used_ns / opts.size;          // ns/byte
            // Output the result in the "metric_name: value" form the Python
            // wrapper parses.
            std::cout << "mem_bandwidth_matrix_numa_" << src_node << "_" << dst_node << "_bw: " << std::setprecision(9)
                      << bw << std::endl;
            std::cout << "mem_bandwidth_matrix_numa_" << src_node << "_" << dst_node << "_lat: " << std::setprecision(9)
                      << latency << std::endl;
        }
    }
    return 0;
}
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module for running the Intel MLC tool to measure memory bandwidth and latency."""
"""Module to measure memory bandwidth and latency."""
import os
import platform
from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, ReturnCode
......@@ -21,13 +22,14 @@ def __init__(self, name, parameters=''):
"""
super().__init__(name, parameters)
self._bin_name = 'mlc'
self._bin_name = 'mlc' if 'x86_64' in platform.machine() else 'cpu_copy'
self.__support_mlc_commands = ['bandwidth_matrix', 'latency_matrix', 'max_bandwidth']
def add_parser_arguments(self):
"""Add the specified arguments."""
super().add_parser_arguments()
# Add arguments for the Intel MLC tool.
self._parser.add_argument(
'--tests',
type=str,
......@@ -37,15 +39,39 @@ def add_parser_arguments(self):
help='The modes to run mlc with. Possible values are {}.'.format(' '.join(self.__support_mlc_commands))
)
def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
# Add arguments for the general CPU copy benchmark.
self._parser.add_argument(
'--size',
type=int,
default=256 * 1024**2,
required=False,
help='Size of data buffer in bytes for non mlc benchmark. Default is 256MB.',
)
Return:
True if _preprocess() succeed.
"""
if not super()._preprocess():
return False
self._parser.add_argument(
'--num_warm_up',
type=int,
default=20,
required=False,
help='Number of warm up rounds for non mlc benchmark. Default is 20.',
)
self._parser.add_argument(
'--num_loops',
type=int,
default=100,
required=False,
help='Number of data buffer copies performed for non mlc benchmark. Default is 100.',
)
self._parser.add_argument(
'--check_data',
action='store_true',
help='Enable data checking for non mlc benchmark. Default is False.',
)
def _preprocess_mlc(self):
"""Preprocess/preparation operations for the Intel MLC tool."""
mlc_path = os.path.join(self._args.bin_dir, self._bin_name)
ret_val = os.access(mlc_path, os.X_OK | os.F_OK)
if not ret_val:
......@@ -66,18 +92,36 @@ def _preprocess(self):
self._commands.append(mlc_wrapper % command)
return True
def _process_raw_result(self, cmd_idx, raw_output):
"""Function to parse raw results and save the summarized results.
def _preprocess_general(self):
"""Preprocess/preparation operations for the general CPU copy benchmark."""
# TODO: enable hugepages?
self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)
Args:
cmd_idx (int): the index of command corresponding with the raw_output.
raw_output (str): raw output string of the micro-benchmark.
args = '--size %d --num_warm_up %d --num_loops %d' % (
self._args.size, self._args.num_warm_up, self._args.num_loops
)
if self._args.check_data:
args += ' --check_data'
self._commands = ['%s %s' % (self.__bin_path, args)]
return True
def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
Return:
True if the raw output string is valid and result can be extracted.
True if _preprocess() succeed.
"""
if not super()._preprocess():
return False
return self._preprocess_mlc() if 'x86_64' in platform.machine() else self._preprocess_general()
def _process_raw_result_mlc(self, cmd_idx, raw_output):
"""Function to parse raw results for the Intel MLC tool and save the summarized results."""
self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
# parse the command to see which command this output belongs to
......@@ -116,8 +160,46 @@ def _process_raw_result(self, cmd_idx, raw_output):
else:
metric = 'mem_{}_{}_{}_{}'.format(mlc_test, key, str(index), measure).lower()
self._result.add_result(metric, float(out_table[key][index]))
return True
def _process_raw_result_general(self, cmd_idx, raw_output):
"""Function to parse raw results for the general CPU copy benchmark and save the summarized results."""
self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
try:
for output_line in raw_output.strip().splitlines():
name, value = output_line.split(':')
self._result.add_result(name.strip(), float(value.strip()))
except BaseException as e:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
logger.error(
'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
self._curr_run_index, self._name, raw_output, str(e)
)
)
return False
return True
def _process_raw_result(self, cmd_idx, raw_output):
"""Function to parse raw results and save the summarized results.
self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
Args:
cmd_idx (int): the index of command corresponding with the raw_output.
raw_output (str): raw output string of the micro-benchmark.
Return:
True if the raw output string is valid and result can be extracted.
"""
return (
self._process_raw_result_mlc(cmd_idx, raw_output)
if 'x86_64' in platform.machine() else self._process_raw_result_general(cmd_idx, raw_output)
)
def _parse_bw_latency(self, raw_output):
out_table = dict()
for line in raw_output.splitlines():
......
......@@ -4,6 +4,7 @@
"""Tests for cpu-memory-bw-latency benchmark."""
import unittest
from unittest import mock
from tests.helper.testcase import BenchmarkTestCase
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
......@@ -17,6 +18,7 @@ def setUpClass(cls):
super().setUpClass()
cls.createMockEnvs(cls)
cls.createMockFiles(cls, ['bin/mlc'])
cls.createMockFiles(cls, ['bin/cpu_copy'])
def test_cpu_mem_bw_latency_benchmark_empty_param(self):
"""Test cpu-memory-bw-latency benchmark command generation with empty parameter."""
......@@ -148,3 +150,25 @@ def test_cpu_mem_bw_latency_benchmark_result_parsing(self):
# Negative case - invalid raw output.
assert (benchmark._process_raw_result(0, 'Invalid raw output') is False)
assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
@mock.patch('platform.machine')
def test_preprocess_non_x86(self, mock_platform_machine):
"""Test _preprocess method for general CPU copy benchmark."""
mock_platform_machine.return_value = 'arm64'
benchmark_name = 'cpu-memory-bw-latency'
(benchmark_class,
predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
assert (benchmark_class)
benchmark = benchmark_class(
benchmark_name, parameters='--size 1024 --num_warm_up 10 --num_loops 50 --check_data'
)
benchmark._bin_name = 'cpu_copy'
benchmark._commands = []
ret = benchmark._preprocess()
assert (ret is True)
assert (benchmark.return_code == ReturnCode.SUCCESS)
assert (len(benchmark._commands) == 1)
assert ('cpu_copy --size 1024 --num_warm_up 10 --num_loops 50 --check_data' in benchmark._commands[0])
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment