Unverified Commit 9c35e80a authored by Hongtao Zhang's avatar Hongtao Zhang Committed by GitHub
Browse files

Benchmarks: micro benchmarks - add general CPU bandwidth and latency benchmark (#662)



**Description**
Add micro benchmark to measure general CPU bandwidth and latency without 'mlc'.

Test output:
```
{
"cpu-memory-bw-latency/return_code": 0,
"cpu-memory-bw-latency/mem_bandwidth_matrix_numa_0_1_bw": 5388.75021,
"cpu-memory-bw-latency/mem_bandwidth_matrix_numa_0_1_lat": 0.185571786,
"cpu-memory-bw-latency/mem_bandwidth_matrix_numa_1_0_bw": 4634.82028,
"cpu-memory-bw-latency/mem_bandwidth_matrix_numa_1_0_lat": 0.215758096
}
```

---------
Co-authored-by: default avatarhongtaozhang <hongtaozhang@microsoft.com>
parent a8a7bed2
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
cmake_minimum_required(VERSION 3.18)
project(cpu_copy LANGUAGES CXX)
# Probe for a CUDA toolkit first; if absent, fall back to the ROCm path below.
find_package(CUDAToolkit QUIET)
# Cuda environment
if(CUDAToolkit_FOUND)
message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
include(../cuda_common.cmake)
# Plain C++ build of the copy benchmark; libnuma provides NUMA-aware
# allocation (numa_alloc_onnode) and affinity (numa_run_on_node).
add_executable(cpu_copy cpu_copy.cpp)
set_property(TARGET cpu_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
target_link_libraries(cpu_copy numa)
else()
# ROCm environment
include(../rocm_common.cmake)
find_package(hip QUIET)
if(hip_FOUND)
message(STATUS "Found ROCm: " ${HIP_VERSION})
# Convert cuda code to hip code in cpp
execute_process(COMMAND hipify-perl -print-stats -o cpu_copy.cpp cpu_copy.cu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
# link hip device lib
add_executable(cpu_copy cpu_copy.cpp)
include(CheckSymbolExists)
# hipDeviceMallocUncached only exists in newer ROCm releases; gate the
# HIP_UNCACHED_MEMORY compile definition on its availability.
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
if(${HIP_UNCACHED_MEMORY})
target_compile_definitions(cpu_copy PRIVATE HIP_UNCACHED_MEMORY)
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
target_link_libraries(cpu_copy numa hip::device)
else()
message(FATAL_ERROR "No CUDA or ROCm environment found.")
endif()
endif()
install(TARGETS cpu_copy RUNTIME DESTINATION bin)
#include <chrono>
#include <cinttypes> // for SCNu64
#include <cstdio>    // for sscanf
#include <cstring>   // for memcpy
#include <getopt.h>
#include <iomanip> // for setting precision
#include <iostream>
#include <numa.h>
#include <numeric>
#include <vector>
// Command-line options accepted by this program; filled in by ParseOpts().
struct Opts {
// Data buffer size in bytes for copy benchmark.
uint64_t size = 0;
// Number of warm up rounds (unmeasured copies) to run.
uint64_t num_warm_up = 0;
// Number of measured copy loops to run.
uint64_t num_loops = 0;
// Whether check data after copy.
bool check_data = false;
};
/**
 * @brief Print the usage instructions for this program.
 *
 * Writes the expected command line, including the required numeric
 * arguments and the optional --check_data flag, to standard output.
 */
void PrintUsage() {
    const char *usage_text = "Usage: cpu_copy --size <size> --num_warm_up <num_warm_up> "
                             "--num_loops <num_loops> [--check_data]";
    std::cout << usage_text << std::endl;
}
/**
 * @brief Checks if the system has memory available for a specific NUMA node.
 *
 * Empty NUMA nodes in Grace CPU are reserved for multi-instance GPUs (MIG).
 *
 * @param node The identifier of the NUMA node to check.
 * @return true if the specified NUMA node has memory available, false otherwise.
 */
bool HasMemForNumaNode(int node) {
    // numa_node_size64() reports the node's memory size in bytes and returns
    // -1 on error. It is a C API and never throws, so the original try/catch
    // was dead code. Use long long to match the API's return type exactly
    // (the original assigned to long, which truncates on ILP32 platforms).
    long long mem_size = numa_node_size64(node, nullptr);
    if (mem_size < 0) {
        std::cerr << "Failed to get memory size for NUMA node " << node << std::endl;
        return false;
    }
    return mem_size > 0;
}
/**
 * @brief Checks if the system has CPUs available for a specific NUMA node.
 *
 * Determines whether any CPU is assigned to the given NUMA node so that CPU
 * affinity can be set to it. Memory-only (CPU-less) NUMA nodes — e.g. the
 * GPU-backed nodes on Grace systems — yield false.
 *
 * @param node The identifier of the NUMA node to check.
 * @return true if the specified NUMA node has CPUs available, false otherwise.
 */
bool HasCPUsForNumaNode(int node) {
    struct bitmask *cpu_mask = numa_allocate_cpumask();
    const int rc = numa_node_to_cpus(node, cpu_mask);
    bool has_cpus = false;
    if (rc == 0) {
        // A zero population count means the node is memory-only.
        has_cpus = numa_bitmask_weight(cpu_mask) > 0;
    } else {
        std::cerr << "Failed to get CPU mask for NUMA node " << node << ". ERROR: " << strerror(errno) << std::endl;
    }
    numa_bitmask_free(cpu_mask);
    return has_cpus;
}
/**
 * @brief Parses command-line options for the CPU copy performance benchmark.
 *
 * Processes the command-line arguments and fills in *opts. The --size,
 * --num_warm_up and --num_loops options are mandatory; --check_data is
 * optional. On any failure the usage text is printed.
 *
 * @param argc The number of command-line arguments.
 * @param argv The array of command-line arguments.
 * @param opts Output parameter; parsed options are stored here.
 * @return 0 on success, -1 on failure.
 */
int ParseOpts(int argc, char **argv, Opts *opts) {
    enum class OptIdx { kSize, kNumWarmUp, kNumLoops, kEnableCheckData };
    // NOTE: getopt_long() requires the option array to be terminated by an
    // all-zero element; the original array lacked it (undefined behavior).
    const struct option options[] = {{"size", required_argument, nullptr, static_cast<int>(OptIdx::kSize)},
                                     {"num_warm_up", required_argument, nullptr, static_cast<int>(OptIdx::kNumWarmUp)},
                                     {"num_loops", required_argument, nullptr, static_cast<int>(OptIdx::kNumLoops)},
                                     {"check_data", no_argument, nullptr, static_cast<int>(OptIdx::kEnableCheckData)},
                                     {nullptr, 0, nullptr, 0}};
    bool size_specified = false;
    bool num_warm_up_specified = false;
    bool num_loops_specified = false;
    bool parse_err = false;
    while (!parse_err) {
        int opt_idx = 0;
        int getopt_ret = getopt_long(argc, argv, "", options, &opt_idx);
        if (getopt_ret == -1) {
            // End of the option list: all three numeric options are required.
            parse_err = !size_specified || !num_warm_up_specified || !num_loops_specified;
            break;
        }
        if (getopt_ret == '?') {
            // Unknown option or missing argument; getopt already printed a message.
            parse_err = true;
            break;
        }
        // Dispatch on the value getopt_long returned (the option's val field),
        // rather than the longindex out-parameter.
        // Use SCNu64 so parsing uint64_t is portable ("%lu" assumes LP64).
        switch (getopt_ret) {
        case static_cast<int>(OptIdx::kSize):
            if (1 != sscanf(optarg, "%" SCNu64, &(opts->size))) {
                std::cerr << "Invalid size: " << optarg << std::endl;
                parse_err = true;
            } else {
                size_specified = true;
            }
            break;
        case static_cast<int>(OptIdx::kNumWarmUp):
            if (1 != sscanf(optarg, "%" SCNu64, &(opts->num_warm_up))) {
                std::cerr << "Invalid num_warm_up: " << optarg << std::endl;
                parse_err = true;
            } else {
                num_warm_up_specified = true;
            }
            break;
        case static_cast<int>(OptIdx::kNumLoops):
            if (1 != sscanf(optarg, "%" SCNu64, &(opts->num_loops))) {
                std::cerr << "Invalid num_loops: " << optarg << std::endl;
                parse_err = true;
            } else {
                num_loops_specified = true;
            }
            break;
        case static_cast<int>(OptIdx::kEnableCheckData):
            opts->check_data = true;
            break;
        default:
            parse_err = true;
        }
    }
    if (parse_err) {
        PrintUsage();
        return -1;
    }
    return 0;
}
/**
 * @brief Benchmark the memory copy performance between two NUMA nodes.
 *
 * Allocates one buffer on each node, times a single memcpy from the source
 * buffer to the destination buffer, and optionally verifies the copied data.
 *
 * @param src_node The source NUMA node from which memory will be copied.
 * @param dst_node The destination NUMA node to which memory will be copied.
 * @param opts Benchmark options (buffer size, data-check flag).
 * @return Copy time in nanoseconds; 0 on allocation/affinity failure;
 *         -1 if the data integrity check fails.
 */
double BenchmarkNUMACopy(int src_node, int dst_node, Opts &opts) {
    // Pin execution to a NUMA node that actually has CPUs associated; prefer
    // the source node and fall back to the destination node.
    int affinity_node = HasCPUsForNumaNode(src_node) ? src_node : dst_node;
    int ret = numa_run_on_node(affinity_node);
    if (ret != 0) {
        // Report the node we actually tried to bind to (the original printed
        // src_node even when it fell back to dst_node).
        std::cerr << "Failed to set CPU affinity to NUMA node " << affinity_node << std::endl;
        return 0;
    }
    // Allocate memory on the source and destination NUMA nodes.
    char *src = (char *)numa_alloc_onnode(opts.size, src_node);
    if (!src) {
        std::cerr << "Memory allocation failed on node" << src_node << std::endl;
        return 0;
    }
    char *dst = (char *)numa_alloc_onnode(opts.size, dst_node);
    if (!dst) {
        std::cerr << "Memory allocation failed on node" << dst_node << std::endl;
        numa_free(src, opts.size); // don't leak the source buffer
        return 0;
    }
    // Touch the source buffer so its pages are committed before timing.
    memset(src, 1, opts.size);
    // Measure the time taken for memcpy between nodes.
    auto start = std::chrono::high_resolution_clock::now();
    memcpy(dst, src, opts.size);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    double total_time_ns = diff.count() * 1e9; // Convert seconds to nanoseconds
    // Verify the copy BEFORE releasing the buffers. The original code called
    // memcmp() after numa_free() — a use-after-free.
    bool data_ok = true;
    if (opts.check_data) {
        data_ok = (memcmp(src, dst, opts.size) == 0);
        if (!data_ok) {
            std::cerr << "Data integrity check failed!" << std::endl;
        }
    }
    numa_free(src, opts.size);
    numa_free(dst, opts.size);
    return data_ok ? total_time_ns : -1;
}
/**
 * @brief Runs the CPU copy benchmark between a pair of NUMA nodes.
 *
 * Executes opts.num_warm_up unmeasured copies, then opts.num_loops measured
 * copies, and returns the average time per copy.
 *
 * @param src_node The source NUMA node from which data will be copied.
 * @param dst_node The destination NUMA node to which data will be copied.
 * @param opts Benchmark options (buffer size, loop counts, data checking).
 * @return Average copy time in nanoseconds, or 0 if num_loops is 0.
 */
double RunCPUCopyBenchmark(int src_node, int dst_node, Opts &opts) {
    if (opts.num_loops == 0) {
        // Guard against division by zero (the original returned NaN here).
        return 0;
    }
    // Warm up rounds; results are discarded.
    for (uint64_t i = 0; i < opts.num_warm_up; i++) {
        BenchmarkNUMACopy(src_node, dst_node, opts);
    }
    // Measured rounds. The counters are uint64_t to match Opts (the original
    // compared a signed int against a uint64_t bound).
    double time_used_ns = 0;
    for (uint64_t i = 0; i < opts.num_loops; i++) {
        time_used_ns += BenchmarkNUMACopy(src_node, dst_node, opts);
    }
    return time_used_ns / opts.num_loops;
}
/**
 * @brief Entry point: runs the CPU copy benchmark over all NUMA node pairs.
 *
 * Parses options, verifies NUMA availability, then for every ordered pair of
 * distinct NUMA nodes with memory (and CPUs on at least one side) prints the
 * measured bandwidth (MB/s) and latency (ns per byte).
 *
 * @return 0 on success; ParseOpts' error code or 1 on environment errors.
 */
int main(int argc, char **argv) {
    Opts opts;
    int ret = ParseOpts(argc, argv, &opts);
    if (0 != ret) {
        return ret;
    }
    // The libnuma APIs are unusable if the system reports no NUMA support.
    if (-1 == numa_available()) {
        std::cerr << "NUMA is not available on this system!" << std::endl;
        return 1;
    }
    int num_of_numa_nodes = numa_num_configured_nodes();
    if (num_of_numa_nodes < 2) {
        std::cerr << "System has less than 2 NUMA nodes. Benchmark is not applicable." << std::endl;
        return 1;
    }
    // Run the benchmark for every ordered pair of distinct NUMA nodes.
    for (int src_node = 0; src_node < num_of_numa_nodes; src_node++) {
        if (!HasMemForNumaNode(src_node)) {
            // Skip nodes with no memory available.
            continue;
        }
        for (int dst_node = 0; dst_node < num_of_numa_nodes; dst_node++) {
            if (src_node == dst_node) {
                // Skip copies within the same NUMA node.
                continue;
            }
            if (!HasMemForNumaNode(dst_node)) {
                continue;
            }
            if (!HasCPUsForNumaNode(src_node) && !HasCPUsForNumaNode(dst_node)) {
                // At least one side must have CPUs to run the copy loop on.
                continue;
            }
            double time_used_ns = RunCPUCopyBenchmark(src_node, dst_node, opts);
            if (time_used_ns <= 0) {
                // Benchmark failed (allocation/affinity error or data-check
                // failure). The original divided by this value and printed
                // inf/NaN metrics; skip the pair instead.
                std::cerr << "Benchmark failed for NUMA nodes " << src_node << " -> " << dst_node << std::endl;
                continue;
            }
            double bw = opts.size / (time_used_ns / 1e9) / 1e6; // MB/s
            double latency = time_used_ns / opts.size;          // ns/byte
            // Output the result in the "metric_name: value" form the Python
            // wrapper parses.
            std::cout << "mem_bandwidth_matrix_numa_" << src_node << "_" << dst_node << "_bw: " << std::setprecision(9)
                      << bw << std::endl;
            std::cout << "mem_bandwidth_matrix_numa_" << src_node << "_" << dst_node << "_lat: " << std::setprecision(9)
                      << latency << std::endl;
        }
    }
    return 0;
}
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module for running the Intel MLC tool to measure memory bandwidth and latency."""
"""Module to measure memory bandwidth and latency."""
import os
import platform
from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, ReturnCode
......@@ -21,13 +22,14 @@ def __init__(self, name, parameters=''):
"""
super().__init__(name, parameters)
self._bin_name = 'mlc'
self._bin_name = 'mlc' if 'x86_64' in platform.machine() else 'cpu_copy'
self.__support_mlc_commands = ['bandwidth_matrix', 'latency_matrix', 'max_bandwidth']
def add_parser_arguments(self):
"""Add the specified arguments."""
super().add_parser_arguments()
# Add arguments for the Intel MLC tool.
self._parser.add_argument(
'--tests',
type=str,
......@@ -37,15 +39,39 @@ def add_parser_arguments(self):
help='The modes to run mlc with. Possible values are {}.'.format(' '.join(self.__support_mlc_commands))
)
def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
# Add arguments for the general CPU copy benchmark.
self._parser.add_argument(
'--size',
type=int,
default=256 * 1024**2,
required=False,
help='Size of data buffer in bytes for non mlc benchmark. Default is 256MB.',
)
Return:
True if _preprocess() succeed.
"""
if not super()._preprocess():
return False
self._parser.add_argument(
'--num_warm_up',
type=int,
default=20,
required=False,
help='Number of warm up rounds for non mlc benchmark. Default is 20.',
)
self._parser.add_argument(
'--num_loops',
type=int,
default=100,
required=False,
help='Number of data buffer copies performed for non mlc benchmark. Default is 100.',
)
self._parser.add_argument(
'--check_data',
action='store_true',
help='Enable data checking for non mlc benchmark. Default is False.',
)
def _preprocess_mlc(self):
"""Preprocess/preparation operations for the Intel MLC tool."""
mlc_path = os.path.join(self._args.bin_dir, self._bin_name)
ret_val = os.access(mlc_path, os.X_OK | os.F_OK)
if not ret_val:
......@@ -66,18 +92,36 @@ def _preprocess(self):
self._commands.append(mlc_wrapper % command)
return True
def _process_raw_result(self, cmd_idx, raw_output):
"""Function to parse raw results and save the summarized results.
def _preprocess_general(self):
"""Preprocess/preparation operations for the general CPU copy benchmark."""
# TODO: enable hugepages?
self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)
Args:
cmd_idx (int): the index of command corresponding with the raw_output.
raw_output (str): raw output string of the micro-benchmark.
args = '--size %d --num_warm_up %d --num_loops %d' % (
self._args.size, self._args.num_warm_up, self._args.num_loops
)
if self._args.check_data:
args += ' --check_data'
self._commands = ['%s %s' % (self.__bin_path, args)]
return True
def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
Return:
True if the raw output string is valid and result can be extracted.
True if _preprocess() succeed.
"""
if not super()._preprocess():
return False
return self._preprocess_mlc() if 'x86_64' in platform.machine() else self._preprocess_general()
def _process_raw_result_mlc(self, cmd_idx, raw_output):
"""Function to parse raw results for the Intel MLC tool and save the summarized results."""
self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
# parse the command to see which command this output belongs to
......@@ -116,8 +160,46 @@ def _process_raw_result(self, cmd_idx, raw_output):
else:
metric = 'mem_{}_{}_{}_{}'.format(mlc_test, key, str(index), measure).lower()
self._result.add_result(metric, float(out_table[key][index]))
return True
def _process_raw_result_general(self, cmd_idx, raw_output):
"""Function to parse raw results for the general CPU copy benchmark and save the summarized results."""
self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
try:
for output_line in raw_output.strip().splitlines():
name, value = output_line.split(':')
self._result.add_result(name.strip(), float(value.strip()))
except BaseException as e:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
logger.error(
'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
self._curr_run_index, self._name, raw_output, str(e)
)
)
return False
return True
def _process_raw_result(self, cmd_idx, raw_output):
"""Function to parse raw results and save the summarized results.
self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
Args:
cmd_idx (int): the index of command corresponding with the raw_output.
raw_output (str): raw output string of the micro-benchmark.
Return:
True if the raw output string is valid and result can be extracted.
"""
return (
self._process_raw_result_mlc(cmd_idx, raw_output)
if 'x86_64' in platform.machine() else self._process_raw_result_general(cmd_idx, raw_output)
)
def _parse_bw_latency(self, raw_output):
out_table = dict()
for line in raw_output.splitlines():
......
......@@ -4,6 +4,7 @@
"""Tests for cpu-memory-bw-latency benchmark."""
import unittest
from unittest import mock
from tests.helper.testcase import BenchmarkTestCase
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
......@@ -17,6 +18,7 @@ def setUpClass(cls):
super().setUpClass()
cls.createMockEnvs(cls)
cls.createMockFiles(cls, ['bin/mlc'])
cls.createMockFiles(cls, ['bin/cpu_copy'])
def test_cpu_mem_bw_latency_benchmark_empty_param(self):
"""Test cpu-memory-bw-latency benchmark command generation with empty parameter."""
......@@ -148,3 +150,25 @@ def test_cpu_mem_bw_latency_benchmark_result_parsing(self):
# Negative case - invalid raw output.
assert (benchmark._process_raw_result(0, 'Invalid raw output') is False)
assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
@mock.patch('platform.machine')
def test_preprocess_non_x86(self, mock_platform_machine):
"""Test _preprocess method for general CPU copy benchmark."""
mock_platform_machine.return_value = 'arm64'
benchmark_name = 'cpu-memory-bw-latency'
(benchmark_class,
predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
assert (benchmark_class)
benchmark = benchmark_class(
benchmark_name, parameters='--size 1024 --num_warm_up 10 --num_loops 50 --check_data'
)
benchmark._bin_name = 'cpu_copy'
benchmark._commands = []
ret = benchmark._preprocess()
assert (ret is True)
assert (benchmark.return_code == ReturnCode.SUCCESS)
assert (len(benchmark._commands) == 1)
assert ('cpu_copy --size 1024 --num_warm_up 10 --num_loops 50 --check_data' in benchmark._commands[0])
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment