Unverified Commit d4f6ceb7 authored by Bengt Lüers's avatar Bengt Lüers Committed by GitHub
Browse files

Merge pull request #1 from RadeonOpenCompute/master

Pull upstream
parents be6a8ba0 981810fe
......@@ -27,6 +27,12 @@ cmake_minimum_required(VERSION 2.8.0)
#
#
# Specify name of project to build, install and package
set(PROJECT_NAME "rocm-bandwidth-test")
set(TEST_NAME "${PROJECT_NAME}")
project(${PROJECT_NAME})
# Build is not supported on the Windows platform
if(WIN32)
message("Windows platfom is not supported")
......@@ -56,7 +62,6 @@ endif()
set(CMAKE_CXX_FLAGS "-std=c++11")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-dev")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=return-type")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
......@@ -103,61 +108,11 @@ elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32")
endif()
# Specify name of project to build, install and package
set(PROJECT_NAME "rocm_bandwidth_test")
set(TEST_NAME "${PROJECT_NAME}")
project(${PROJECT_NAME})
# Set project requirements
set(ROC_THUNK_NAME "hsakmt")
set(CORE_RUNTIME_NAME "hsa-runtime")
set(CORE_RUNTIME_TARGET "${CORE_RUNTIME_NAME}64")
# Bind default root directory to look for ROCm artifacts
# such as ROCr header and ROCr, ROCt libraries
set(ROCM_ROOT /opt/rocm/ CACHE PATH "Root of ROCm")
if(DEFINED ENV{CMAKE_PREFIX_PATH})
set(CMAKE_PREFIX_PATH $ENV{CMAKE_PREFIX_PATH})
endif()
if(CMAKE_PREFIX_PATH)
set(ROCM_ROOT ${CMAKE_PREFIX_PATH} CACHE PATH "Root of ROCm")
endif()
# Search for ROCr header file
find_path(ROCR_HDR hsa/hsa.h PATHS ${ROCM_ROOT} PATH_SUFFIXES include )
if (NOT ROCR_HDR)
message("Rocr Header hsa/hsa.h not found")
return()
endif()
# Add directories to look for header files to compile
INCLUDE_DIRECTORIES(${ROCR_HDR})
# Search for ROCr library file
find_library(ROCR_LIB ${CORE_RUNTIME_TARGET} PATHS ${ROCM_ROOT} PATH_SUFFIXES lib lib64)
if (NOT ROCR_LIB)
message("Rocr Library ${CORE_RUNTIME_TARGET} not found")
return()
endif()
# Search for ROCt (thunk) library file
find_library(ROCT_LIB ${ROC_THUNK_NAME} PATHS ${ROCM_ROOT} PATH_SUFFIXES lib lib64)
if (NOT ROCT_LIB)
message("Roct Library ${ROC_THUNK_NAME} not found")
return()
endif()
# Add ROCr library to be used in linking target
add_library(${CORE_RUNTIME_TARGET} SHARED IMPORTED GLOBAL)
set_target_properties(${CORE_RUNTIME_TARGET} PROPERTIES
IMPORTED_LOCATION "${ROCR_LIB}"
INTERFACE_INCLUDE_DIRECTORIES "${ROCR_HDR}")
# Add ROCt (thunk) library to be used in linking target
add_library(${ROC_THUNK_NAME} SHARED IMPORTED GLOBAL)
set_target_properties(${ROC_THUNK_NAME} PROPERTIES
IMPORTED_LOCATION "${ROCT_LIB}")
# Add cmake_modules to default module path if it is not
# already set and include utils from cmake modules
if(NOT DEFINED CMAKE_MODULE_PATH)
......@@ -165,12 +120,29 @@ if(NOT DEFINED CMAKE_MODULE_PATH)
endif()
include(utils)
# find_package(hsa-runtime64) is intentionally optional because it can fail
# when building against an older HSA runtime. When find_package fails, fall
# back to the old method of using find_path/find_library to locate the
# headers and libraries required for building RBT
find_package(hsa-runtime64
PATHS /opt/rocm )
if(${hsa-runtime64_FOUND})
message("hsa-runtime64 found @ ${hsa-runtime64_DIR} " )
else()
message("hsa-runtime64 NOT found Resolving to OLD Way" )
find_path(ROCR_HDR hsa.h PATHS "/opt/rocm" PATH_SUFFIXES include/hsa REQUIRED )
INCLUDE_DIRECTORIES(${ROCR_HDR})
# Search for ROCr library file
find_library(ROCR_LIB ${CORE_RUNTIME_TARGET} PATHS "/opt/rocm" PATH_SUFFIXES lib lib64 REQUIRED)
# Search for ROCt library file
find_library(ROCT_LIB ${ROC_THUNK_NAME} PATHS "/opt/rocm" PATH_SUFFIXES lib lib64 REQUIRED)
endif()
#
# Set the package version for the test. It is critical that this
# value track what is used in the test source. The code from utils
# module will parse the string into major, minor and patch sub-fields
#
get_version("1.0.0")
get_version()
# Bind the Major, Minor and Patch values
set(BUILD_VERSION_MAJOR ${VERSION_MAJOR})
......@@ -192,9 +164,18 @@ aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} Src)
# Build and link the test program
add_executable(${TEST_NAME} ${Src})
target_link_libraries(${TEST_NAME} ${ROC_THUNK_NAME})
target_link_libraries(${TEST_NAME} ${CORE_RUNTIME_TARGET})
target_link_libraries(${TEST_NAME} c stdc++ dl pthread rt)
if(${hsa-runtime64_FOUND})
target_link_libraries(${TEST_NAME} PRIVATE hsa-runtime64::hsa-runtime64)
else()
target_link_libraries(${TEST_NAME} PRIVATE ${ROCR_LIB} ${ROCT_LIB} )
endif()
target_link_libraries(${TEST_NAME} PRIVATE c stdc++ dl pthread rt)
# Update linker flags to include RPATH
# Add --enable-new-dtags to generate DT_RUNPATH
if( DEFINED ENV{ROCM_RPATH})
set ( CMAKE_EXE_LINKER_FLAGS "-Wl,--enable-new-dtags -Wl,--rpath,$ENV{ROCM_RPATH}" )
endif()
# Add install directives for rocm_bandwidth_test
install(TARGETS ${TEST_NAME} RUNTIME DESTINATION bin)
......@@ -209,8 +190,12 @@ set(CPACK_PACKAGE_CONTACT "Advanced Micro Devices Inc.")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Test to measure PciE bandwidth on ROCm platforms")
# Debian package specific variables
set(CPACK_DEBIAN_PACKAGE_DEPENDS "libstdc++6, hsa-rocr-dev" )
set(CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/rocm_bandwidth_test")
# RPM package specific variables
set(CPACK_RPM_PACKAGE_DEPENDS "libstdc++6, hsa-rocr-dev" )
# RPM package specific variables
if(DEFINED CPACK_PACKAGING_INSTALL_PREFIX)
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "${CPACK_PACKAGING_INSTALL_PREFIX} ${CPACK_PACKAGING_INSTALL_PREFIX}/bin")
......
File mode changed from 100644 to 100755
......@@ -31,16 +31,23 @@ The following simply lists the steps to build RocBandwidthTest
The argument for cmake should be the root folder of RocBandwidthTest
test suite
// Assume that the ROC Runtime libraries & headers are located in the following paths:
// libraries : _ABSOLUTE_PATH_TO_ROCR_LIBS_/lib
// headers : _ABSOLUTE_PATH_TO_ROCR_LIBS_/include/hsa
// Note : Observe that both the include & lib folders are under a common path (_ABSOLUTE_PATH_TO_ROCR_LIBS_)
// Builds Debug version
// Assumes pwd is .../roc_bandwidth_test/build
e.g. cmake -DROCR_LIB_DIR="Path of ROC Runtime Library Files" \
-DROCR_INC_DIR="Path of ROC Runtime Header Files" \
-DCMAKE_BUILD_TYPE:STRING=Debug ..
// Assumes pwd is _ABSOLUTE_PATH_TO_RBT/roc_bandwidth_test/build
e.g. cmake -DCMAKE_BUILD_TYPE="Debug" \
-DCMAKE_MODULE_PATH="_ABSOLUTE_PATH_TO_RBT/roc_bandwidth_test/cmake_modules" \
-DCMAKE_PREFIX_PATH="_ABSOLUTE_PATH_TO_ROCR_LIBS_"
..
// Builds Release version - default
// Assumes pwd is .../roc_bandwidth_test/build
e.g. cmake -DROCR_LIB_DIR="Path of ROC Runtime Library Files" \
-DROCR_INC_DIR="Path of ROC Runtime Header Files" \
// Assumes pwd is _ABSOLUTE_PATH_TO_RBT/roc_bandwidth_test/build
e.g. cmake -DCMAKE_MODULE_PATH="_ABSOLUTE_PATH_TO_RBT/roc_bandwidth_test/cmake_modules" \
-DCMAKE_PREFIX_PATH="_ABSOLUTE_PATH_TO_ROCR_LIBS_"
..
--- Invoke the native build rules generated by cmake to build the various
......
File mode changed from 100644 to 100755
......@@ -43,7 +43,7 @@
#ifndef ROC_BANDWIDTH_TEST_BASE_H_
#define ROC_BANDWIDTH_TEST_BASE_H_
#include "hsa/hsa.h"
#include "hsa.h"
#include <iostream>
#include <string>
#include <vector>
......@@ -56,7 +56,7 @@ class BaseTest {
public:
BaseTest(size_t num = 3);
BaseTest(size_t num_iter = 4);
virtual ~BaseTest();
......@@ -74,8 +74,8 @@ class BaseTest {
virtual void Display() const = 0;
// @Brief: Set number of iterations to run
void set_num_iteration(size_t num) {
num_iteration_ = num;
void set_num_iteration(size_t num_iter) {
num_iteration_ = num_iter;
return;
}
......
......@@ -45,39 +45,67 @@
## the major, minor and patch variables.
function( parse_version VERSION_STRING )
# Splits VERSION_STRING into numeric tokens and publishes them to the
# caller's scope as VERSION_MAJOR, VERSION_MINOR and VERSION_PATCH.
#
# NOTE(review): this body contains two passes over the same input. The
# first pass (up to the "set ( VERSION_STRING ..." line) also publishes
# VERSION_BUILD and VERSION_STRING; the second pass repeats the parsing
# with different local names and additionally folds commit information
# into VERSION_PATCH. Confirm whether both passes are intended to coexist.

# ---- First pass ----
# Locate the first '-' in the input; STRING_INDEX is -1 when absent
string ( FIND ${VERSION_STRING} "-" STRING_INDEX )
if ( ${STRING_INDEX} GREATER -1 )
# Everything after the first '-' is kept as VERSION_BUILD
math ( EXPR STRING_INDEX "${STRING_INDEX} + 1" )
string ( SUBSTRING ${VERSION_STRING} ${STRING_INDEX} -1 VERSION_BUILD )
endif ()
# Collect every run of decimal digits in the input as a list
string ( REGEX MATCHALL "[0123456789]+" VERSIONS ${VERSION_STRING} )
list ( LENGTH VERSIONS VERSION_COUNT )
# First numeric token -> major; also seeds the dotted version string
if ( ${VERSION_COUNT} GREATER 0)
list ( GET VERSIONS 0 MAJOR )
set ( VERSION_MAJOR ${MAJOR} PARENT_SCOPE )
set ( TEMP_VERSION_STRING "${MAJOR}" )
endif ()
# Second numeric token -> minor
if ( ${VERSION_COUNT} GREATER 1 )
list ( GET VERSIONS 1 MINOR )
set ( VERSION_MINOR ${MINOR} PARENT_SCOPE )
set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${MINOR}" )
endif ()
# Third numeric token -> patch
if ( ${VERSION_COUNT} GREATER 2 )
list ( GET VERSIONS 2 PATCH )
set ( VERSION_PATCH ${PATCH} PARENT_SCOPE )
set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${PATCH}" )
endif ()
# VERSION_BUILD is only published when a '-' suffix was found above
if ( DEFINED VERSION_BUILD )
set ( VERSION_BUILD "${VERSION_BUILD}" PARENT_SCOPE )
endif ()
# Publish the reassembled dotted version string to the caller. PARENT_SCOPE
# does not modify the local VERSION_STRING, so the second pass below still
# sees the original input.
set ( VERSION_STRING "${TEMP_VERSION_STRING}" PARENT_SCOPE )
# ---- Second pass ----
# Get index of '-' character in input string
string ( FIND ${VERSION_STRING} "-" STRING_INDEX )
# If there is string after '-' character, capture
# it in COMMIT_INFO string
if ( ${STRING_INDEX} GREATER -1 )
math ( EXPR STRING_INDEX "${STRING_INDEX} + 1" )
string ( SUBSTRING ${VERSION_STRING} ${STRING_INDEX} -1 COMMIT_INFO )
endif ()
# Parse string into tokens that consist of only numerical
# substrings and obtain it as a list
string ( REGEX MATCHALL "[0123456789]+" TOKENS ${VERSION_STRING} )
list ( LENGTH TOKENS TOKEN_COUNT )
# Get Major Id of the version
if ( ${TOKEN_COUNT} GREATER 0)
list ( GET TOKENS 0 MAJOR )
set ( VERSION_MAJOR ${MAJOR} PARENT_SCOPE )
endif ()
# Get Minor Id of the version
if ( ${TOKEN_COUNT} GREATER 1 )
list ( GET TOKENS 1 MINOR )
set ( VERSION_MINOR ${MINOR} PARENT_SCOPE )
endif ()
# Get Patch Id of the version
if ( ${TOKEN_COUNT} GREATER 2 )
list ( GET TOKENS 2 PATCH )
set ( VERSION_PATCH ${PATCH} PARENT_SCOPE )
endif ()
# Return if commit info is not present; VERSION_PATCH then stays as the
# plain numeric patch value set above
if ( NOT DEFINED COMMIT_INFO )
return()
endif()
# Parse Commit string if present into number of
# commits and hash of last commit.
# COMMIT_HASH is whatever follows the first '-' inside COMMIT_INFO; it is
# left undefined when COMMIT_INFO contains no '-'.
string ( FIND ${COMMIT_INFO} "-" STRING_INDEX )
if ( ${STRING_INDEX} GREATER -1 )
math ( EXPR STRING_INDEX "${STRING_INDEX} + 1" )
string ( SUBSTRING ${COMMIT_INFO} ${STRING_INDEX} -1 COMMIT_HASH )
endif ()
# COMMIT_CNT is the first run of digits in COMMIT_INFO; it is left
# undefined when COMMIT_INFO contains no digits.
string ( REGEX MATCHALL "[0123456789]+" TOKENS ${COMMIT_INFO} )
list ( LENGTH TOKENS TOKEN_COUNT )
if ( ${TOKEN_COUNT} GREATER 0)
list ( GET TOKENS 0 COMMIT_CNT )
endif ()
# Add Build Info from Jenkins.
# The cache entry defaults to "DevBld" and is overridden by the
# ROCM_BUILD_ID environment variable when that is set. Both writes use
# FORCE, so any value already in the cache is replaced on every call.
set ( ROCM_BUILD_ID "DevBld" CACHE STRING "Local Build Id" FORCE )
if(DEFINED ENV{ROCM_BUILD_ID})
set ( ROCM_BUILD_ID $ENV{ROCM_BUILD_ID} CACHE STRING "Jenkins Build Id" FORCE )
endif()
# Update Version Patch to include Number of Commits and hash of HEAD.
# Resulting form: <patch>.<commit count>-<build id>-<commit hash>
set ( VERSION_PATCH "${PATCH}.${COMMIT_CNT}-${ROCM_BUILD_ID}-${COMMIT_HASH}" PARENT_SCOPE )
endfunction ()
......@@ -85,31 +113,27 @@ endfunction ()
## using versioning tags and git describe.
## Passes back a packaging version string
## and a library version string.
function ( get_version DEFAULT_VERSION_STRING )
parse_version ( ${DEFAULT_VERSION_STRING} )
find_program ( GIT NAMES git )
if ( GIT )
execute_process ( COMMAND "git describe --dirty --long --match [0-9]* 2> /dev/null"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE GIT_TAG_STRING
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE RESULT )
if ( ${RESULT} EQUAL 0 )
parse_version ( ${GIT_TAG_STRING} )
endif ()
function ( get_version )
# Bind the program git that will be
# used to query its tag that describes
find_program ( GIT NAMES git )
if ( GIT )
execute_process ( COMMAND git describe --long --match [0-9]*
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE GIT_TAG_STRING
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE RESULT )
if ( ${RESULT} EQUAL 0 )
parse_version ( ${GIT_TAG_STRING} )
endif ()
endif ()
set( VERSION_STRING "${VERSION_STRING}" PARENT_SCOPE )
set( VERSION_MAJOR "${VERSION_MAJOR}" PARENT_SCOPE )
set( VERSION_MINOR "${VERSION_MINOR}" PARENT_SCOPE )
set( VERSION_PATCH "${VERSION_PATCH}" PARENT_SCOPE )
set( VERSION_BUILD "${VERSION_BUILD}" PARENT_SCOPE )
# Propagate values bound to parent scope
set( VERSION_MAJOR "${VERSION_MAJOR}" PARENT_SCOPE )
set( VERSION_MINOR "${VERSION_MINOR}" PARENT_SCOPE )
set( VERSION_PATCH "${VERSION_PATCH}" PARENT_SCOPE )
endfunction()
File mode changed from 100644 to 100755
......@@ -48,8 +48,8 @@
#include <vector>
#include <cmath>
#include <stdio.h>
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#include "hsa.h"
#include "hsa_ext_amd.h"
using namespace std;
......
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#include "hsatimer.hpp"
#define NANOSECONDS_PER_SECOND 1000000000
// Builds an empty timer collection and caches the result of
// MeasureTSCFreqHz() in freq_in_100mhz for later use by StopTimer().
PerfTimer::PerfTimer()
    : freq_in_100mhz(static_cast<double>(MeasureTSCFreqHz())) {}
// Releases every Timer record created by CreateTimer(), walking the list
// from the most recently created entry to the oldest, and leaves the
// list empty.
PerfTimer::~PerfTimer() {
  for (size_t remaining = _timers.size(); remaining > 0; --remaining) {
    delete _timers[remaining - 1];
  }
  _timers.clear();
}
// Create a new timer instance and return its index
int PerfTimer::CreateTimer() {
Timer *newTimer = new Timer;
newTimer->_start = 0.0;
newTimer->_clocks = 0.0;
#ifdef __linux__
newTimer->_freq = NANOSECONDS_PER_SECOND;
#endif
// Save the timer object in timer list
_timers.push_back(newTimer);
return (int)(_timers.size() - 1);
}
int PerfTimer::StartTimer(int index) {
if (index >= (int)_timers.size()) {
Error("Cannot reset timer. Invalid handle.");
return HSA_FAILURE;
}
#ifdef __linux__
// General Linux timing method
#ifndef _AMD
struct timespec s;
clock_gettime(CLOCK_MONOTONIC, &s);
_timers[index]->_start =
(long long)s.tv_sec * NANOSECONDS_PER_SECOND + (long long)s.tv_nsec;
// AMD Linux timing method
#else
unsigned int unused;
_timers[index]->_start = __rdtscp(&unused);
#endif
#endif
return HSA_SUCCESS;
}
int PerfTimer::StopTimer(int index) {
long long n = 0;
if (index >= (int)_timers.size()) {
Error("Cannot reset timer. Invalid handle.");
return HSA_FAILURE;
}
#ifdef __linux__
// General Linux timing method
#ifndef _AMD
struct timespec s;
clock_gettime(CLOCK_MONOTONIC, &s);
n = (long long)s.tv_sec * NANOSECONDS_PER_SECOND + (long long)s.tv_nsec;
// AMD Linux timing
#else
unsigned int unused;
n = __rdtscp(&unused);
#endif
#endif
n -= _timers[index]->_start;
_timers[index]->_start = 0;
#ifndef _AMD
_timers[index]->_clocks += n;
#endif
#ifdef __linux__
//_timers[index]->_clocks += 10 * n /freq_in_100mhz; // unit is ns
_timers[index]->_clocks += 1.0E-6 * 10 * n / freq_in_100mhz; // convert to ms
// cout << "_AMD is enabled!!!" << endl;
#endif
return HSA_SUCCESS;
}
void PerfTimer::Error(string str) { cout << str << endl; }
double PerfTimer::ReadTimer(int index) {
if (index >= (int)_timers.size()) {
Error("Cannot read timer. Invalid handle.");
return HSA_FAILURE;
}
double reading = double(_timers[index]->_clocks);
reading = double(reading / _timers[index]->_freq);
return reading;
}
// Clears the accumulated time and the pending start point of the timer
// identified by index. Terminates the process with exit code 1 when index
// does not refer to an existing timer.
//
// @param index Handle previously returned by CreateTimer().
void PerfTimer::ResetTimer(int index) {
  // Reject negative handles as well as handles past the end; a negative
  // value would otherwise index outside the timer list.
  if (index < 0 || index >= (int)_timers.size()) {
    Error("Invalid index value\n");
    exit(1);
  }

  _timers[index]->_clocks = 0;
  _timers[index]->_start = 0;
}
// Returns the current CLOCK_MONOTONIC_RAW reading expressed in
// microseconds.
// NOTE(review): when __linux__ is not defined this function has no return
// statement -- confirm non-Linux builds are never expected.
uint64_t PerfTimer::CoarseTimestampUs() {
#ifdef __linux__
  struct timespec now;
  clock_gettime(CLOCK_MONOTONIC_RAW, &now);

  // Whole seconds scaled to microseconds, plus the sub-second remainder
  const uint64_t seconds_as_us = uint64_t(now.tv_sec) * 1000000;
  return seconds_as_us + now.tv_nsec / 1000;
#endif
}
// Estimates the TSC tick rate by busy-waiting until one billion ticks have
// elapsed and comparing that against the coarse wall-clock interval.
// The returned value is ticks-per-nanosecond multiplied by ten, rounded to
// the nearest integer, i.e. the tick rate in units of 100 MHz (despite the
// "Hz" in the function name).
uint64_t PerfTimer::MeasureTSCFreqHz() {
  const uint64_t kTicksToSpin = 1000000000;
  unsigned int aux;

  // Bracket the spin loop with coarse timestamps
  const uint64_t wall_begin_us = CoarseTimestampUs();
  const uint64_t tsc_begin = __rdtscp(&aux);

  uint64_t tsc_end;
  for (;;) {
    tsc_end = __rdtscp(&aux);
    if (tsc_end - tsc_begin >= kTicksToSpin) {
      break;
    }
  }
  const uint64_t wall_end_us = CoarseTimestampUs();

  // Adding half the divisor before dividing rounds to the nearest unit
  const uint64_t wall_interval_ns = (wall_end_us - wall_begin_us) * 1000;
  const uint64_t tsc_interval = tsc_end - tsc_begin;
  return (tsc_interval * 10 + (wall_interval_ns / 2)) / wall_interval_ns;
}
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#ifndef ROC_BANDWIDTH_TEST_MYTIME_H_
#define ROC_BANDWIDTH_TEST_MYTIME_H_
// Uses either the AMD timer or the general Linux timer, selected at
// compile time through a compilation flag. Support for the Windows
// platform is not currently available
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <x86intrin.h>
#include <string.h>
#include <iostream>
#include <vector>
#include <string>
using namespace std;
#include <sys/time.h>
#define HSA_FAILURE 1
#define HSA_SUCCESS 0
// A collection of timers addressed by integer handle. A handle is obtained
// from CreateTimer() and then passed to the start/stop/read/reset calls.
class PerfTimer {
 private:
  // Bookkeeping record for one timer
  struct Timer {
    std::string name;   // Name of the timer object
    long long _freq;    // Divisor applied to _clocks when the timer is read
    long long _clocks;  // Time accumulated across start/stop intervals
    long long _start;   // Start point of the interval currently being timed
  };

  // All timers created so far; a timer's handle is its index in this list
  std::vector<Timer*> _timers;

  // Tick rate measured once at construction by MeasureTSCFreqHz()
  double freq_in_100mhz;

  // Helpers for the AMD (TSC based) timing method
  uint64_t CoarseTimestampUs();
  uint64_t MeasureTSCFreqHz();

 public:
  PerfTimer();
  ~PerfTimer();

  // Timer lifecycle: create a timer, then start/stop/reset it by handle
  int CreateTimer();
  int StartTimer(int index);
  int StopTimer(int index);
  void ResetTimer(int index);

  // Retrieve the accumulated time of a timer
  double ReadTimer(int index);

  // Write into a file
  // NOTE(review): no definition is visible alongside the other methods --
  // confirm this is implemented before calling it.
  double WriteTimer(int index);

  // Report an error message to the user
  void Error(std::string str);
};
#endif // ROC_BANDWIDTH_TEST_MYTIME_H_
......@@ -42,7 +42,6 @@
#include <unistd.h>
#include <iostream>
#include "hsatimer.hpp"
#include "rocm_bandwidth_test.hpp"
using namespace std;
......
File mode changed from 100644 to 100755
......@@ -49,8 +49,14 @@
#include <unistd.h>
#include <cctype>
#include <cmath>
#include <cstring>
#include <sstream>
#include <limits>
#include <chrono>
#include <thread>
// Initialize the variable used to capture validation failure
const double RocmBandwidthTest::VALIDATE_COPY_OP_FAILURE = std::numeric_limits<double>::max();
// The values are in megabytes at allocation time
const size_t RocmBandwidthTest::SIZE_LIST[] = { 1 * 1024,
......@@ -74,7 +80,7 @@ const size_t RocmBandwidthTest::LATENCY_SIZE_LIST[] = { 1,
256 * 1024, 512 * 1024 };
uint32_t RocmBandwidthTest::GetIterationNum() {
return (validate_) ? 1 : (num_iteration_ * 1.2 + 1);
return (validate_) ? 1 : (num_iteration_ + 1);
}
void RocmBandwidthTest::AcquireAccess(hsa_agent_t agent, void* ptr) {
......@@ -92,10 +98,12 @@ void RocmBandwidthTest::AcquirePoolAcceses(uint32_t src_dev_idx,
hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_;
if (src_dev_type == HSA_DEVICE_TYPE_GPU) {
AcquireAccess(src_agent, dst);
} else if (dst_dev_type == HSA_DEVICE_TYPE_GPU) {
}
if (dst_dev_type == HSA_DEVICE_TYPE_GPU) {
AcquireAccess(dst_agent, src);
}
return;
}
......@@ -115,19 +123,20 @@ void RocmBandwidthTest::InitializeSrcBuffer(size_t size, void* buf_cpy,
ErrorCheck(err_);
}
// If Copy device is a Gpu setup buffer access
// If copying agent is a CPU, use memcpy to initialize copy buffer
hsa_device_type_t cpy_dev_type = agent_list_[cpy_dev_idx].device_type_;
if (cpy_dev_type == HSA_DEVICE_TYPE_GPU) {
AcquireAccess(cpy_agent, init_src_);
hsa_signal_store_relaxed(init_signal_, 1);
copy_buffer(buf_cpy, cpy_agent,
init_src_, cpu_agent_,
size, init_signal_);
if (cpy_dev_type == HSA_DEVICE_TYPE_CPU) {
std::memcpy(buf_cpy, init_src_, size);
return;
}
// Copy initialization buffer into copy buffer
memcpy(buf_cpy, init_src_, size);
// Copying device is a Gpu, setup buffer access
// before copying initialization buffer
AcquireAccess(cpy_agent, init_src_);
hsa_signal_store_relaxed(init_signal_, 1);
copy_buffer(buf_cpy, cpy_agent,
init_src_, cpu_agent_,
size, init_signal_);
return;
}
......@@ -141,7 +150,7 @@ bool RocmBandwidthTest::ValidateDstBuffer(size_t max_size, size_t curr_size, voi
}
// If Copy device is a Gpu setup buffer access
memset(validate_dst_, ~(0x23), curr_size);
std::memset(validate_dst_, ~(0x23), curr_size);
hsa_device_type_t cpy_dev_type = agent_list_[cpy_dev_idx].device_type_;
if (cpy_dev_type == HSA_DEVICE_TYPE_GPU) {
AcquireAccess(cpy_agent, validate_dst_);
......@@ -153,11 +162,11 @@ bool RocmBandwidthTest::ValidateDstBuffer(size_t max_size, size_t curr_size, voi
// Copying device is a CPU, copy dst buffer
// into validation buffer
memcpy(validate_dst_, buf_cpy, curr_size);
std::memcpy(validate_dst_, buf_cpy, curr_size);
}
// Copy initialization buffer into copy buffer
err_ = (hsa_status_t)memcmp(init_src_, validate_dst_, curr_size);
// Compare initialization buffer with validation buffer
err_ = (hsa_status_t)std::memcmp(init_src_, validate_dst_, curr_size);
if (err_ != HSA_STATUS_SUCCESS) {
exit_value_ = err_;
}
......@@ -581,12 +590,17 @@ void RocmBandwidthTest::RunCopyBenchmark(async_trans_t& trans) {
hsa_signal_store_relaxed(signal_start_bidir, 1);
}
// Create a timer object and reset signals
PerfTimer timer;
uint32_t index = timer.CreateTimer();
// Temporary code for testing
if (sleep_time_ > 0) {
std::this_thread::sleep_for(sleep_usecs_);
}
// Create a timer object and start it
if (print_cpu_time_) {
cpu_start_ = std::chrono::steady_clock::now();
}
// Start the timer and launch forward copy operation
timer.StartTimer(index);
// Launch the copy operation
if (bidir == false) {
err_ = hsa_amd_memory_async_copy(buf_dst_fwd, dst_agent_fwd,
buf_src_fwd, src_agent_fwd,
......@@ -615,11 +629,13 @@ void RocmBandwidthTest::RunCopyBenchmark(async_trans_t& trans) {
WaitForCopyCompletion(signal_list);
// Stop the timer object
timer.StopTimer(index);
// Push the time taken for copy into a vector of copy times
cpu_time.push_back(timer.ReadTimer(index));
// Stop the timer object and extract time taken
if (print_cpu_time_) {
cpu_end_ = std::chrono::steady_clock::now();
cpu_cp_time_ = cpu_end_ - cpu_start_;
uint64_t cpu_temp = cpu_cp_time_.count();
cpu_time.push_back(cpu_temp);
}
// Collect time from the signal(s)
if (print_cpu_time_ == false) {
......@@ -635,16 +651,25 @@ void RocmBandwidthTest::RunCopyBenchmark(async_trans_t& trans) {
}
}
// Get Cpu min copy time
trans.cpu_min_time_.push_back(GetMinTime(cpu_time));
// Get Cpu mean copy time and store to the array
trans.cpu_avg_time_.push_back(GetMeanTime(cpu_time));
// Collecting Cpu time. Capture verify failures if any
// Get min and mean copy times and collect them into Cpu
// time list
double min_time = 0;
double mean_time = 0;
if (print_cpu_time_) {
min_time = (verify) ? GetMinTime(cpu_time) : VALIDATE_COPY_OP_FAILURE;
mean_time = (verify) ? GetMeanTime(cpu_time) : VALIDATE_COPY_OP_FAILURE;
trans.cpu_min_time_.push_back(min_time);
trans.cpu_avg_time_.push_back(mean_time);
}
// Collecting Gpu time. Capture verify failures if any
// Get min and mean copy times and collect them into Gpu
// time list
if (print_cpu_time_ == false) {
if (trans.copy.uses_gpu_) {
// Get Gpu min and mean copy times
double min_time = (verify) ? GetMinTime(gpu_time) : std::numeric_limits<double>::max();
double mean_time = (verify) ? GetMeanTime(gpu_time) : std::numeric_limits<double>::max();
min_time = (verify) ? GetMinTime(gpu_time) : VALIDATE_COPY_OP_FAILURE;
mean_time = (verify) ? GetMeanTime(gpu_time) : VALIDATE_COPY_OP_FAILURE;
trans.gpu_min_time_.push_back(min_time);
trans.gpu_avg_time_.push_back(mean_time);
}
......@@ -652,7 +677,9 @@ void RocmBandwidthTest::RunCopyBenchmark(async_trans_t& trans) {
verify = true;
// Clear the stack of cpu times
cpu_time.clear();
if (print_cpu_time_) {
cpu_time.clear();
}
gpu_time.clear();
}
......@@ -788,19 +815,37 @@ RocmBandwidthTest::RocmBandwidthTest(int argc, char** argv) : BaseTest() {
// Initialize version of the test
version_.major_id = 2;
version_.minor_id = 3;
version_.step_id = 2;
version_.minor_id = 5;
version_.step_id = 1;
version_.reserved = 0;
// Test impact of sleep, temp code
sleep_time_ = 0;
bw_sleep_time_ = getenv("ROCM_BW_SLEEP_TIME");
if (bw_sleep_time_ != NULL) {
sleep_time_ = atoi(bw_sleep_time_);
if ((sleep_time_ < 0) || (sleep_time_ > 400000)) {
std::cout << "Unit of sleep time is defined as 10 microseconds" << std::endl;
std::cout << "An input value of 10 implies sleep time of 100 microseconds" << std::endl;
std::cout << "Value of ROCM_BW_SLEEP_TIME must be between [1, 400000]" << sleep_time_ << std::endl;
exit(1);
}
sleep_time_ *= 10;
std::chrono::microseconds temp(sleep_time_);
sleep_usecs_ = temp;
}
bw_iter_cnt_ = getenv("ROCM_BW_ITER_CNT");
bw_default_run_ = getenv("ROCM_BW_DEFAULT_RUN");
bw_blocking_run_ = getenv("ROCR_BW_RUN_BLOCKING");
skip_fine_grain_ = getenv("ROCM_SKIP_FINE_GRAINED_POOL");
skip_cpu_fine_grain_ = getenv("ROCM_SKIP_CPU_FINE_GRAINED_POOL");
skip_gpu_coarse_grain_ = getenv("ROCM_SKIP_GPU_COARSE_GRAINED_POOL");
if (bw_iter_cnt_ != NULL) {
int32_t num = atoi(bw_iter_cnt_);
if (num < 0) {
std::cout << "Value of ROCM_BW_ITER_CNT can't be negative: " << num << std::endl;
exit(1);
}
set_num_iteration(num);
}
......
......@@ -43,12 +43,12 @@
#ifndef __ROC_BANDWIDTH_TEST_H__
#define __ROC_BANDWIDTH_TEST_H__
#include "hsa/hsa.h"
#include "hsa.h"
#include "base_test.hpp"
#include "hsatimer.hpp"
#include "common.hpp"
#include <vector>
#include <chrono>
using namespace std;
......@@ -168,13 +168,14 @@ typedef enum Request_Type {
REQ_WRITE = 2,
REQ_VERSION = 3,
REQ_TOPOLOGY = 4,
REQ_COPY_BIDIR = 5,
REQ_COPY_UNIDIR = 6,
REQ_COPY_ALL_BIDIR = 7,
REQ_COPY_ALL_UNIDIR = 8,
REQ_CONCURRENT_COPY_BIDIR = 9,
REQ_CONCURRENT_COPY_UNIDIR = 10,
REQ_INVALID = 11,
REQ_LIST_DEVS = 5,
REQ_COPY_BIDIR = 6,
REQ_COPY_UNIDIR = 7,
REQ_COPY_ALL_BIDIR = 8,
REQ_COPY_ALL_UNIDIR = 9,
REQ_CONCURRENT_COPY_BIDIR = 10,
REQ_CONCURRENT_COPY_UNIDIR = 11,
REQ_INVALID = 12,
} Request_Type;
......@@ -446,6 +447,7 @@ class RocmBandwidthTest : public BaseTest {
uint32_t req_write_;
uint32_t req_version_;
uint32_t req_topology_;
uint32_t req_list_devs_;
uint32_t req_copy_bidir_;
uint32_t req_copy_unidir_;
uint32_t req_copy_all_bidir_;
......@@ -472,6 +474,9 @@ class RocmBandwidthTest : public BaseTest {
static const uint32_t LINK_PROP_TYPE = 0x01;
static const uint32_t LINK_PROP_WEIGHT = 0x02;
static const uint32_t LINK_PROP_ACCESS = 0x03;
// Encodes validation failure
static const double VALIDATE_COPY_OP_FAILURE;
// List used to store transactions per user request
vector<async_trans_t> trans_list_;
......@@ -488,7 +493,8 @@ class RocmBandwidthTest : public BaseTest {
// Env key to determine if Fine-grained or
// Coarse-grained pool should be filtered out
char* skip_fine_grain_;
char* skip_cpu_fine_grain_;
char* skip_gpu_coarse_grain_;
// Env key to determine if the run should block
// or actively wait on completion signal
......@@ -499,10 +505,12 @@ class RocmBandwidthTest : public BaseTest {
// Env key to specify iteration count
char* bw_iter_cnt_;
// Variable to store argument number
// Variable to store argument number
char* bw_sleep_time_;
uint32_t sleep_time_;
std::chrono::nanoseconds cpu_cp_time_;
std::chrono::microseconds sleep_usecs_;
std::chrono::time_point<std::chrono::steady_clock> cpu_end_;
std::chrono::time_point<std::chrono::steady_clock> cpu_start_;
// Variable to store argument number
uint32_t usr_argc_;
......
File mode changed from 100644 to 100755
......@@ -97,7 +97,7 @@ static bool ParseOptionValue(char* value, vector<size_t>&value_list) {
// Read the option value
stream >> token;
if (stream.fail()) {
exit(-1);
return false;
}
// Update output list with values
......@@ -147,9 +147,7 @@ void RocmBandwidthTest::ValidateCopyUnidirFlags(uint32_t copy_mask,
// It is illegal to specify Latency and another
// secondary flag that affects a copy operation
if ((copy_ctrl_mask & DEV_COPY_LATENCY) &&
((copy_ctrl_mask & USR_BUFFER_INIT) ||
(copy_ctrl_mask & CPU_VISIBLE_TIME) ||
(copy_ctrl_mask & VALIDATE_COPY_OP))) {
(copy_ctrl_mask & VALIDATE_COPY_OP)) {
PrintHelpScreen();
exit(0);
}
......@@ -224,6 +222,12 @@ void RocmBandwidthTest::ValidateInputFlags(uint32_t pf_cnt,
return;
}
// Input is requesting to print list of devices
// rocm_bandwidth_test -e
if (req_list_devs_ == REQ_LIST_DEVS) {
return;
}
// Input is for bidirectional bandwidth for some devices
// rocm_bandwidth_test -b
if (req_copy_bidir_ == REQ_COPY_BIDIR) {
......@@ -338,7 +342,7 @@ void RocmBandwidthTest::ParseArguments() {
int opt;
bool status;
while ((opt = getopt(usr_argc_, usr_argv_, "hqtclvaAb:i:s:d:r:w:m:k:K:")) != -1) {
while ((opt = getopt(usr_argc_, usr_argv_, "hqteclvaAb:i:s:d:r:w:m:k:K:")) != -1) {
switch (opt) {
// Print help screen
......@@ -352,6 +356,12 @@ void RocmBandwidthTest::ParseArguments() {
req_version_ = REQ_VERSION;
break;
// Print list of devices
case 'e':
num_primary_flags++;
req_list_devs_ = REQ_LIST_DEVS;
break;
// Print system topology
case 't':
num_primary_flags++;
......@@ -426,6 +436,7 @@ void RocmBandwidthTest::ParseArguments() {
status = ParseOptionValue(optarg, size_list_);
if (status == false) {
print_help = true;
break;
}
copy_ctrl_mask |= USR_BUFFER_SIZE;
break;
......@@ -510,7 +521,14 @@ void RocmBandwidthTest::ParseArguments() {
// Discover the topology of RocR agent in system
DiscoverTopology();
// Print system topology if user option has "-t"
// Print list of devices if user option is "-e"
if (req_list_devs_ == REQ_LIST_DEVS) {
PrintVersion();
PrintTopology();
exit(0);
}
// Print system topology if user option is "-t"
if (req_topology_ == REQ_TOPOLOGY) {
PrintVersion();
PrintTopology();
......
......@@ -59,6 +59,7 @@ void RocmBandwidthTest::PrintHelpScreen() {
std::cout << "\t -v Run the test in validation mode" << std::endl;
std::cout << "\t -l Run test to collect Latency data" << std::endl;
std::cout << "\t -c Time the operation using CPU Timers" << std::endl;
std::cout << "\t -e Prints the list of ROCm devices enabled on platform" << std::endl;
std::cout << "\t -i Initialize copy buffer with specified 'long double' pattern" << std::endl;
std::cout << "\t -t Prints system topology and allocatable memory info" << std::endl;
std::cout << "\t -m List of buffer sizes to use, specified in Megabytes" << std::endl;
......@@ -70,13 +71,10 @@ void RocmBandwidthTest::PrintHelpScreen() {
std::cout << std::endl;
std::cout << "\t NOTE: Mixing following options is illegal/unsupported" << std::endl;
std::cout << "\t\t Case 1: rocm_bandwidth_test -a or -A with -c" << std::endl;
std::cout << "\t\t Case 2: rocm_bandwidth_test -b or -A with -m" << std::endl;
std::cout << "\t\t Case 3: rocm_bandwidth_test -b or -A with -l" << std::endl;
std::cout << "\t\t Case 4: rocm_bandwidth_test -b or -A with -v" << std::endl;
std::cout << "\t\t Case 5: rocm_bandwidth_test -a or -s x -d y with -l and -c" << std::endl;
std::cout << "\t\t Case 6: rocm_bandwidth_test -a or -s x -d y with -l and -m" << std::endl;
std::cout << "\t\t Case 7: rocm_bandwidth_test -a or -s x -d y with -l and -v" << std::endl;
std::cout << "\t\t Case 1: rocm_bandwidth_test -a with {lm}{1,}" << std::endl;
std::cout << "\t\t Case 2: rocm_bandwidth_test -b with {clv}{1,}" << std::endl;
std::cout << "\t\t Case 3: rocm_bandwidth_test -A with {clmv}{1,}" << std::endl;
std::cout << "\t\t Case 4: rocm_bandwidth_test -s x -d y with {lmv}{2,}" << std::endl;
std::cout << std::endl;
std::cout << std::endl;
......@@ -152,11 +150,19 @@ void RocmBandwidthTest::PrintTopology() {
if (HSA_DEVICE_TYPE_CPU == node.agent.device_type_) {
std::cout << " Device Type: CPU" << std::endl;
std::cout.width(format);
std::cout << "";
std::cout.width(format);
std::cout << " Device Name: " << node.agent.name_ << std::endl;
} else if (HSA_DEVICE_TYPE_GPU == node.agent.device_type_) {
std::cout << " Device Type: GPU" << std::endl;
std::cout.width(format);
std::cout << "";
std::cout.width(format);
std::cout << " Device Name: " << node.agent.name_ << std::endl;
std::cout.width(format);
std::cout << "";
std::cout.width(format);
std::cout << " Device BDF: " << node.agent.bdf_id_ << std::endl;
}
......@@ -204,6 +210,7 @@ std::string GetValueAsString(uint32_t key, uint32_t value) {
}
std::cout << "An illegal key to get value for" << std::endl;
assert(false);
return "";
}
void RocmBandwidthTest::PrintLinkPropsMatrix(uint32_t key) const {
......
......@@ -61,7 +61,7 @@ static void printRecord(size_t size, double avg_time,
}
uint32_t format = 15;
std::cout.precision(6);
std::cout.precision(3);
std::cout << std::fixed;
std::cout.width(format);
std::cout << size_str.str();
......@@ -134,10 +134,9 @@ double RocmBandwidthTest::GetMeanTime(std::vector<double>& vec) {
return vec.at(0);
}
// Number of elements is ONE plus number of iterations
std::sort(vec.begin(), vec.end());
vec.erase(vec.begin());
vec.erase(vec.begin(), vec.begin() + num_iteration_ * 0.1);
vec.erase(vec.begin() + num_iteration_, vec.end());
vec.erase(vec.end() - 1);
double mean = 0.0;
int num = vec.size();
......@@ -287,7 +286,7 @@ void RocmBandwidthTest::PrintPerfMatrix(bool validate, bool peak, double* perf_m
std::cout << std::endl;
std::cout << std::endl;
std::cout.precision(6);
std::cout.precision(3);
std::cout << std::fixed;
std::cout.width(format);
......@@ -318,7 +317,7 @@ void RocmBandwidthTest::PrintPerfMatrix(bool validate, bool peak, double* perf_m
if (validate) {
if (value == 0) {
std::cout << "N/A";
} else if (value < 1) {
} else if (value == VALIDATE_COPY_OP_FAILURE) {
std::cout << "FAIL";
} else {
std::cout << "PASS";
......
......@@ -46,6 +46,7 @@
#include <iomanip>
#include <sstream>
#include <string>
#include <cstring>
// @brief: Helper method to iterate through the memory pools of
// an agent and discover its properties
......@@ -106,10 +107,11 @@ hsa_status_t MemPoolInfo(hsa_amd_memory_pool_t pool, void* data) {
}
// Consult user request and add either fine-grained or
// coarse-grained memory pools if agent is CPU
// coarse-grained memory pools if agent is CPU. Default
// is to skip coarse-grained memory pools
agent_info_t& agent_info = asyncDrvr->agent_list_.back();
if (agent_info.device_type_ == HSA_DEVICE_TYPE_CPU) {
if (asyncDrvr->skip_fine_grain_ != NULL) {
if (asyncDrvr->skip_cpu_fine_grain_ != NULL) {
if (is_fine_grained == true) {
return HSA_STATUS_SUCCESS;
}
......@@ -119,9 +121,21 @@ hsa_status_t MemPoolInfo(hsa_amd_memory_pool_t pool, void* data) {
}
}
}
// hsa_device_type_t device_type;
// status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
// ErrorCheck(status);
// Consult user request and add either fine-grained or
// coarse-grained memory pools if agent is GPU. Default
// is to skip fine-grained memory pools
if (agent_info.device_type_ == HSA_DEVICE_TYPE_GPU) {
if (asyncDrvr->skip_gpu_coarse_grain_ != NULL) {
if (is_fine_grained == false) {
return HSA_STATUS_SUCCESS;
}
} else {
if (is_fine_grained == true) {
return HSA_STATUS_SUCCESS;
}
}
}
// Create an instance of agent_pool_info and add it to the list
pool_info_t pool_info(agent, asyncDrvr->agent_index_, pool,
......@@ -145,7 +159,7 @@ void PopulateBDF(uint32_t bdf_id, agent_info_t *agent_info) {
std::stringstream stream;
stream << std::setfill('0') << std::setw(sizeof(uint8_t) * 2);
stream << std::hex << +bus_id << ":" << +dev_id << "." << +func_id;
strcpy(agent_info->bdf_id_, (stream.str()).c_str());
std::strcpy(agent_info->bdf_id_, (stream.str()).c_str());
}
// @brief: Helper method to iterate through the agents of
......@@ -231,7 +245,7 @@ void RocmBandwidthTest::PopulateAccessMatrix() {
uint32_t path;
path = (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) ? 0 : 1;
direct_access_matrix_[(src_dev_idx * agent_index_) + dst_dev_idx] = path;
if ((src_dev_type == HSA_DEVICE_TYPE_CPU) &&
(dst_dev_type == HSA_DEVICE_TYPE_GPU) &&
(access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED)) {
......@@ -321,7 +335,7 @@ void RocmBandwidthTest::BindLinkProps(uint32_t idx1, uint32_t idx2) {
hsa_amd_memory_pool_link_info_t *link_info;
uint32_t link_info_sz = hops * sizeof(hsa_amd_memory_pool_link_info_t);
link_info = (hsa_amd_memory_pool_link_info_t *)malloc(link_info_sz);
memset(link_info, 0, (hops * sizeof(hsa_amd_memory_pool_link_info_t)));
std::memset(link_info, 0, (hops * sizeof(hsa_amd_memory_pool_link_info_t)));
err_ = hsa_amd_agent_memory_pool_get_info(agent1, pool,
HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, link_info);
......
......@@ -130,7 +130,7 @@ bool RocmBandwidthTest::FilterCpuPool(uint32_t req_type,
// If env to skip fine grain is NULL it means
// we should filter out coarse-grain pools
if (skip_fine_grain_ == NULL) {
if (skip_cpu_fine_grain_ == NULL) {
return (fine_grained == false);
}
......@@ -394,7 +394,7 @@ void RocmBandwidthTest::ComputeCopyTime(std::vector<async_trans_t>& trans_list)
}
void RocmBandwidthTest::ComputeCopyTime(async_trans_t& trans) {
// Get the frequency of Gpu Timestamping
uint64_t sys_freq = 0;
hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sys_freq);
......@@ -418,25 +418,49 @@ void RocmBandwidthTest::ComputeCopyTime(async_trans_t& trans) {
data_size += data_size;
}
// Copy operation does not involve a Gpu device
// Divide bandwidth with 10^9 to get size in GigaBytes (10^9)
if (trans.copy.uses_gpu_ != true) {
// Get time taken by copy operation. Adjust time from nanoseconds
// to units of seconds
if ((print_cpu_time_) ||
(trans.copy.uses_gpu_ != true)) {
avg_time = trans.cpu_avg_time_[idx];
min_time = trans.cpu_min_time_[idx];
avg_bandwidth = (double)data_size / avg_time / 1000 / 1000 / 1000;
peak_bandwidth = (double)data_size / min_time / 1000 / 1000 / 1000;
avg_time = avg_time / 1000 / 1000 / 1000;
min_time = min_time / 1000 / 1000 / 1000;
} else {
avg_time = trans.gpu_avg_time_[idx];
min_time = trans.gpu_min_time_[idx];
}
// Determine if there was a validation failure
// @note: Value is set to VALIDATE_COPY_OP_FAILURE
// if copy transaction was validated and it failed
hsa_status_t verify_status = HSA_STATUS_ERROR;
if ((avg_time != VALIDATE_COPY_OP_FAILURE) &&
(min_time != VALIDATE_COPY_OP_FAILURE)) {
verify_status = HSA_STATUS_SUCCESS;
}
// Adjust Gpu time if there is no validation error
if ((trans.copy.uses_gpu_) &&
(print_cpu_time_ == false) &&
(verify_status == HSA_STATUS_SUCCESS)) {
avg_time = avg_time / sys_freq;
min_time = min_time / sys_freq;
}
// Compute bandwidth - divide by 10^9,
// not 1024^3, to get size in GigaBytes
// @note: For validation failures bandwidth
// is encoded by VALIDATE_COPY_OP_FAILURE
if (verify_status != HSA_STATUS_SUCCESS) {
avg_bandwidth = VALIDATE_COPY_OP_FAILURE;
peak_bandwidth = VALIDATE_COPY_OP_FAILURE;
} else {
if (print_cpu_time_ == false) {
avg_time = trans.gpu_avg_time_[idx] / sys_freq;
min_time = trans.gpu_min_time_[idx] / sys_freq;
} else {
avg_time = trans.cpu_avg_time_[idx];
min_time = trans.cpu_min_time_[idx];
}
avg_bandwidth = (double)data_size / avg_time / 1000 / 1000 / 1000;
peak_bandwidth = (double)data_size / min_time / 1000 / 1000 / 1000;
}
// Update computed bandwidth for the transaction
trans.min_time_.push_back(min_time);
trans.avg_time_.push_back(avg_time);
trans.avg_bandwidth_.push_back(avg_bandwidth);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment